I am learning `pyspark` and for this test, I am using the [May 2022 Tabular Playground Series](https://www.kaggle.com/competitions/tabular-playground-series-may-2022/overview).  

Disclaimer: I am not claiming that this is the best solution for that Series; again, I am only learning this package.

---

## Downloading and loading packages

In [None]:
!pip install pyspark

In [None]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark import keyword_only
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.classification import LogisticRegression, GBTClassifier
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, StandardScaler, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

from pyspark.sql import DataFrame
from pyspark.ml import Pipeline, Transformer

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

## Exploratory Data Analysis

In [None]:
# Reuse the active Spark session if there is one, otherwise start a new one
# with the application name 'learning'.
session_builder = SparkSession.builder.appName('learning')
spark = session_builder.getOrCreate()

Here, I am not sure whether there is any benefit to manually defining the input schema before loading the `csv` files.

In [None]:
# Column layout of the competition files, in file order: each entry pairs a
# group of column names with the Spark type used to parse them.
column_groups = [
    (['id'], IntegerType),
    ([f'f_{i:02d}' for i in range(0, 7)], FloatType),
    ([f'f_{i:02d}' for i in range(7, 19)], IntegerType),
    ([f'f_{i:02d}' for i in range(19, 27)], DoubleType),
    (['f_27'], StringType),
    (['f_28'], DoubleType),
    (['f_29', 'f_30'], IntegerType),
]

listStruct = []
for names, spark_type in column_groups:
    for name in names:
        listStruct.append( StructField(name, spark_type(), nullable=True) )

# StructType keeps a reference to the list it receives, so every schema gets
# its own copy. Otherwise the `target` field appended below would also show up
# in `schema_test`, and the test file would be read with a spurious `target`
# column.
schema_test = StructType(list(listStruct))

# The training file has one extra, final column: the label.
listStruct.append( StructField('target', IntegerType(), nullable=True) )
schema = StructType(list(listStruct))

df_train = spark.read.csv('/kaggle/input/tabular-playground-series-may-2022/train.csv', header=True, schema=schema)
df_test = spark.read.csv('/kaggle/input/tabular-playground-series-may-2022/test.csv', header=True, schema=schema_test)

In [None]:
# Display the first two rows of the training frame.
df_train.show(n=2)

In [None]:
# Print the schema (column names and types) of the training frame.
df_train.printSchema()

From previous studies, it is known that `f_27` is a string column whose meaning is not clear. In the next cells, I try to see whether there are some features I can extract from that variable.

In [None]:
# Count how often each distinct `f_27` string occurs, most frequent first.
f_27_counts = df_train.groupby('f_27').count()
df_27 = f_27_counts.sort('count', ascending=False)
df_27.show()

Creating an index for each distinct string in `f_27`.

In [None]:
# Fit a StringIndexer on `f_27` and add the resulting index as `f_27_ind`.
indexer = StringIndexer(inputCol='f_27', outputCol='f_27_ind')
indexer_model = indexer.fit(df_train)
indexed = indexer_model.transform(df_train)
indexed.show(n=5)

Let's create one column per character in the string.

In [None]:
# Split `f_27` on the empty pattern, which yields an array of its characters.
split_col = F.split( df_train['f_27'], '')

# Keep the feature columns f_00 ... f_30 and the label, then add one column
# per character position: f_27_0 ... f_27_9 hold array elements 0 ... 9.
df_tmp = df_train.select([ 'f_'+str(i).zfill(2) for i in range(0, 31) ]+['target'] )
for pos in range(10):
    df_tmp = df_tmp.withColumn( f'f_27_{pos}', split_col.getItem(pos) )

df_tmp.show(5)

Checking the number of distinct characters in each new column:

In [None]:
# For each per-character column, show how many rows carry each character.
char_columns = [f'f_27_{i}' for i in range(0, 10)]
for column in char_columns:
    df_tmp.groupby(column).count().show()

In [None]:
# Alternative encoders, currently disabled:
# OHE = OneHotEncoder( inputCols=['f_27_0'], outputCols=['f_27_0_OHE'] )
# OHE = VectorIndexer( inputCol='f_27_0', outputCol='f_27_0_OHE' )
# OHE.fit(df_tmp)

# Index the first character column `f_27_0` into `f_27_0_SI`; the transformed
# frame is the cell's output.
tmp_String = StringIndexer(inputCol='f_27_0', outputCol='f_27_0_SI')
tmp_model = tmp_String.fit(df_tmp)
tmp_model.transform(df_tmp)

Let's make some basic plots:

In [None]:
# Names of the columns of `df_tmp` declared as IntegerType / FloatType.
listOfInts = [
    field.name
    for field in df_tmp.schema.fields
    if isinstance(field.dataType, IntegerType)
]
listOfFloats = [
    field.name
    for field in df_tmp.schema.fields
    if isinstance(field.dataType, FloatType)
]

# One histogram per integer column; each column is pulled into pandas on its own.
for column in listOfInts:
    df_tmp.select(column).toPandas().hist()

## Pipelines and Classification

In this part, I want to try out pipelines in `pyspark` for feature extraction and the classification process.

In [None]:
# Random 80/20 split of the labelled data, with a fixed seed.
train_data, test_data = df_train.randomSplit(weights=[0.8, 0.2], seed=42)

Creating a transformer for the `f_27` column. A custom transformer is only needed so that the column split can be used as a stage of the pipeline.

In [None]:
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

class splitColumns(Transformer):
    """Pipeline stage that spreads the characters of a string column over new columns.

    For an input column ``c`` the transformed DataFrame gains ten columns
    ``c_0`` ... ``c_9``; column ``c_k`` holds element ``k`` of
    ``F.split(c, '')``.

    Parameters
    ----------
    inputCol : str
        Name of the column to split. Defaults to ``'input'``.
    """

    inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString)

    # Number of per-character columns added by `_transform`.
    _NUM_CHARS = 10

    @keyword_only
    def __init__(self, inputCol: str = 'input'):
        super().__init__()
        # `keyword_only` records only the arguments that were passed
        # explicitly, so the signature default has to be registered as the
        # Param default as well; otherwise an omitted `inputCol` resolves to
        # None instead of 'input'.
        self._setDefault(inputCol='input')
        kwargs = self._input_kwargs
        self.set_params(**kwargs)

    @keyword_only
    def set_params(self, inputCol: str = "input"):
        """Set the params that were passed explicitly as keyword arguments."""
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def get_input_col(self):
        """Return the value of `inputCol`, or its default if it was not set."""
        return self.getOrDefault(self.inputCol)

    def _transform(self, df: DataFrame) -> DataFrame:
        """Return `df` with the columns `<inputCol>_0` ... `<inputCol>_9` added."""
        inCol = self.get_input_col()
        chars = F.split( df[inCol], '')
        for pos in range(self._NUM_CHARS):
            df = df.withColumn( f'{inCol}_{pos}', chars.getItem(pos) )
        return df

# sc = splitColumns(inputCol='f_27')
# sc.transform(df_train).show()

Creating a list of features and stages for the pipeline:

In [None]:
%%time
# Columns that must never reach the model as features: the row identifier and
# the label itself (using `target` as an input would leak the answer).
excludedCols = {'id', 'target'}
candidateFields = [ f for f in train_data.schema.fields if f.name not in excludedCols ]

listOfInts = [ f.name for f in candidateFields if isinstance(f.dataType, IntegerType) ]
# DoubleType is not a subclass of FloatType, so both have to be listed to pick
# up every continuous column.
listOfFloats = [ f.name for f in candidateFields if isinstance(f.dataType, (FloatType, DoubleType)) ]
listOfNumbers = listOfInts + listOfFloats

# Each numeric column is wrapped into a one-element vector and standardised.
listOfStages = []
listOfStages += [ VectorAssembler(inputCols=[i], outputCol=f'{i}_vec') for i in listOfNumbers ]
listOfStages += [ StandardScaler(inputCol=f'{i}_vec', outputCol=f'{i}_scaled') for i in listOfNumbers ]

# Optional `f_27` character features, currently disabled:
# listOfStages += [ splitColumns(inputCol='f_27') ]
# listOfStages += [ StringIndexer( inputCol='f_27', outputCol='f_27_ind' ) ]
# listOfStages += [ StringIndexer( inputCol=f'f_27_{i}', outputCol=f'f_27_{i}_ind' ) for i in range(0,10) ]
# listOfStages += [ VectorAssembler(inputCols=[f'f_27_{i}_ind'], outputCol=f'f_27_{i}_vec') for i in range(0,10) ]
# listOfStages += [ StandardScaler(inputCol=f'f_27_{i}_vec', outputCol=f'f_27_{i}_scaled') for i in range(0,10) ]

# Gather all scaled columns into the single `features` vector the classifier reads.
listFinalFeatures = [ f'{i}_scaled' for i in listOfNumbers ] #+ [ f'f_27_{i}_scaled' for i in range(0,10) ]
listOfStages += [ VectorAssembler( inputCols=listFinalFeatures , outputCol='features' ) ]

listOfStages += [ LogisticRegression( featuresCol='features', labelCol='target' ) ]
# Alternative classifier, currently disabled:
# listOfStages += [ GBTClassifier( featuresCol='features', labelCol='target', maxIter=100 ) ]

pipeline = Pipeline( stages=listOfStages )
model = pipeline.fit(train_data)

In [None]:
# Score the held-out split `test_data` rather than `train_data`, so that any
# metric computed from `val_data` is measured on rows the model was not fitted on.
val_data = model.transform( test_data )
# val_data.show(5)

In [None]:
# Area under the ROC curve of the scored rows, with `target` as the label.
# Alternative metric, currently disabled:
# mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy", labelCol='target')
mcEvaluator = BinaryClassificationEvaluator(labelCol='target', metricName="areaUnderROC")
area_under_roc = mcEvaluator.evaluate(val_data)
print( area_under_roc )

In [None]:
# Give the test frame a constant placeholder `target` column, then run the
# fitted pipeline on it.
placeholder_label = F.lit(0)
df_test = df_test.withColumn('target', placeholder_label)
prediction = model.transform(df_test)

In [None]:
# Bring the row id and the predicted class back to the driver as a pandas frame.
submission_cols = ['id', 'prediction']
df_pred = prediction.select(*submission_cols).toPandas()
df_pred.head()

In [None]:
# Name the prediction column `target` and write the frame to `submission.csv`
# without the pandas index.
df_pred = df_pred.rename(columns={'prediction':'target'})
df_pred.to_csv('submission.csv', index=False)