# StateFarm Code Screen 

> This is Sungryong Hong. I will use both **Python** and **PySpark** to solve this problem. 

> For the Spark calculations, I used my hand-installed Spark cluster with a Hadoop `hdfs` file system. I also occasionally use Dataproc on Google Cloud Platform. 

## 1. Import basic libraries

In [1]:
# Basic Libraries 
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
import gc


# pandas: show up to 500 rows and silence the chained-assignment warning
pd.options.display.max_rows = 500
pd.set_option('mode.chained_assignment', None)
#pd.set_option("display.precision", 10)

# matplotlib: serif (Times New Roman) text at 16 pt, STIX glyphs for math
plt.rc('font', family='serif', serif='Times New Roman', size=16)
plt.rc('mathtext', fontset='stix')

In [2]:
# Basic PySpark Libraries

# Old Style : SparkContext 
#from pyspark import SparkContext   
#from pyspark.sql import SQLContext


# New Style : Spark Session  
#Shell-Mode: Spark Session Name is `spark`

# SQLContext is only in scope when the PySpark shell injects it (its import
# above is commented out), so import it explicitly here.
from pyspark.sql import SQLContext

# `spark` is the SparkSession provided by the PySpark shell.
sc = spark.sparkContext
sqlsc = SQLContext(sc)
# Directory used by Spark for RDD/DataFrame checkpoints.
sc.setCheckpointDir("hdfs://master:54310/tmp/spark/checkpoints")

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W

In [3]:
# Enable Apache Arrow to speed up data exchange between pandas and Spark.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

## 2. Read and Explore the Data

In [4]:
# Read the training CSV into a pandas DataFrame.
# low_memory=False asks pandas to process the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
rawdf = pd.read_csv("./exercise_01_train.csv", low_memory=False)

In [5]:
rawdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 101 entries, x0 to y
dtypes: float64(94), int64(1), object(6)
memory usage: 30.8+ MB


In [6]:
rawdf.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
x0,10.1429,-52.2146,67.7185,-28.0031,80.703,17.0887,-30.5571,37.8009,83.8172,-13.734
x1,-15.6756,5.84713,2.06433,8.56513,30.7364,4.08712,0.971707,-11.0771,-1.65298,47.1342
x2,3.58318,-10.9028,12.3942,-8.59209,-30.1019,23.4681,29.9701,-10.472,-30.4228,-11.9095
x3,-22.3975,-14.1324,-18.6671,5.91896,-21.2011,-12.1707,-3.87861,14.5568,8.3319,-2.7833
x4,27.2219,20.5886,47.4655,-3.22415,-91.9462,5.89716,-24.7352,18.1448,19.4287,-35.3579
x5,-34.1109,36.1073,-50.3737,78.3158,-47.4692,46.0977,95.211,-1.99682,-26.8725,72.5116
x6,-0.0728286,0.115023,0.253707,-0.879845,-0.646831,-0.528155,2.84339,-1.05794,0.337466,-0.627806
x7,-0.544444,0.276093,1.06897,1.17689,-0.578398,-1.79626,-1.65296,-1.45359,-3.27356,3.80197
x8,0.997601,-0.699168,2.93971,-2.41475,0.980849,3.76375,-0.0553228,3.48818,-2.8436,-1.15852
x9,-2.69178,-0.972708,2.69122,0.589646,-1.42611,-1.49291,-1.65079,-3.46883,0.684401,1.91101


> `x34` car maker; `x35` days of the week; `x41` in dollars; `x45` in percentages; `x68` months; `x93` area; 

> `x41` and `x45` are numerical values stored as strings. Only `x34`, `x35`, `x68`, `x93` are truly categorical. 

In [7]:
rawdf.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x0,39986.0,8.259955,38.374182,-140.780478,-17.800204,8.354662,33.82978,177.399176
x1,39990.0,-3.249786,15.171131,-64.493908,-13.45858,-3.386601,6.881661,62.906822
x2,39994.0,1.030666,24.732185,-105.388182,-15.565461,1.132995,17.677615,99.394915
x3,39990.0,-0.747566,15.22573,-63.804916,-11.078276,-0.714888,9.552404,59.338352
x4,39994.0,0.28382,42.240018,-158.195975,-28.246509,0.292788,28.719663,179.342581
x5,39990.0,-1.77351,42.1241,-169.237259,-30.391354,-1.753365,26.844781,170.894497
x6,39993.0,-0.000232,1.065955,-4.13349,-0.723098,0.001105,0.715844,5.311653
x7,39987.0,-0.016107,3.382644,-12.96697,-2.299081,-0.003556,2.259972,16.619445
x8,39994.0,-0.651093,2.947472,-12.037625,-2.628856,-0.659223,1.322101,14.994937
x9,39993.0,-0.014688,1.906496,-7.4462,-1.299759,-0.02817,1.263469,7.300186


In [8]:
rawdf.dtypes

x0     float64
x1     float64
x2     float64
x3     float64
x4     float64
x5     float64
x6     float64
x7     float64
x8     float64
x9     float64
x10    float64
x11    float64
x12    float64
x13    float64
x14    float64
x15    float64
x16    float64
x17    float64
x18    float64
x19    float64
x20    float64
x21    float64
x22    float64
x23    float64
x24    float64
x25    float64
x26    float64
x27    float64
x28    float64
x29    float64
x30    float64
x31    float64
x32    float64
x33    float64
x34     object
x35     object
x36    float64
x37    float64
x38    float64
x39    float64
x40    float64
x41     object
x42    float64
x43    float64
x44    float64
x45     object
x46    float64
x47    float64
x48    float64
x49    float64
x50    float64
x51    float64
x52    float64
x53    float64
x54    float64
x55    float64
x56    float64
x57    float64
x58    float64
x59    float64
x60    float64
x61    float64
x62    float64
x63    float64
x64    float64
x65    float64
x66    flo

In [9]:
rawdf.isnull().sum(axis=0)

x0     14
x1     10
x2      6
x3     10
x4      6
x5     10
x6      7
x7     13
x8      6
x9      7
x10     8
x11     9
x12    11
x13    13
x14     3
x15     8
x16     8
x17    11
x18    14
x19     8
x20     6
x21    13
x22     5
x23     9
x24    11
x25     8
x26    11
x27     8
x28    11
x29     5
x30     5
x31     8
x32     4
x33     9
x34     7
x35    10
x36     7
x37     3
x38     6
x39     8
x40     6
x41     5
x42    12
x43     1
x44     4
x45     5
x46     7
x47     4
x48    10
x49     6
x50     5
x51    11
x52     8
x53     4
x54     6
x55    14
x56    11
x57     8
x58     7
x59     9
x60    12
x61     6
x62    13
x63    13
x64     5
x65    12
x66    10
x67     8
x68     9
x69    13
x70     4
x71     4
x72     8
x73     6
x74     7
x75    10
x76     7
x77    10
x78     7
x79     9
x80     7
x81     5
x82     7
x83     5
x84     3
x85    12
x86     9
x87     7
x88     4
x89    10
x90     7
x91     2
x92     6
x93     7
x94    11
x95     7
x96    15
x97     7
x98     5
x99    13


### 2.1 Convert `Dollar` and `Percentage` to Numericals

In [10]:
rawdf[['x41','x45']].head(5)

Unnamed: 0,x41,x45
0,$-54.1,0.0%
1,$-229.32,0.01%
2,$243.68,-0.01%
3,$126.15,0.02%
4,$877.39,-0.02%


In [11]:
rawdf[['x41','x45']].dtypes

x41    object
x45    object
dtype: object

In [12]:
%%time
# Strip the '$' sign and convert to float in one vectorized pass instead of a
# per-row Python lambda; missing values stay NaN.
rawdf['x41n'] = pd.to_numeric(rawdf['x41'].str.strip('$'))

CPU times: user 156 ms, sys: 15.7 ms, total: 172 ms
Wall time: 42 ms


In [13]:
%%time
# Strip the '%' sign and convert to float in one vectorized pass instead of a
# per-row Python lambda; missing values stay NaN.
rawdf['x45n'] = pd.to_numeric(rawdf['x45'].str.strip('%'))

CPU times: user 131 ms, sys: 5.63 ms, total: 136 ms
Wall time: 34 ms


In [14]:
rawdf[['x41','x45','x41n','x45n']].head(5)

Unnamed: 0,x41,x45,x41n,x45n
0,$-54.1,0.0%,-54.1,0.0
1,$-229.32,0.01%,-229.32,0.01
2,$243.68,-0.01%,243.68,-0.01
3,$126.15,0.02%,126.15,0.02
4,$877.39,-0.02%,877.39,-0.02


In [15]:
rawdf[['x41','x45','x41n','x45n']].dtypes

x41      object
x45      object
x41n    float64
x45n    float64
dtype: object

### 2.2 Trim the columns to `numericals` and `categoricals`

In [16]:
rawdf.columns

Index([u'x0', u'x1', u'x2', u'x3', u'x4', u'x5', u'x6', u'x7', u'x8', u'x9',
       ...
       u'x93', u'x94', u'x95', u'x96', u'x97', u'x98', u'x99', u'y', u'x41n',
       u'x45n'],
      dtype='object', length=103)

In [17]:
dtypelist = rawdf.dtypes

#### Numerical Columns 

In [18]:
# Names of all float64 columns, in their original column order.
listNumerical = dtypelist.index[dtypelist.values == 'float64'].tolist()

In [19]:
rawdf[listNumerical[:3]].head()

Unnamed: 0,x0,x1,x2
0,10.142889,-15.67562,3.583176
1,-52.21463,5.847135,-10.902843
2,67.7185,2.064334,12.394186
3,-28.003111,8.565128,-8.592092
4,80.703016,30.736353,-30.101857


In [20]:
len(listNumerical)

96

#### Categorical Columns

> We already know that we only have four categorical columns, `['x34','x35','x68','x93']`

In [21]:
listCategorical = ['x34','x35','x68','x93']

In [22]:
rawdf[listCategorical].dtypes

x34    object
x35    object
x68    object
x93    object
dtype: object

In [23]:
rawdf[listCategorical].head(3)

Unnamed: 0,x34,x35,x68,x93
0,bmw,wed,Jun,euorpe
1,nissan,thur,July,asia
2,Honda,wed,July,asia


In [24]:
rawdf.x34.value_counts(dropna=False)

volkswagon    12572
Toyota        10946
bmw            7304
Honda          5129
tesla          2275
chrystler      1223
nissan          336
ford            165
mercades         32
chevrolet        11
NaN               7
Name: x34, dtype: int64

#### Quick and Dirty Imputation

> Only a tiny fraction of the values are `nan` (at most 15 of 40,000 rows per column), so the choice of imputation method has little effect on ML performance, at least for this problem.

In [25]:
# Replace missing categorical values with the placeholder level 'others'.
rawdf[listCategorical] = rawdf[listCategorical].fillna('others')

In [26]:
rawdf.x34.value_counts(dropna=False)

volkswagon    12572
Toyota        10946
bmw            7304
Honda          5129
tesla          2275
chrystler      1223
nissan          336
ford            165
mercades         32
chevrolet        11
others            7
Name: x34, dtype: int64

In [27]:
# Fill missing numerical values with the median of their own column.
rawdf[listNumerical] = rawdf[listNumerical].fillna(rawdf[listNumerical].median())

In [28]:
rawdf[listNumerical].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x0,40000.0,8.259989,38.367466,-140.780478,-17.787105,8.354662,33.813323,177.399176
x1,40000.0,-3.24982,15.169234,-64.493908,-13.457618,-3.386601,6.875814,62.906822
x2,40000.0,1.030682,24.73033,-105.388182,-15.561986,1.132995,17.676434,99.394915
x3,40000.0,-0.747558,15.223827,-63.804916,-11.074591,-0.714888,9.548938,59.338352
x4,40000.0,0.283822,42.23685,-158.195975,-28.238695,0.292788,28.71909,179.342581
x5,40000.0,-1.773505,42.118834,-169.237259,-30.384131,-1.753365,26.825469,170.894497
x6,40000.0,-0.000232,1.065862,-4.13349,-0.722887,0.001105,0.715731,5.311653
x7,40000.0,-0.016103,3.382095,-12.96697,-2.298524,-0.003556,2.259208,16.619445
x8,40000.0,-0.651094,2.947251,-12.037625,-2.628735,-0.659223,1.322,14.994937
x9,40000.0,-0.01469,1.906329,-7.4462,-1.299284,-0.02817,1.263201,7.300186


### Now Using Apache Spark ... 

#### Define a spark dataframe from the pandasDF

In [29]:
['y']+listCategorical

['y', 'x34', 'x35', 'x68', 'x93']

In [30]:
# Build the Spark DataFrame from the float columns, the four categorical
# columns and the label `y`; the raw string columns x41/x45 are left out
# because they belong to neither list (their numeric copies x41n/x45n are in).
df = spark.createDataFrame(rawdf[listNumerical+listCategorical+['y']])



In [31]:
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
x0,40000,8.259988610456132,38.36746564399919,-140.78047827352268,177.3991760093044
x1,40000,-3.2498204298560536,15.169234465853046,-64.49390831526418,62.90682249109385
x2,40000,1.0306815991544986,24.730329924581977,-105.38818157297241,99.39491534246905
x3,40000,-0.747558324815871,15.22382657930609,-63.80491552692011,59.338351541989795
x4,40000,0.28382160067535134,42.23684992469813,-158.19597508401924,179.34258123817855
x5,40000,-1.77350490776339,42.11883385124156,-169.2372592701479,170.89449680005063
x6,40000,-2.3193041220587193E-4,1.0658617492856846,-4.133489975080877,5.311652881034644
x7,40000,-0.016102820179740553,3.3820947131083776,-12.966969584835555,16.61944546842101
x8,40000,-0.6510942377468463,2.947251379772876,-12.037624904979749,14.994937090343942


### 2.3 Encode the `categoricals` using `StringIndexers`

In [32]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

In [33]:
# One StringIndexer per categorical column; each writes to "<column>_indexed".
# handleInvalid='keep' is passed through to every indexer.
indexersCategorical = [
    StringIndexer(inputCol=eachcat,
                  outputCol=eachcat + "_indexed",
                  handleInvalid='keep')
    for eachcat in listCategorical
]

In [34]:
# Parenthesised form prints the same list on Python 2 and is valid on Python 3.
print([eachindexer.getOutputCol() for eachindexer in indexersCategorical])

['x34_indexed', 'x35_indexed', 'x68_indexed', 'x93_indexed']


In [35]:
# Output column names of the indexers, in the same order as listCategorical.
listCategoricalIndexed = [eachindexer.getOutputCol() for eachindexer in indexersCategorical]

#### Sanity Check for StringIndexer

In [36]:
# Fit each indexer on its own and show the raw column next to its index
# for the first 8 rows.
for catcol, indexer in zip(listCategorical, indexersCategorical):
    indexed = indexer.fit(df).transform(df)
    indexed.select('x0', catcol, indexer.getOutputCol()).show(8)

+-------------------+----------+-----------+
|                 x0|       x34|x34_indexed|
+-------------------+----------+-----------+
| 10.142889218659404|       bmw|        2.0|
| -52.21463015787183|    nissan|        6.0|
|   67.7185001029058|     Honda|        3.0|
|-28.003111353304345|    Toyota|        1.0|
|  80.70301597973435|       bmw|        2.0|
| 17.088739495473188|    Toyota|        1.0|
|-30.557123896244413|    Toyota|        1.0|
|  37.80088857199077|volkswagon|        0.0|
+-------------------+----------+-----------+
only showing top 8 rows

+-------------------+-------+-----------+
|                 x0|    x35|x35_indexed|
+-------------------+-------+-----------+
| 10.142889218659404|    wed|        0.0|
| -52.21463015787183|   thur|        3.0|
|   67.7185001029058|    wed|        0.0|
|-28.003111353304345|   thur|        3.0|
|  80.70301597973435|thurday|        1.0|
| 17.088739495473188|    wed|        0.0|
|-30.557123896244413|    wed|        0.0|
|  37.800888571

#### Now run the stages of StringIndexers

In [37]:
%%time
# Run all StringIndexers as stages of one Pipeline: fit on df, then transform
# df so that each stage's "<column>_indexed" output column is added.
pipeline = Pipeline(stages=indexersCategorical)
numdf = pipeline.fit(df).transform(df)

CPU times: user 30.5 ms, sys: 7.48 ms, total: 38 ms
Wall time: 2.16 s


In [38]:
numdf.cache()

DataFrame[x0: double, x1: double, x2: double, x3: double, x4: double, x5: double, x6: double, x7: double, x8: double, x9: double, x10: double, x11: double, x12: double, x13: double, x14: double, x15: double, x16: double, x17: double, x18: double, x19: double, x20: double, x21: double, x22: double, x23: double, x24: double, x25: double, x26: double, x27: double, x28: double, x29: double, x30: double, x31: double, x32: double, x33: double, x36: double, x37: double, x38: double, x39: double, x40: double, x42: double, x43: double, x44: double, x46: double, x47: double, x48: double, x49: double, x50: double, x51: double, x52: double, x53: double, x54: double, x55: double, x56: double, x57: double, x58: double, x59: double, x60: double, x61: double, x62: double, x63: double, x64: double, x65: double, x66: double, x67: double, x69: double, x70: double, x71: double, x72: double, x73: double, x74: double, x75: double, x76: double, x77: double, x78: double, x79: double, x80: double, x81: double,

In [39]:
df.select('x93').groupby('x93').count().sort(F.desc("count")).show()

+-------+-----+
|    x93|count|
+-------+-----+
|   asia|35409|
|america| 3136|
| euorpe| 1448|
| others|    7|
+-------+-----+



In [40]:
numdf.select('x93_indexed').groupby('x93_indexed').count().sort(F.desc("count")).show()

+-----------+-----+
|x93_indexed|count|
+-----------+-----+
|        0.0|35409|
|        1.0| 3136|
|        2.0| 1448|
|        3.0|    7|
+-----------+-----+



#### One-hot Encoding the indexed categoricals 

In [41]:
# One-hot encoder over every indexed column; outputs go to "<indexed>_onehot".
encoder = OneHotEncoderEstimator(
    inputCols=listCategoricalIndexed,
    outputCols=["{0}_onehot".format(eachcol) for eachcol in listCategoricalIndexed])

In [42]:
listCategoricalOneHot = encoder.getOutputCols()

In [43]:
listCategoricalOneHot

['x34_indexed_onehot',
 'x35_indexed_onehot',
 'x68_indexed_onehot',
 'x93_indexed_onehot']

In [44]:
%%time
# Fit the encoder on the indexed columns and append the *_onehot vector columns.
# NOTE(review): numdf is overwritten with its own transform, so re-running this
# cell presumably fails because the *_onehot columns already exist -- confirm.
numdf = encoder.fit(numdf).transform(numdf)

CPU times: user 6.99 ms, sys: 1.78 ms, total: 8.77 ms
Wall time: 95.9 ms


In [45]:
numdf.select('x34','x34_indexed','x34_indexed_onehot').show(5)

+------+-----------+------------------+
|   x34|x34_indexed|x34_indexed_onehot|
+------+-----------+------------------+
|   bmw|        2.0|    (11,[2],[1.0])|
|nissan|        6.0|    (11,[6],[1.0])|
| Honda|        3.0|    (11,[3],[1.0])|
|Toyota|        1.0|    (11,[1],[1.0])|
|   bmw|        2.0|    (11,[2],[1.0])|
+------+-----------+------------------+
only showing top 5 rows



In [46]:
numdf.cache()

DataFrame[x0: double, x1: double, x2: double, x3: double, x4: double, x5: double, x6: double, x7: double, x8: double, x9: double, x10: double, x11: double, x12: double, x13: double, x14: double, x15: double, x16: double, x17: double, x18: double, x19: double, x20: double, x21: double, x22: double, x23: double, x24: double, x25: double, x26: double, x27: double, x28: double, x29: double, x30: double, x31: double, x32: double, x33: double, x36: double, x37: double, x38: double, x39: double, x40: double, x42: double, x43: double, x44: double, x46: double, x47: double, x48: double, x49: double, x50: double, x51: double, x52: double, x53: double, x54: double, x55: double, x56: double, x57: double, x58: double, x59: double, x60: double, x61: double, x62: double, x63: double, x64: double, x65: double, x66: double, x67: double, x69: double, x70: double, x71: double, x72: double, x73: double, x74: double, x75: double, x76: double, x77: double, x78: double, x79: double, x80: double, x81: double,

### 2.4 Vectorize the features

In [47]:
from pyspark.ml.feature import VectorAssembler

In [48]:
vectorizedFeatures = listNumerical+listCategoricalOneHot

In [49]:
len(vectorizedFeatures)

100

In [50]:
# Parenthesised form prints the same list on Python 2 and is valid on Python 3.
print(vectorizedFeatures)

['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30', 'x31', 'x32', 'x33', 'x36', 'x37', 'x38', 'x39', 'x40', 'x42', 'x43', 'x44', 'x46', 'x47', 'x48', 'x49', 'x50', 'x51', 'x52', 'x53', 'x54', 'x55', 'x56', 'x57', 'x58', 'x59', 'x60', 'x61', 'x62', 'x63', 'x64', 'x65', 'x66', 'x67', 'x69', 'x70', 'x71', 'x72', 'x73', 'x74', 'x75', 'x76', 'x77', 'x78', 'x79', 'x80', 'x81', 'x82', 'x83', 'x84', 'x85', 'x86', 'x87', 'x88', 'x89', 'x90', 'x91', 'x92', 'x94', 'x95', 'x96', 'x97', 'x98', 'x99', 'x41n', 'x45n', 'x34_indexed_onehot', 'x35_indexed_onehot', 'x68_indexed_onehot', 'x93_indexed_onehot']


In [51]:
vecAssem = VectorAssembler(inputCols = vectorizedFeatures, outputCol= "features")

In [52]:
# Assemble the feature vector, then keep only the assembler's input columns,
# the label `y` and the assembled `features` column.
mldata = vecAssem.transform(numdf).select(vectorizedFeatures+['y','features'])

In [53]:
mldata.cache()

DataFrame[x0: double, x1: double, x2: double, x3: double, x4: double, x5: double, x6: double, x7: double, x8: double, x9: double, x10: double, x11: double, x12: double, x13: double, x14: double, x15: double, x16: double, x17: double, x18: double, x19: double, x20: double, x21: double, x22: double, x23: double, x24: double, x25: double, x26: double, x27: double, x28: double, x29: double, x30: double, x31: double, x32: double, x33: double, x36: double, x37: double, x38: double, x39: double, x40: double, x42: double, x43: double, x44: double, x46: double, x47: double, x48: double, x49: double, x50: double, x51: double, x52: double, x53: double, x54: double, x55: double, x56: double, x57: double, x58: double, x59: double, x60: double, x61: double, x62: double, x63: double, x64: double, x65: double, x66: double, x67: double, x69: double, x70: double, x71: double, x72: double, x73: double, x74: double, x75: double, x76: double, x77: double, x78: double, x79: double, x80: double, x81: double,

In [54]:
mldata.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[10.1428892186594...|
|[-52.214630157871...|
|[67.7185001029058...|
|[-28.003111353304...|
|[80.7030159797343...|
+--------------------+
only showing top 5 rows



#### Save this as a parquet table and done

In [55]:
import pyarrow as pa
import pyarrow.parquet as pq

In [56]:
# Save the assembled dataset as snappy-compressed Parquet. The format is named
# explicitly instead of relying on the session's default data source, and
# mode("overwrite") replaces any existing output at this path.
(mldata.write
       .format("parquet")
       .option("compression", "snappy")
       .mode("overwrite")
       .save("hdfs://master:54310/data/spark/statefarm/mldata.parquet.snappy"))