In [47]:
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import  StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.linalg import Vectors
#from pyspark.ml.datatypes import  StringType , IntegerType
from pyspark.sql.types import *



# Start (or reuse) the Spark session used by every cell below.
spark = SparkSession.builder.appName('ml-bank').getOrCreate()

# Read the CSV with a header row and let Spark infer the column types, then keep
# only the modelling columns; PerformanceRating is renamed to `label`.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('ibm-hr-analytics-attrition.csv')
    .selectExpr(
        "PerformanceRating as label",
        "Age",
        "Attrition",
        "JobSatisfaction",
        "OverTime",
        "Gender",
    )
)
df.count()

1470

In [53]:
# Discard every row containing a null in any selected column, then display the frame.
df = df.dropna()
df.toPandas()

Unnamed: 0,label,Age,Attrition,JobSatisfaction,OverTime,Gender
0,3,41,Yes,4,Yes,Female
1,4,49,No,2,No,Male
2,3,37,Yes,3,Yes,Male
3,3,33,No,3,Yes,Female
4,3,27,No,2,No,Male
...,...,...,...,...,...,...
1465,3,36,No,4,No,Male
1466,3,39,No,1,No,Male
1467,4,27,No,2,Yes,Male
1468,3,49,No,2,No,Male


In [66]:
# Columns that hold string categories vs. columns that are already numeric.
categorical_variables = ["Attrition", "OverTime", "Gender"]
numerical_variables = ["Age", "JobSatisfaction"]

# Derive the intermediate column names once: <col>_indexed, then <col>_indexed_encoded.
indexed_columns = ["{0}_indexed".format(name) for name in categorical_variables]
encoded_columns = ["{0}_encoded".format(name) for name in indexed_columns]

# One StringIndexer per categorical column (string -> numeric index).
indexers = [
    StringIndexer(inputCol=raw_name, outputCol=indexed_name)
    for raw_name, indexed_name in zip(categorical_variables, indexed_columns)
]

# One OneHotEncoder per indexed column (numeric index -> one-hot vector).
encoders = [
    OneHotEncoder(inputCol=indexed_name, outputCol=encoded_name)
    for indexed_name, encoded_name in zip(indexed_columns, encoded_columns)
]

# Assemble encoded categoricals followed by the numeric columns into one vector,
# then rescale that vector with MinMaxScaler into `features_scaled`.
feats = encoded_columns + numerical_variables
assembler = VectorAssembler(inputCols=feats, outputCol="features")
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# Stage order matters: index -> encode -> assemble -> scale.
stages = [*indexers, *encoders, assembler, scaler]
pipline = Pipeline(stages=stages)
feats

['Attrition_indexed_encoded',
 'OverTime_indexed_encoded',
 'Gender_indexed_encoded',
 'Age',
 'JobSatisfaction']

In [83]:
# Disabled alternative classifier (kept for reference):
# from pyspark.ml.classification import MultilayerPerceptronClassifier
# layers = [64, 32, 16, 4]
# lr = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# Logistic regression that reads the scaled feature vector and the `label`
# column, capped at 100 optimizer iterations.
lr = LogisticRegression(
    maxIter=100,
    labelCol='label',
    featuresCol='features_scaled',
)


In [84]:
model_preprocessor = pipline.fit(df)

In [85]:
train, test = df.randomSplit([0.8,0.2],seed=12345)

In [86]:
# Apply the fitted preprocessing pipeline to the training split so it gains the
# indexed, encoded, `features` and `features_scaled` columns the classifier reads.
model_train_data  = model_preprocessor.transform(train)

In [87]:
# Train the classifier `lr` on the preprocessed training rows.
model_predictor = lr.fit(model_train_data)

In [88]:
# Test: run the held-out split through the same fitted preprocessing, score it
# with the trained classifier, and display the scored rows as a pandas frame.
model_test_data = model_preprocessor.transform(test)
test_results = model_predictor.transform(model_test_data)
test_results.toPandas()

Unnamed: 0,label,Age,Attrition,JobSatisfaction,OverTime,Gender,Attrition_indexed,OverTime_indexed,Gender_indexed,Attrition_indexed_encoded,OverTime_indexed_encoded,Gender_indexed_encoded,features,features_scaled,rawPrediction,probability,prediction
0,3,18,Yes,3,No,Male,1.0,0.0,0.0,(0.0),(1.0),(1.0),"[0.0, 1.0, 1.0, 18.0, 3.0]","[0.0, 1.0, 1.0, 0.0, 0.6666666666666666]","[-28.01859706305537, -28.01859706305537, -28.0...","[1.3678491077720957e-31, 1.3678491077720957e-3...",3.0
1,3,21,No,1,Yes,Male,0.0,1.0,0.0,(1.0),(0.0),(1.0),"[1.0, 0.0, 1.0, 21.0, 1.0]","[1.0, 0.0, 1.0, 0.07142857142857142, 0.0]","[-29.423541793215723, -29.423541793215723, -29...","[4.125716350959469e-33, 4.125716350959469e-33,...",3.0
2,3,21,No,2,Yes,Female,0.0,1.0,1.0,(1.0),(0.0),(0.0),"[1.0, 0.0, 0.0, 21.0, 2.0]","[1.0, 0.0, 0.0, 0.07142857142857142, 0.3333333...","[-26.698602845461245, -26.698602845461245, -26...","[3.7074557926132165e-30, 3.7074557926132165e-3...",3.0
3,3,21,No,4,No,Male,0.0,0.0,0.0,(1.0),(1.0),(1.0),"[1.0, 1.0, 1.0, 21.0, 4.0]","[1.0, 1.0, 1.0, 0.07142857142857142, 1.0]","[-49.622095651579386, -49.622095651579386, -49...","[4.789439846780191e-55, 4.789439846780191e-55,...",3.0
4,3,22,No,2,No,Female,0.0,0.0,1.0,(1.0),(1.0),(0.0),"[1.0, 1.0, 0.0, 22.0, 2.0]","[1.0, 1.0, 0.0, 0.09523809523809523, 0.3333333...","[-36.22930863632412, -36.22930863632412, -36.2...","[1.6441064801095029e-40, 1.6441064801095029e-4...",3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,4,51,No,2,Yes,Male,0.0,1.0,0.0,(1.0),(0.0),(1.0),"[1.0, 0.0, 1.0, 51.0, 2.0]","[1.0, 0.0, 1.0, 0.7857142857142857, 0.33333333...","[-50.5841965455224, -50.5841965455224, -50.584...","[4.351398036488448e-56, 4.351398036488448e-56,...",3.0
268,4,51,No,4,No,Male,0.0,0.0,0.0,(1.0),(1.0),(1.0),"[1.0, 1.0, 1.0, 51.0, 4.0]","[1.0, 1.0, 1.0, 0.7857142857142857, 1.0]","[-67.03334274699233, -67.03334274699233, -67.0...","[5.9452259157148835e-74, 5.9452259157148835e-7...",3.0
269,4,51,No,4,Yes,Male,0.0,1.0,0.0,(1.0),(0.0),(1.0),"[1.0, 0.0, 1.0, 51.0, 4.0]","[1.0, 0.0, 1.0, 0.7857142857142857, 1.0]","[-58.083011859309885, -58.083011859309885, -58...","[3.1410046043978795e-64, 3.1410046043978795e-6...",3.0
270,4,54,No,4,No,Female,0.0,0.0,1.0,(1.0),(1.0),(0.0),"[1.0, 1.0, 0.0, 54.0, 4.0]","[1.0, 1.0, 0.0, 0.8571428571428571, 1.0]","[-62.3001208518854, -62.3001208518854, -62.300...","[8.087617076224434e-69, 8.087617076224434e-69,...",3.0


In [90]:
# Collect only the two columns needed for scoring, and do it once: every
# toPandas() call re-runs the Spark job and pulls the whole result to the driver.
scored = test_results.select('label', 'prediction').toPandas()

labels = np.asarray(scored['label'].values)
pred_values = np.asarray(scored['prediction'].values)

# Misclassification rate: the fraction of test rows whose predicted class differs
# from the true label. The previous formula, np.sqrt(labels - pred_values).mean(),
# yields NaN as soon as any prediction exceeds its label and only coincides with
# the error rate when every difference happens to be 0 or 1.
errr = np.mean(labels != pred_values)
errr

0.16176470588235295

In [110]:
# Take a single row of `df` to use as a sample input for a one-off prediction.
# NOTE(review): no explicit ordering is applied before limit(1), so which row is
# returned is whatever Spark yields first — confirm if a specific row is required.
item =  df.limit(1)
item.toPandas()

Unnamed: 0,label,Age,Attrition,JobSatisfaction,OverTime,Gender
0,3,41,Yes,4,Yes,Female


In [111]:
# Remove the `label` column so the sample contains only the input columns.
item = item.drop('label')
item.toPandas()

Unnamed: 0,Age,Attrition,JobSatisfaction,OverTime,Gender
0,41,Yes,4,Yes,Female


In [112]:
# Score the single sample: apply the fitted preprocessing first, then the trained
# classifier, and display the resulting row (including the prediction columns).
item_pre  = model_preprocessor.transform(item)
result = model_predictor.transform(item_pre)
result.toPandas()

Unnamed: 0,Age,Attrition,JobSatisfaction,OverTime,Gender,Attrition_indexed,OverTime_indexed,Gender_indexed,Attrition_indexed_encoded,OverTime_indexed_encoded,Gender_indexed_encoded,features,features_scaled,rawPrediction,probability,prediction
0,41,Yes,4,Yes,Female,1.0,1.0,1.0,(0.0),(0.0),(0.0),"(0.0, 0.0, 0.0, 41.0, 4.0)","(0.0, 0.0, 0.0, 0.5476190476190476, 1.0)","[-29.691950000768376, -29.691950000768376, -29...","[2.0787571786096947e-33, 2.0787571786096947e-3...",3.0


In [None]:
# Score the sample with the already-fitted stages. The original cell referenced
# `pipeline`, which is never defined (the Pipeline object is named `pipline`) and
# raised NameError; re-fitting the preprocessing on this single row would also
# learn indexer and scaler parameters from one example instead of the training data.
result = model_predictor.transform(model_preprocessor.transform(item))
result