In [68]:
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import  StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.linalg import Vectors
#from pyspark.ml.datatypes import  StringType , IntegerType
from pyspark.sql.types import *



spark = SparkSession.builder.appName('ml-bank').getOrCreate()
df = spark.read.csv('ibm-hr-analytics-attrition.csv', header = True, inferSchema = True)

df = df.selectExpr("PerformanceRating as label","Age","Attrition","JobSatisfaction","OverTime","Gender")
df.count()

1470

In [69]:
df = df.na.drop()
df.toPandas()

Unnamed: 0,label,Age,Attrition,JobSatisfaction,OverTime,Gender
0,3,41,Yes,4,Yes,Female
1,4,49,No,2,No,Male
2,3,37,Yes,3,Yes,Male
3,3,33,No,3,Yes,Female
4,3,27,No,2,No,Male
...,...,...,...,...,...,...
1465,3,36,No,4,No,Male
1466,3,39,No,1,No,Male
1467,4,27,No,2,Yes,Male
1468,3,49,No,2,No,Male


In [13]:
categorical_variables = [ "Attrition",  "OverTime", "Gender"]

indexers = [StringIndexer(inputCol=column, outputCol=column+"-index") for column in categorical_variables]
encoder = OneHotEncoder(inputCols=[indexer.getOutputCol() for indexer in indexers], outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers])
assembler = VectorAssembler(inputCols=encoder.getOutputCols(), outputCol="features")
scaler = MinMaxScaler(inputCol="features", outputCol="scaled-features")
lr = LogisticRegression(featuresCol = 'scaled-features', labelCol = 'label', maxIter=100)


model = Pipeline(stages= indexers + [encoder, assembler , scaler , lr] )

In [14]:
train, test = df.randomSplit([0.7,0.3],seed=12345)
train.printSchema()

root
 |-- label: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- OverTime: string (nullable = true)
 |-- Gender: string (nullable = true)



In [17]:
model = model.fit(train)


In [40]:
result = model.transform(test) 
result.toPandas()

Unnamed: 0,label,Age,Attrition,JobSatisfaction,OverTime,Gender,Attrition-index,OverTime-index,Gender-index,Attrition-index-encoded,OverTime-index-encoded,Gender-index-encoded,features,scaled-features,rawPrediction,probability,prediction
0,3,18,Yes,3,No,Male,1.0,0.0,0.0,(0.0),(1.0),(1.0),"[0.0, 1.0, 1.0]","[0.0, 1.0, 1.0]","[-44.53508413498451, -44.53508413498451, -44.5...","[1.6607799328237562e-49, 1.6607799328237562e-4...",3.0
1,3,20,No,1,No,Male,0.0,0.0,0.0,(1.0),(1.0),(1.0),"[1.0, 1.0, 1.0]","[1.0, 1.0, 1.0]","[-79.78840755831078, -79.78840755831078, -79.7...","[8.456060164600909e-88, 8.456060164600909e-88,...",3.0
2,3,20,No,4,No,Male,0.0,0.0,0.0,(1.0),(1.0),(1.0),"[1.0, 1.0, 1.0]","[1.0, 1.0, 1.0]","[-79.78840755831078, -79.78840755831078, -79.7...","[8.456060164600909e-88, 8.456060164600909e-88,...",3.0
3,3,21,No,1,Yes,Male,0.0,1.0,0.0,(1.0),(0.0),(1.0),"[1.0, 0.0, 1.0]","[1.0, 0.0, 1.0]","[-59.747133528916734, -59.747133528916734, -59...","[4.780888560579875e-66, 4.780888560579875e-66,...",3.0
4,3,21,No,2,Yes,Female,0.0,1.0,1.0,(1.0),(0.0),(0.0),"[1.0, 0.0, 0.0]","[1.0, 0.0, 0.0]","[-44.36697081522788, -44.36697081522788, -44.3...","[2.3176073066928632e-49, 2.3176073066928632e-4...",3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,4,51,No,4,No,Male,0.0,0.0,0.0,(1.0),(1.0),(1.0),"[1.0, 1.0, 1.0]","[1.0, 1.0, 1.0]","[-79.78840755831078, -79.78840755831078, -79.7...","[8.456060164600909e-88, 8.456060164600909e-88,...",3.0
429,4,51,No,4,Yes,Male,0.0,1.0,0.0,(1.0),(0.0),(1.0),"[1.0, 0.0, 1.0]","[1.0, 0.0, 1.0]","[-59.747133528916734, -59.747133528916734, -59...","[4.780888560579875e-66, 4.780888560579875e-66,...",3.0
430,4,54,No,4,No,Female,0.0,0.0,1.0,(1.0),(1.0),(0.0),"[1.0, 1.0, 0.0]","[1.0, 1.0, 0.0]","[-64.40824484462192, -64.40824484462192, -64.4...","[4.101250091340616e-71, 4.101250091340616e-71,...",3.0
431,4,55,No,4,No,Male,0.0,0.0,0.0,(1.0),(1.0),(1.0),"[1.0, 1.0, 1.0]","[1.0, 1.0, 1.0]","[-79.78840755831078, -79.78840755831078, -79.7...","[8.456060164600909e-88, 8.456060164600909e-88,...",3.0


In [43]:
labels =  result.toPandas()['label'].values
labels = np.array(labels)

pred_values = result.toPandas()['prediction'].values
pred_values = np.array(pred_values)

errr = np.sqrt(labels - pred_values).mean()
errr

0.16397228637413394

In [70]:
item =  df.first()
item

Row(label=3, Age=41, Attrition='Yes', JobSatisfaction=4, OverTime='Yes', Gender='Female')

In [72]:
result = model.fit(item).transform(item)
result

AttributeError: 'PipelineModel' object has no attribute 'fit'