In [47]:
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import  StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.linalg import Vectors
#from pyspark.ml.datatypes import  StringType , IntegerType
from pyspark.sql.types import *



# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('ml-bank')
    .getOrCreate()
)

# Read the CSV with a header row and inferred column types, then keep only the
# modelling columns; PerformanceRating is exposed under the name `label`.
df = (
    spark.read
    .csv('ibm-hr-analytics-attrition.csv', header=True, inferSchema=True)
    .selectExpr(
        "PerformanceRating as label",
        "Age",
        "Attrition",
        "JobSatisfaction",
        "OverTime",
        "Gender",
    )
)
df.count()

1470

In [53]:
# Discard every row that contains a null, then pull the result into pandas for display.
df = df.dropna()
df.toPandas()

Unnamed: 0,label,Age,Attrition,JobSatisfaction,OverTime,Gender
0,3,41,Yes,4,Yes,Female
1,4,49,No,2,No,Male
2,3,37,Yes,3,Yes,Male
3,3,33,No,3,Yes,Female
4,3,27,No,2,No,Male
...,...,...,...,...,...,...
1465,3,36,No,4,No,Male
1466,3,39,No,1,No,Male
1467,4,27,No,2,Yes,Male
1468,3,49,No,2,No,Male


In [62]:
# Columns handled as categories vs. columns passed through as numbers.
categorical_variables = ["Attrition", "OverTime", "Gender"]
numerical_variables = ["Age", "JobSatisfaction"]

# One StringIndexer per categorical column, writing to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=name, outputCol="{0}_indexed".format(name))
    for name in categorical_variables
]

# One OneHotEncoder per indexer, writing to "<column>_indexed_encoded".
indexed_columns = [stage.getOutputCol() for stage in indexers]
encoders = [
    OneHotEncoder(inputCol=indexed, outputCol="{0}_encoded".format(indexed))
    for indexed in indexed_columns
]

# Feature order: encoded categorical columns first, then the numeric columns.
feats = [*(stage.getOutputCol() for stage in encoders), *numerical_variables]

# Assemble the inputs into one `features` vector, then scale it into `features_scaled`.
assembler = VectorAssembler(inputCols=feats, outputCol="features")
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# Stage order matters: index -> encode -> assemble -> scale.
stages = [*indexers, *encoders, assembler, scaler]
pipline = Pipeline(stages=stages)

feats

['Attrition_indexed_encoded',
 'OverTime_indexed_encoded',
 'Gender_indexed_encoded',
 'Age',
 'JobSatisfaction']

In [64]:
# Classifier stage: reads the `features_scaled` vector, is trained against
# `label`, and is limited to 100 iterations.
lr = LogisticRegression(
    maxIter=100,
    labelCol='label',
    featuresCol='features_scaled',
)

# Keep the classifier in its own single-stage pipeline, separate from preprocessing.
model_predictor = Pipeline(stages=[lr])


In [65]:
# Fit the preprocessing stages on the full DataFrame.
# NOTE(review): this runs before the train/test split, so the fitted
# preprocessing has seen rows that later land in `test` - confirm this is intended.
model_preprocessor = pipline.fit(dataset=df)

In [None]:
# Seeded random split of `df` with weights 0.8 (train) and 0.2 (test).
train, test = df.randomSplit(weights=[0.8, 0.2], seed=12345)

In [None]:
# Run the fitted preprocessing over the training rows and show the result in pandas.
model_train_data = model_preprocessor.transform(dataset=train)
model_train_data.toPandas()


In [None]:
# Fit a fresh single-stage pipeline around `lr` instead of re-binding
# `model_predictor` to the result of its own `.fit()`. The previous form replaced
# the unfitted Pipeline with a PipelineModel, so running this cell a second time
# failed because a PipelineModel has no `fit`. `model_predictor` still ends up
# holding the fitted model, so later cells are unaffected.
model_predictor = Pipeline(stages=[lr]).fit(model_train_data)

In [None]:
# Test
# Re-create the seeded split, then print the schema of the training part
# followed by the schema of the frame it was taken from.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=12345)
train.printSchema()
df.printSchema()

In [None]:

# Score the held-out rows with the fitted preprocessing and classifier, so this
# cell no longer depends on a `result` variable that no earlier cell defines.
result = model_predictor.transform(model_preprocessor.transform(test))

# Convert to pandas once, collecting only the two columns the metric needs.
result_pd = result.select('label', 'prediction').toPandas()

labels = np.array(result_pd['label'].values)
pred_values = np.array(result_pd['prediction'].values)

# Root-mean-squared error: square the residuals, average them, then take the
# root. Taking the root of the raw residuals (the previous formula) produces NaN
# whenever a prediction is larger than its label.
errr = np.sqrt(((labels - pred_values) ** 2).mean())
errr

In [None]:
# Grab the first row of `df` and display it.
item = df.first()
item

In [None]:
# `item` is a single Row, but the ML stages operate on DataFrames: wrap it in a
# one-row DataFrame that reuses the schema of `df`.
item_df = spark.createDataFrame([item], schema=df.schema)

# Reuse the already-fitted preprocessing and classifier instead of fitting a new
# pipeline on one row. The name `pipeline` was never defined in this notebook
# (the preprocessing pipeline is spelled `pipline`).
# NOTE(review): assumes the goal is a prediction for this single row - confirm.
result = model_predictor.transform(model_preprocessor.transform(item_df))
result