In [1]:
# To do
# Use one of the datasets at https://www.ibm.com/communities/analytics/watson-analytics-blog/guide-to-sample-datasets/ and repeat this analysis
# First identify the type of each column (Yes/No, categorical list)
# Arrange the columns into feature vectors: Yes/No columns, then categorical (list) columns
# Null values should be replaced with a default value

# Goal of this model is to predict customer churn using RandomForest
# Data is taken from https://www.ibm.com/communities/analytics/watson-analytics-blog/guide-to-sample-datasets/
from pyspark.sql import SQLContext

from pyspark import SparkContext

# `sc` is not created anywhere in this notebook; presumably the hosting
# runtime injects the SparkContext -- verify when running elsewhere.
sqlContext = SQLContext(sc)

# Read the churn CSV with the header and inferSchema options switched on.
churn_csv_path = '/FileStore/tables/WA_Fn_UseC__Telco_Customer_Churn-89c80.csv'
csv_options = {'header': 'true', 'inferSchema': 'true'}
df = sqlContext.read.csv(churn_csv_path, **csv_options)
# Cache the frame; as the cell's last expression its repr is also displayed.
df.cache()

In [2]:
# Step 1 - Data analysis
# Get to know the target column: which values "Churn" takes and how often.

# Distinct Churn values.
churn_values = df.select("Churn").distinct()
churn_values.show()

# Row count per Churn value.
churn_counts = df.groupBy("Churn").count()
churn_counts.show()

In [3]:
# Replace empty-string cells with the string "0", then drop every row that
# still contains a null in any column.
# NOTE(review): only exact empty strings ('') are matched; whitespace-only
# cells would pass through untouched -- confirm against the raw CSV.
df = df.replace('', "0").na.drop()
 

In [4]:
# Number of rows left after the blank/null cleanup.
df.count() 

In [5]:
# Preview the first five rows.
df.show(5)

In [6]:
# A few service columns carry a third value besides Yes/No
# ("No phone service" / "No internet service"); fold it into "No" so the
# columns become strictly binary.
# NOTE: the star import is kept for backward compatibility with cells that may
# rely on it, but it shadows Python builtins such as `sum`, `min` and `max`.
from pyspark.sql.functions import *

df = df.withColumn("MultipleLines", regexp_replace('MultipleLines', 'No phone service', 'No'))

# Each column is rewritten from ITS OWN values. Previously StreamingTV,
# TechSupport and DeviceProtection were all populated from StreamingMovies,
# which silently replaced three features with copies of a fourth.
internet_service_cols = [
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]
for service_col in internet_service_cols:
    df = df.withColumn(service_col, regexp_replace(service_col, 'No internet service', 'No'))




In [7]:
# Show the distinct values of OnlineSecurity after the replacement step.
df.select("OnlineSecurity").distinct().show()

In [8]:
# Remove the customerID and tenure columns, then preview the result.
# NOTE(review): tenure is dropped here and never used as a feature -- confirm
# that excluding it is intentional.
df = df.drop("customerID").drop("tenure")
df.show(5)


In [9]:
# Lookup table used to turn Yes/No/U answers into 1/0 integers.
recode_dictionary = {'YNU': {'Yes': 1, 'No': 0, 'U': 0}}
import pyspark.sql.types as typ

# (name, dataType) pair for every column in the current schema.
cols = [(field.name, field.dataType) for field in df.schema]

# Collect the string-typed columns whose distinct values include 'Yes'.
YNU_cols = []
for column_name, column_type in cols:
    if column_type == typ.StringType():
        # Pull the distinct values of this column back to the driver.
        distinct_rows = df.select(column_name).distinct().collect()
        dis = [row[0] for row in distinct_rows]

        if 'Yes' in dis:
            YNU_cols.append(column_name)

In [10]:
# List the columns detected as Yes/No columns.
print(YNU_cols)

In [11]:
# Show the distinct values currently held by DeviceProtection.
df.select("DeviceProtection").distinct().show()

In [12]:
# now transform the YNU cols to 0/1
import pyspark.sql.functions as func
def recode(col, key):
    """Return the integer code for the value `col` from the `key` table.

    Looks up `recode_dictionary[key]` and then the entry for `col` inside it.
    A `key` or `col` that is absent from the table raises KeyError.
    """
    mapping = recode_dictionary[key]
    return mapping[col]
  
# Wrap recode() as a Spark UDF that returns an integer column.
rec_integer = func.udf(recode, typ.IntegerType())

# Build the projection: Yes/No columns are recoded through the 'YNU' table
# (keeping their original name); every other column is passed through by name.
exprs_YNU = []
for column_name in df.columns:
    if column_name in YNU_cols:
        recoded = rec_integer(column_name, func.lit('YNU')).alias(column_name)
        exprs_YNU.append(recoded)
    else:
        exprs_YNU.append(column_name)

In [13]:
# Display the projection list (recoded Column expressions mixed with plain column names).
exprs_YNU

In [14]:
# Apply the projection: Yes/No columns become integer-coded, the rest are unchanged.
df = df.select(exprs_YNU)

In [15]:
df.show(5)
# InternetService is a categorical column, not a binary one, so it cannot be recoded as Yes/No

In [16]:
# String columns to be indexed and one-hot encoded.
categorical_features = [
    'gender',
    'Contract',
    'PaymentMethod',
    'InternetService',
]

In [17]:
def create_category_vars( dataset, field_name ):
    """Index and one-hot encode one categorical column.

    Adds two columns to `dataset`:
      * `<field_name>Index` -- numeric index produced by StringIndexer
      * `<field_name>Vec`   -- one-hot vector of that index (last category
        dropped, as requested by dropLast=True)

    Parameters:
        dataset: DataFrame containing the column `field_name`.
        field_name: name of the categorical column to encode.

    Returns:
        A new DataFrame with the two extra columns appended.
    """
    idx_col = field_name + "Index"
    col_vec = field_name + "Vec"

    # Replace the string values of the column with numeric indices.
    indexer = StringIndexer( inputCol=field_name, outputCol=idx_col )
    indexed = indexer.fit( dataset ).transform( dataset )

    encoder = OneHotEncoder( dropLast=True,
                             inputCol=idx_col,
                             outputCol=col_vec )

    # Spark 2.x ships OneHotEncoder as a plain Transformer, whereas Spark 3.x
    # made it an Estimator that must be fitted first. Fitting only when `fit`
    # exists keeps the function working on both.
    if hasattr( encoder, "fit" ):
        encoder = encoder.fit( indexed )

    return encoder.transform( indexed )

In [18]:
# OneHot Encoding for all the categorical columns

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, PolynomialExpansion, VectorIndexer

# Encode each categorical column in turn.
# The loop variable is deliberately not called `col`: at module level that
# would overwrite the `col` function brought in by the earlier
# `from pyspark.sql.functions import *`.
for categorical_col in categorical_features:
  df = create_category_vars( df, categorical_col )

In [19]:
# Preview the frame with the added *Index and *Vec columns.
df.show(5)

In [20]:
# Binary (0/1) columns used as model features.
YNU_cols_updated = [
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
]

In [21]:
# Input columns for the VectorAssembler, in order: binary flags, one-hot
# vectors, then the numeric charge columns.
encoded_vector_cols = ['genderVec', 'ContractVec', 'PaymentMethodVec', 'InternetServiceVec']
numeric_cols = ['MonthlyCharges', 'TotalCharges']
featureCols = YNU_cols_updated + encoded_vector_cols + numeric_cols

In [22]:
# Columns carried over unchanged: the binary flags followed by the one-hot vectors.
passthrough_columns = [
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'genderVec',
    'ContractVec',
    'PaymentMethodVec',
    'InternetServiceVec',
]
# The two charge columns are cast to float; Churn is kept as the target.
typed_columns = [
    df.MonthlyCharges.cast("float"),
    df.TotalCharges.cast("float"),
    df.Churn,
]

# Keep only the modelling columns and rename the target to "label".
df = df.select(passthrough_columns + typed_columns).withColumnRenamed("Churn", "label")

In [23]:
# Assemble every column named in featureCols into one vector column called "features".
assembler = VectorAssembler(outputCol="features", inputCols=featureCols)

# Append the assembled "features" column to the frame.
customer_train_df = assembler.transform(df)



In [24]:
# Print the schema of the assembled frame (includes the new "features" column).
customer_train_df.printSchema()


In [25]:
# Split the data 70% / 30% into training and test sets.
# A fixed seed makes the split identical on every run, so metrics from
# different runs and different models can be compared.
customer_train, customer_test = customer_train_df.randomSplit([0.7, 0.3], seed=42)


In [26]:
# Build the random forest model.

from pyspark.ml.classification import RandomForestClassifier

# "label" is the target column and "features" is the input vector column.
rf_settings = {"labelCol": "label", "featuresCol": "features"}
rf = RandomForestClassifier(**rf_settings)

# Fit on the training split, then score the test split.
rfModel = rf.fit(customer_train)
predictions = rfModel.transform(customer_test)

In [27]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the random-forest predictions on the test split.
# BinaryClassificationEvaluator reports an area-under-curve metric, not
# accuracy, so "1 - value" is not an error rate. The metric is named
# explicitly and the output is labelled with what it actually is.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
# Variable name kept for backward compatibility; it holds the area under ROC.
accuracyDt = evaluator.evaluate(predictions)
print("Area under ROC = %g " % accuracyDt)

In [28]:
# Show the true label (the renamed Churn column) next to the model's
# prediction, together with the feature vector and MonthlyCharges.
preview_columns = ["label", "prediction", "features", "MonthlyCharges"]
selected = predictions.select(preview_columns)
selected.show()

In [29]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree configured with maxDepth=3, same label/features columns as the forest.
dt_settings = {"labelCol": "label", "featuresCol": "features", "maxDepth": 3}
dt = DecisionTreeClassifier(**dt_settings)

# Fit on the training split.
dtModel = dt.fit(customer_train)

# Score the test split and inspect the columns the model added.
predictions_dt = dtModel.transform(customer_test)
predictions_dt.printSchema()

In [30]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the decision-tree predictions on the test split.
# BinaryClassificationEvaluator reports an area-under-curve metric, not
# accuracy, so "1 - value" is not an error rate. The metric is named
# explicitly and the output is labelled with what it actually is.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
# Variable name kept for backward compatibility; it holds the area under ROC.
accuracyDt = evaluator.evaluate(predictions_dt)
print("Area under ROC = %g " % accuracyDt)


In [31]:
# Logistic regression with the RDD-based MLlib API.
# MLlib trains on an RDD of LabeledPoint built from mllib vectors, so the
# DataFrame splits (which hold ml vectors) are converted first.
from pyspark.mllib.classification  import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors as MLLibVectors
import pyspark.mllib.evaluation as ev


def _to_labeled_point(row):
    """Convert a (label, features) DataFrame row into an MLlib LabeledPoint."""
    return LabeledPoint(float(row.label), MLLibVectors.fromML(row.features))


lr_train_points = customer_train.select("label", "features").rdd.map(_to_labeled_point)
lr_test_points = customer_test.select("label", "features").rdd.map(_to_labeled_point)

LR_Model_2 = LogisticRegressionWithLBFGS \
    .train(lr_train_points, iterations=10)

# BinaryClassificationMetrics expects (score, label) pairs, in that order.
# Both sides of the zip derive from lr_test_points through a plain map, so
# they line up element by element.
LR_results_2 = (
        LR_Model_2 \
        .predict(lr_test_points.map(lambda point: point.features)) \
        .zip(lr_test_points.map(lambda point: point.label))
    ).map(lambda pair: (float(pair[0]), float(pair[1])))

LR_evaluation = ev.BinaryClassificationMetrics(LR_results_2)

In [32]:
# Report both area-under-curve metrics of the logistic regression model,
# rounded to two decimals.
lr_metric_lines = (
    ('Area under PR: {0:.2f}', LR_evaluation.areaUnderPR),
    ('Area under ROC: {0:.2f}', LR_evaluation.areaUnderROC),
)
for template, metric_value in lr_metric_lines:
    print(template.format(metric_value))