In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session that backs every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('logreg_consult')
    .getOrCreate()
)

# Read data and transform appropriately

In [4]:
# Load the labelled churn data: the first row supplies the column names and
# Spark infers each column's type from the values.
df = spark.read.csv(
    'customer_churn.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Check which types were inferred for each column before choosing features.
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [6]:
# List the column names so the feature columns can be picked out below.
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# Pack the numeric customer attributes into a single `features` vector column,
# which is the input format the classifier expects.
assembler = VectorAssembler(
    inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],
    outputCol='features',
)

In [9]:
# Append the assembled `features` column to the original DataFrame.
output = assembler.transform(df)

In [10]:
# Keep only the feature vector and the label.
# NOTE(review): the schema spells the label `Churn`; selecting 'churn' works here
# presumably because Spark resolves column names case-insensitively by default
# (spark.sql.caseSensitive=false) — confirm if that setting is ever changed.
final_df = output.select('features','churn')

In [11]:
# Peek at the first few rows of the modelling table.
final_df.show(4)

+--------------------+-----+
|            features|churn|
+--------------------+-----+
|[42.0,11066.8,0.0...|    1|
|[41.0,11916.22,0....|    1|
|[38.0,12884.75,0....|    1|
|[42.0,8010.76,0.0...|    1|
+--------------------+-----+
only showing top 4 rows



# Create model

In [12]:
from pyspark.ml.classification import LogisticRegression

In [13]:
# Unfitted estimator; the label is read from the `churn` column selected into final_df.
logreg_churn = LogisticRegression(labelCol='churn')

### Fit model to train_churn data

In [14]:
# Hold out part of the labelled data so the model can later be scored on rows
# it never saw during fitting. The fixed seed keeps the split reproducible
# across Restart & Run All.
train_churn, test_churn = final_df.randomSplit([0.7, 0.3], seed=42)

# Fit the logistic regression on the training portion only.
fitted_churn_model = logreg_churn.fit(train_churn)

### Summary for exploration

In [None]:
# Summary object attached to the fitted model; its `.predictions` DataFrame is inspected next.
training_summary = fitted_churn_model.summary

In [None]:
# Descriptive statistics of the summary's predictions table, as a quick sanity check.
training_summary.predictions.describe().show()

### Evaluate model

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Evaluate `fitted_churn_model` on the `test_churn` data.
# The result is a summary object, not a DataFrame: the scored rows are read
# from its `.predictions` attribute in the cells below.
pred_and_labels = fitted_churn_model.evaluate(test_churn)

In [None]:
# Inspect the first rows of the scored evaluation data.
pred_and_labels.predictions.show(10)

In [None]:
# Score on the model's continuous `rawPrediction` column rather than the
# thresholded 0/1 `prediction` column. Feeding hard labels to the evaluator
# collapses the ROC curve to a single operating point and distorts the metric.
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                           labelCol='churn')

In [None]:
# Apply the evaluator to the scored evaluation rows.
# NOTE(review): no metricName is set on the evaluator, so this is presumably
# its default metric (areaUnderROC) — confirm against the installed PySpark version.
auc = churn_eval.evaluate(pred_and_labels.predictions)

In [None]:
# Display the evaluation score computed above.
auc

### Predict on new data

In [17]:
# Refit the same estimator on the full labelled table (`final_df`) before
# scoring the unlabelled customers.
# NOTE(review): `legreg` looks like a typo for `logreg`; the name is kept
# because a later cell references it.
final_legreg_model = logreg_churn.fit(final_df)

In [18]:
# Load the unlabelled customers to be scored, with the same CSV options as
# the training data (header row, inferred types).
new_customers = spark.read.csv(
    'new_customers.csv',
    header=True,
    inferSchema=True,
)

In [19]:
# Inspect the inferred schema of the new-customer file before assembling features.
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [20]:
# Reuse the training-time assembler so the feature vector is built from the same columns.
test_new_customers = assembler.transform(new_customers)

In [21]:
# Verify that the `features` vector column was appended.
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [22]:
# Score the new customers with the model fitted on the full labelled table.
final_results = final_legreg_model.transform(test_new_customers)

In [23]:
# Show the predicted class per company (1.0 is the positive `churn` label).
final_results.select('Company','prediction').show(10)

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

