In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('log_reg_project').getOrCreate()

In [3]:
df = spark.read.csv('/FileStore/tables/customer_churn.csv', inferSchema=True, header=True)

In [4]:
df.show()

In [5]:
df.printSchema()

In [6]:
df.na.df.show()

In [7]:
df.columns

In [8]:
from pyspark.sql import functions as F

In [9]:
# Count the nulls of every column in a single aggregation: one Spark job and
# one row of results labeled by column name, instead of one job and one
# unlabeled table per column.
null_counts = df.select([
  F.sum(F.isnull(F.col(column)).cast('int')).alias(column)
  for column in df.columns
])
null_counts.show()

In [10]:
# Projection of df onto the five columns that are also used as model inputs.
# NOTE(review): my_cols is not referenced by any later cell shown here.
my_cols = df.select('Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites')

In [11]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [12]:
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'], outputCol='features')

In [13]:
output = assembler.transform(df)

In [14]:
final_data = output.select('features', 'churn')

In [15]:
from pyspark.ml.classification import LogisticRegression

In [16]:
log_reg_churn = LogisticRegression(featuresCol='features', labelCol='churn')

In [18]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [19]:
fit_model = log_reg_churn.fit(train_data)

In [20]:
training_summary = fit_model.summary
training_summary.predictions.describe().show()

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [22]:
pred_and_labels = fit_model.evaluate(test_data)

In [23]:
pred_and_labels.predictions.show()

In [24]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='churn')

In [25]:
auc = churn_eval.evaluate(pred_and_labels.predictions)

In [26]:
auc

In [27]:
### predict on new dataset

In [28]:
final_lr_model = log_reg_churn.fit(final_data)

In [29]:
new_customers = spark.read.csv('/FileStore/tables/new_customers.csv', inferSchema=True, header=True)

In [30]:
new_customers.show()

In [31]:
new_customers.describe().show()

In [32]:
new_customers.printSchema()

In [33]:
test_new_customers = assembler.transform(new_customers)

In [34]:
test_new_customers.printSchema()

In [35]:
final_results = final_lr_model.transform(test_new_customers)

In [36]:
# Note that there were only 6 rows in new customer data
final_results.select('Company','prediction').show()