## Imports

In [1]:
import pyspark  # Spark
from pyspark.ml.feature import VectorAssembler  # Combine features
from pyspark.ml.classification import DecisionTreeClassifier  # Classify

## Data sourcing

In [2]:
# Start a local Spark session using every available core, or reuse
# the session that is already running under this application name

spark = (
    pyspark.sql.SparkSession.builder
    .master('local[*]')
    .appName('customer_churn')
    .getOrCreate()
)

In [3]:
# Read the churn CSV: the first row supplies the column names and
# Spark infers each column's type from the file contents

df = spark.read.csv(path='agency_churn.csv', header=True, inferSchema=True)

In [4]:
# Confirm what kind of object the CSV reader returned
# (a Spark DataFrame, as the output below shows)

type(df)

pyspark.sql.dataframe.DataFrame

## Exploration

In [5]:
# Shape of the data as (rows, columns); Spark has no .shape attribute,
# so the row count is an action and the column count comes from the schema

n_rows = df.count()
n_cols = len(df.columns)
n_rows, n_cols

(900, 9)

In [6]:
# Column names paired with the types Spark inferred while reading the CSV

df.dtypes

[('Names', 'string'),
 ('Age', 'int'),
 ('Total_Purchase', 'double'),
 ('Account_Manager', 'int'),
 ('Years', 'double'),
 ('Onboard_date', 'string'),
 ('Location', 'string'),
 ('Company', 'string'),
 ('Churn', 'int')]

In [7]:
# Peek at the first few rows (long strings are truncated in the display)

df.show(n=3)

+----------------+---+--------------+---------------+-----+------------------+--------------------+--------------------+-----+
|           Names|Age|Total_Purchase|Account_Manager|Years|      Onboard_date|            Location|             Company|Churn|
+----------------+---+--------------+---------------+-----+------------------+--------------------+--------------------+-----+
|Cameron Williams| 42|       11066.8|              0| 7.22|2013-08-30 7:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller| 41|      11916.22|              0|  6.5|2013-08-13 0:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano| 38|      12884.75|              0| 6.67|2016-06-29 6:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
+----------------+---+--------------+---------------+-----+------------------+--------------------+--------------------+-----+
only showing top 3 rows



In [8]:
# Select a subset of columns and show the first five rows

df.select(["Names", "Age", "Years"]).show(n=5)

+----------------+---+-----+
|           Names|Age|Years|
+----------------+---+-----+
|Cameron Williams| 42| 7.22|
|   Kevin Mueller| 41|  6.5|
|     Eric Lozano| 38| 6.67|
|   Phillip White| 42| 6.71|
|  Cynthia Norton| 37| 5.56|
+----------------+---+-----+
only showing top 5 rows



In [9]:
# Summary statistics (count, mean, stddev, min, max) for three columns;
# a count below the row total reveals nulls in that column

df.describe(["Names", "Age", "Years"]).show()

+-------+-------------+-----------------+-----------------+
|summary|        Names|              Age|            Years|
+-------+-------------+-----------------+-----------------+
|  count|          900|              898|              900|
|   mean|         null|41.81069042316258| 5.27315555555555|
| stddev|         null| 6.13303075073418|1.274449013194616|
|    min|   Aaron King|               22|              1.0|
|    max|Zachary Walsh|               65|             9.15|
+-------+-------------+-----------------+-----------------+



In [10]:
# Which distinct values does the target column take?

churn_labels = df.select('Churn').distinct()
churn_labels.show()

+-----+
|Churn|
+-----+
|    1|
|    0|
+-----+



In [11]:
# Group by: average age per account-manager flag.
# Ask for the mean of 'Age' only, rather than averaging every numeric
# column and then discarding all but one; the resulting columns are still
# 'Account_Manager' and 'avg(Age)'.

df.groupBy('Account_Manager').mean('Age').show()

+---------------+------------------+
|Account_Manager|          avg(Age)|
+---------------+------------------+
|           null|              30.0|
|              1| 41.73720930232558|
|              0|41.903640256959314|
+---------------+------------------+



## It's secretly all SQL

In [12]:
# Expose the dataframe to the SQL engine under a table name
# (a temporary view must exist before SQL can refer to the data)

df.createOrReplaceTempView("churn_table")

# Run a plain SQL query against that view

query = "SELECT Names, Age, Total_Purchase FROM churn_table LIMIT 4"
spark.sql(query).show()

+----------------+---+--------------+
|           Names|Age|Total_Purchase|
+----------------+---+--------------+
|Cameron Williams| 42|       11066.8|
|   Kevin Mueller| 41|      11916.22|
|     Eric Lozano| 38|      12884.75|
|   Phillip White| 42|       8010.76|
+----------------+---+--------------+



## Cleaning

In [13]:
# Filter: how many customers are older than 40?

df.where(df.Age > 40).count()

522

In [14]:
# Count the rows whose Age is missing

df.where(df.Age.isNull()).count()

2

In [15]:
# Remove every row that has a null in any column.
# This rebinds `df`, so the exploration outputs above describe the data
# as it was before the nulls were dropped.

df = df.na.drop()

In [16]:
# Create a column derived from an existing one (Age shifted down by 7)

df = df.withColumn('Incorrect_Age', df.Age - 7)

In [17]:
# Drop the demonstration column again so it does not leak into modelling

df = df.drop("Incorrect_Age")

## Classification

### Feature selection

In [18]:
# Only the numeric columns are used as predictors

predictors = [
    'Age',
    'Total_Purchase',
    'Account_Manager',
    'Years',
]

In [19]:
# Spark ML expects the predictors of each row packed into one vector column

# Build an assembler that reads the predictor columns and writes 'Features'

vectoriser = VectorAssembler(
    outputCol='Features',
    inputCols=predictors,
)

# Apply it: the result keeps the original columns and gains 'Features'

cl_data = vectoriser.transform(df)

In [20]:
# Inspect the target next to the assembled feature vectors

cl_data.select(['Churn', 'Features']).show(n=5)

+-----+--------------------+
|Churn|            Features|
+-----+--------------------+
|    1|[42.0,11066.8,0.0...|
|    1|[41.0,11916.22,0....|
|    1|[38.0,12884.75,0....|
|    1|[42.0,8010.76,0.0...|
|    1|[37.0,9191.58,0.0...|
+-----+--------------------+
only showing top 5 rows



## Split the data

In [21]:
# Keep only what the classifier needs: the label and the feature vector

cl_data = cl_data.select(["Churn", "Features"])

In [22]:
# Randomly split into roughly 70% train / 30% test; the seed is fixed
# so that the split can be repeated

train, test = cl_data.randomSplit(weights=[0.7, 0.3], seed=451)

## Build a classifier

Not a *good* classifier, just an example one.

In [23]:
# Configure a decision tree: which column is the label and which holds
# the feature vectors (all other settings are left at their defaults)

tree = DecisionTreeClassifier(
    labelCol='Churn',
    featuresCol="Features",
)

In [24]:
# Fit the tree on the training split only; the result is a fitted model

fitted_tree = tree.fit(train)

In [25]:
# Score the held-out test split; the model's output is read from the
# 'prediction' column in the evaluation cell below

pred = fitted_tree.transform(test)

In [26]:
# Hackily evaluate it: how many test rows did we get wrong?
# Count the mismatches directly with one filter and one action, instead of
# running two separate counts over the (uncached) prediction pipeline.
# Rows with a null on either side would not be counted, but nulls were
# dropped during cleaning and the model emits a prediction for every row.

pred.filter(pred["Churn"] != pred["prediction"]).count()

57