## Spark Data Operation and Machine Learning
- The purpose is to learn about spark data operations using pyspark (Spark SQL) as well as learn to do machine learning with pyspark (MLlib)

### Set up

In [1]:
!wget https://github.com/ppkgtmm/big-data/raw/main/setup_spark_colab.sh

--2022-03-06 17:51:05--  https://github.com/ppkgtmm/big-data/raw/main/setup_spark_colab.sh
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ppkgtmm/big-data/main/setup_spark_colab.sh [following]
--2022-03-06 17:51:05--  https://raw.githubusercontent.com/ppkgtmm/big-data/main/setup_spark_colab.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 320 [text/plain]
Saving to: ‘setup_spark_colab.sh’


2022-03-06 17:51:05 (8.24 MB/s) - ‘setup_spark_colab.sh’ saved [320/320]



In [2]:
%%bash
chmod +x ./setup_spark_colab.sh
./setup_spark_colab.sh &> /dev/null

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop2.7"

### Import libraries

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, countDistinct, col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, \
IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, \
MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import pandas as pd

### Create session
- Spark session is an entry point to spark applicaion

In [5]:
spark = SparkSession \
    .builder \
    .appName("Lecture 7 Spark") \
    .getOrCreate()

#### Note: All .toPandas() here are unnecessary in the real use. We use them here in order to display the result only.

### Spark SQL

In [6]:
spark.sql('SHOW DATABASES').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



#### Create Spark DataFrame from a file

In [7]:
df = (
    spark
    .read
    .format('csv')
    .option('inferSchema', True)
    .option('delimiter',',')
    .option('header', True)
    .load('Superstore.csv')
)

In [8]:
df.limit(5).toPandas()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


#### Rename columns

In [9]:
new_names = [c.replace(' ','_') for c in df.columns]
print(new_names)

df = df.toDF(*new_names)
df.limit(5).toPandas()

['Row_ID', 'Order_ID', 'Order_Date', 'Ship_Date', 'Ship_Mode', 'Customer_ID', 'Customer_Name', 'Segment', 'Country', 'City', 'State', 'Postal_Code', 'Region', 'Product_ID', 'Category', 'Sub-Category', 'Product_Name', 'Sales', 'Quantity', 'Discount', 'Profit']


Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,...,Postal_Code,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


#### Write DataFrame to Hive table

In [10]:
df.write.format('parquet').mode('overwrite').saveAsTable('superstore')

In [11]:
spark.sql('SHOW TABLES').show()

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|  default|superstore|      false|
+---------+----------+-----------+



In [12]:
df = spark.sql('SELECT * FROM superstore')

In [13]:
df.limit(5).toPandas()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,...,Postal_Code,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


#### Print DataFrame structure

In [14]:
(df.count(), len(df.columns))

(9994, 21)

In [15]:
df.printSchema()

root
 |-- Row_ID: integer (nullable = true)
 |-- Order_ID: string (nullable = true)
 |-- Order_Date: timestamp (nullable = true)
 |-- Ship_Date: timestamp (nullable = true)
 |-- Ship_Mode: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal_Code: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Discount: string (nullable = true)
 |-- Profit: double (nullable = true)



#### Show DataFrame

In [16]:
df.limit(1).toPandas()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,...,Postal_Code,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0,41.9136


#### Select columns

In [17]:
df.select('Sales').toPandas()

Unnamed: 0,Sales
0,261.96
1,731.94
2,14.62
3,957.5775
4,22.368
...,...
9989,25.248
9990,91.96
9991,258.576
9992,29.6


In [18]:
df.select(['Sales','Profit']).toPandas()

Unnamed: 0,Sales,Profit
0,261.96,41.9136
1,731.94,219.5820
2,14.62,6.8714
3,957.5775,-383.0310
4,22.368,2.5164
...,...,...
9989,25.248,4.1028
9990,91.96,15.6332
9991,258.576,19.3932
9992,29.6,13.3200


#### Sorting

In [19]:
df.orderBy('Order_ID').limit(5).toPandas()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,...,Postal_Code,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit
0,2718,CA-2014-100006,2014-09-07,2014-09-13,Standard Class,DK-13375,Dennis Kane,Consumer,United States,New York City,...,10024,East,TEC-PH-10002075,Technology,Phones,AT&T EL51110 DECT,377.97,3.0,0.0,109.6113
1,6289,CA-2014-100090,2014-07-08,2014-07-12,Standard Class,EB-13705,Ed Braxton,Corporate,United States,San Francisco,...,94122,West,OFF-BI-10001597,Office Supplies,Binders,"Wilson Jones Ledger-Size, Piano-Hinge Binder, 2""","Blue""",196.704,6.0,0.2
2,6288,CA-2014-100090,2014-07-08,2014-07-12,Standard Class,EB-13705,Ed Braxton,Corporate,United States,San Francisco,...,94122,West,FUR-TA-10003715,Furniture,Tables,Hon 2111 Invitation Series Corner Table,502.488,3.0,0.2,-87.9354
3,9515,CA-2014-100293,2014-03-14,2014-03-18,Standard Class,NF-18475,Neil Französisch,Home Office,United States,Jacksonville,...,32216,South,OFF-PA-10000176,Office Supplies,Paper,Xerox 1887,91.056,6.0,0.2,31.8696
4,3084,CA-2014-100328,2014-01-28,2014-02-03,Standard Class,JC-15340,Jasper Cacioppo,Consumer,United States,New York City,...,10024,East,OFF-BI-10000343,Office Supplies,Binders,"""Pressboard Covers with Storage Hooks, 9 1/2""""...","Light Blue""",3.928,1.0,0.2


#### Filtering

In [20]:
df[df.Ship_Mode == 'Second Class'].limit(5).toPandas()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,...,Postal_Code,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0,6.8714
3,18,CA-2014-167164,2014-05-13,2014-05-15,Second Class,AG-10270,Alejandro Grove,Consumer,United States,West Jordan,...,84084,West,OFF-ST-10000107,Office Supplies,Storage,Fellowes Super Stor/Drawer,55.5,2,0,9.99
4,19,CA-2014-143336,2014-08-27,2014-09-01,Second Class,ZD-21925,Zuschuss Donatelli,Consumer,United States,San Francisco,...,94109,West,OFF-AR-10003056,Office Supplies,Art,Newell 341,8.56,2,0,2.4824


In [21]:
df.filter('Ship_Mode == "Standard Class" and Customer_ID  == "SO-20335"').toPandas()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,...,Postal_Code,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit
0,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
1,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164
2,3374,CA-2015-161718,2015-12-04,2015-12-10,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Hempstead,...,11550,East,FUR-FU-10002445,Furniture,Furnishings,"DAX Two-Tone Rosewood/Black Document Frame, De...",28.44,3,0.0,11.376
3,3375,CA-2015-161718,2015-12-04,2015-12-10,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Hempstead,...,11550,East,FUR-CH-10002372,Furniture,Chairs,Office Star - Ergonomically Designed Knee Chair,364.41,5,0.1,8.098
4,3376,CA-2015-161718,2015-12-04,2015-12-10,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Hempstead,...,11550,East,TEC-PH-10000376,Technology,Phones,Square Credit Card Reader,39.96,4,0.0,10.3896
5,3377,CA-2015-161718,2015-12-04,2015-12-10,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Hempstead,...,11550,East,FUR-CH-10002965,Furniture,Chairs,Global Leather Highback Executive Chair with P...,361.764,2,0.1,68.3332
6,4623,CA-2017-147228,2017-09-09,2017-09-14,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Columbia,...,38401,South,OFF-SU-10001225,Office Supplies,Supplies,Staple remover,8.832,3,0.2,-1.9872
7,4624,CA-2017-147228,2017-09-09,2017-09-14,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Columbia,...,38401,South,OFF-PA-10000357,Office Supplies,Paper,Xerox 1888,177.536,4,0.2,62.1376
8,4625,CA-2017-147228,2017-09-09,2017-09-14,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Columbia,...,38401,South,OFF-ST-10000046,Office Supplies,Storage,Fellowes Super Stor/Drawer Files,258.48,2,0.2,-3.231
9,4626,CA-2017-147228,2017-09-09,2017-09-14,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Columbia,...,38401,South,FUR-FU-10000023,Furniture,Furnishings,Eldon Wave Desk Accessories,14.136,3,0.2,4.2408


#### Add a constant column

In [22]:
from pyspark.sql.functions import lit

# not mutate original data
df.withColumn('One',lit(1)).limit(5).toPandas()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,...,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit,One
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,1
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,1
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714,1
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031,1
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164,1


In [23]:
df.limit(5).toPandas()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,...,Postal_Code,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


#### Add a column (from other columns)

In [24]:
df.withColumn('Margin',df['Profit']/df['Sales']).limit(5).toPandas()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,...,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit,Margin
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,0.16
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,0.3
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714,0.47
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031,-0.4
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164,0.1125


#### Replace/update the column

In [25]:
df1 = df.withColumn('Unit_Price', df['Sales'] + df['Profit'])

In [26]:
df1.withColumn('Unit_Price',df1['Unit_Price'] / df1['Quantity'])\
   .select(['Product_ID','Product_Name','Unit_Price']).limit(5).show()

+---------------+--------------------+------------------+
|     Product_ID|        Product_Name|        Unit_Price|
+---------------+--------------------+------------------+
|FUR-BO-10001798|Bush Somerset Col...|          151.9368|
|FUR-CH-10000454|Hon Deluxe Fabric...|317.17400000000004|
|OFF-LA-10000240|Self-Adhesive Add...|           10.7457|
|FUR-TA-10000577|Bretford CR4500 S...|114.90929999999999|
|OFF-ST-10000760|Eldon Fold 'N Rol...|           12.4422|
+---------------+--------------------+------------------+



#### Convert a column data type

In [27]:
df.withColumn('Postal_Code',df['Postal_Code'].cast('string'))\
  .printSchema()

root
 |-- Row_ID: integer (nullable = true)
 |-- Order_ID: string (nullable = true)
 |-- Order_Date: timestamp (nullable = true)
 |-- Ship_Date: timestamp (nullable = true)
 |-- Ship_Mode: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal_Code: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Discount: string (nullable = true)
 |-- Profit: double (nullable = true)



#### Drop columns

In [28]:
df.drop('Sales','Discount','Profit').printSchema()

root
 |-- Row_ID: integer (nullable = true)
 |-- Order_ID: string (nullable = true)
 |-- Order_Date: timestamp (nullable = true)
 |-- Ship_Date: timestamp (nullable = true)
 |-- Ship_Mode: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal_Code: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Quantity: string (nullable = true)



#### Check distribution

In [29]:
df.describe().toPandas()

Unnamed: 0,summary,Row_ID,Order_ID,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,State,Postal_Code,Region,Product_ID,Category,Sub-Category,Product_Name,Sales,Quantity,Discount,Profit
0,count,9994.0,9994,9994,9994,9994,9994,9994,9994,9994,9994.0,9994,9994,9994,9994,9994,9994,9994,9994,9994.0
1,mean,4997.5,,,,,,,,,55190.3794276566,,,,,,234.41818199917006,5.828590535392018,0.3155949113492862,28.587912967780834
2,stddev,2885.1636290974325,,,,,,,,,32063.693350364487,,,,,,631.7890112674363,25.520975563736403,3.314008629792499,234.3891156047269
3,min,1.0,CA-2014-100006,First Class,AA-10315,Aaron Bergman,Consumer,United States,Aberdeen,Alabama,1040.0,Central,FUR-BO-10000112,Furniture,Accessories,"""""""While you Were Out"""" Message Book","10/Pack""","1040 sheets""","30/Box""",-6599.978
4,max,9994.0,US-2017-169551,Standard Class,ZD-21925,Zuschuss Donatelli,Home Office,United States,Yuma,Wyoming,99301.0,West,TEC-PH-10004977,Technology,Tables,netTALK DUO VoIP Telephone Service,999.98,98.352,98.352,8399.976


#### Check missing data

In [30]:
df.filter("Sales is NULL").count()

0

In [31]:
df.filter(df['Sales'].isNull()).count()

0

#### Aggregation

In [32]:
df.groupBy('Category').agg({'Sales':'sum'}).show()

+---------------+-----------------+
|       Category|       sum(Sales)|
+---------------+-----------------+
|Office Supplies|703502.9280000031|
|      Furniture|733046.8612999996|
|     Technology|835900.0669999964|
+---------------+-----------------+



#### Customer single view

In [33]:
df.groupBy('Customer_ID')\
  .agg({'Sales':'sum','Order_ID':'count'}).show()

+-----------+---------------+------------------+
|Customer_ID|count(Order_ID)|        sum(Sales)|
+-----------+---------------+------------------+
|   VW-21775|             18|          6112.702|
|   PB-19210|              2|           132.738|
|   RR-19315|              4|           615.932|
|   EM-13960|              6|           933.704|
|   MY-17380|             13|2242.6119999999996|
|   MS-17530|              7|475.65599999999995|
|   KH-16630|             17|          3918.966|
|   BD-11500|             10|4408.2970000000005|
|   SW-20275|              7|           1966.65|
|   AH-10690|             23| 7888.294000000001|
|   PH-18790|              2|           729.648|
|   JF-15490|             15|          1082.918|
|   JF-15415|             14|          2371.448|
|   IM-15070|             21| 4930.473999999999|
|   PW-19240|             12|3878.3149999999996|
|   JH-15985|             14|          7954.998|
|   OT-18730|             10|3818.6240000000003|
|   KM-16225|       

### MLlib

#### Import data

In [34]:
df = (
    spark
    .read
    .option('header', True)
    .option('inferSchema', True)
    .csv('telco.csv')
)
df.limit(5).toPandas()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


#### Drop ID column

In [35]:
df = df.drop('customerID')
df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: double (nullable = true)
 |-- Churn: string (nullable = true)



#### Check data distribution

In [36]:
df.describe().toPandas().set_index('summary').T

summary,count,mean,stddev,min,max
gender,7043,,,Female,Male
SeniorCitizen,7043,0.1621468124378816,0.3686116056100135,0,1
Partner,7043,,,No,Yes
Dependents,7043,,,No,Yes
tenure,7043,32.37114865824223,24.55948102309444,0,72
PhoneService,7043,,,No,Yes
MultipleLines,7043,,,No,Yes
InternetService,7043,,,DSL,No
OnlineSecurity,7043,,,No,Yes
OnlineBackup,7043,,,No,Yes


In [37]:
num_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
cat_cols = list(set(df.columns) - set(num_cols))

#### Split data into train/test

In [38]:
(trainset, testset) = df.randomSplit([0.7, 0.3], seed=11)
trainset.count(), testset.count()

(4958, 2085)

#### Feature transformation

In [39]:
gender_indexer = (
    StringIndexer(inputCol='gender', outputCol='gender_prep')
    .fit(df)
)
(
  gender_indexer
 .transform(df)
 .limit(5)
 .select(['gender', 'gender_prep'])
 .toPandas()
)

Unnamed: 0,gender,gender_prep
0,Female,1.0
1,Male,0.0
2,Male,0.0
3,Male,0.0
4,Female,1.0


#### Check binary columns

In [40]:
distinctDf = (
    df
    .agg(*(countDistinct(col(c)).alias(c) for c in cat_cols))
    .toPandas()
)
distinctDf

Unnamed: 0,OnlineBackup,TechSupport,StreamingTV,PaperlessBilling,Dependents,StreamingMovies,PhoneService,Contract,Partner,InternetService,OnlineSecurity,PaymentMethod,DeviceProtection,Churn,gender,MultipleLines
0,3,3,3,2,2,3,2,3,2,3,3,4,3,2,2,3


In [41]:
preprocessor = []
cat_prep_cols = []
for col in distinctDf.columns:
  if col == 'Churn':
    continue
  card = distinctDf[col][0]
  if card <= 2:
    prep = StringIndexer(inputCol=col, outputCol=col + '_prep')
  else:
    ind = StringIndexer(inputCol=col, outputCol=col + '_index')
    preprocessor.append(ind)
    prep = OneHotEncoder(inputCol=col + '_index', outputCol=col + '_prep')
  preprocessor.append(prep)
  cat_prep_cols.append(col + '_prep')

target_name = 'Churn_prep'
churn_indexer = StringIndexer(inputCol='Churn', outputCol=target_name)

#### Assemble features

In [42]:
feature_cols = cat_prep_cols + num_cols

In [43]:
vec_assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

#### Random Forest model

In [44]:
rf = RandomForestClassifier(featuresCol='features', labelCol=target_name)

#### Label inversion

In [45]:
label_inverter = IndexToString(
    inputCol='prediction',
    outputCol='predictionLabel',
    labels = churn_indexer.fit(trainset).labels
)

#### Build pipeline

In [46]:
pipeline = Pipeline(
    stages=preprocessor + [churn_indexer, vec_assembler, rf, label_inverter]
)

#### Train model

In [47]:
model = pipeline.fit(trainset)

#### Predict

In [48]:
predictions = model.transform(testset)
(
  predictions
 .select(['Churn', 'predictionLabel', 'probability'])
 .limit(5)
 .toPandas()
)

Unnamed: 0,Churn,predictionLabel,probability
0,No,No,"[0.536137387189313, 0.46386261281068697]"
1,Yes,No,"[0.536137387189313, 0.46386261281068697]"
2,Yes,No,"[0.5228847934661265, 0.4771152065338734]"
3,No,No,"[0.6665744897555973, 0.33342551024440276]"
4,No,No,"[0.6547466815649626, 0.34525331843503737]"


#### Evaluation

In [49]:
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_name, 
    predictionCol='prediction', 
    metricName='accuracy'
)
print('Test Accuracy = {:.4f}'.format(acc_evaluator.evaluate(predictions)))

Test Accuracy = 0.8005


In [50]:
prec_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_name, 
    predictionCol='prediction', 
    metricName='weightedPrecision'
)
print('Test Precision = {:.4f}'.format(prec_evaluator.evaluate(predictions)))

Test Precision = 0.7885


In [51]:
rec_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_name, 
    predictionCol='prediction', 
    metricName='weightedRecall'
)
print('Test Recall = {:.4f}'.format(rec_evaluator.evaluate(predictions)))

Test Recall = 0.8005


In [52]:
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_name, 
    predictionCol='prediction', 
    metricName='f1'
)
print('Test F1 = {:.4f}'.format(f1_evaluator.evaluate(predictions)))

Test F1 = 0.7766


In [53]:
auc_evaluator = BinaryClassificationEvaluator(
    labelCol=target_name, 
    rawPredictionCol='rawPrediction', 
    metricName='areaUnderROC'
)
print('Test AUC = {:.4f}'.format(auc_evaluator.evaluate(predictions)))

Test AUC = 0.8548


#### Parameter tuning

In [54]:
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 100]) # values specified are fixed values
    .addGrid(rf.maxDepth, [4, 8, 12])
    .build()
)

In [55]:
evaluator_bin = BinaryClassificationEvaluator(
    labelCol=target_name, 
    rawPredictionCol='rawPrediction'
)
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator_bin,
    numFolds=3,
    seed=11
)

In [56]:
cv_model = crossval.fit(trainset)
best_model = cv_model.bestModel

In [57]:
pd.DataFrame.from_dict([
  {cv_model.getEvaluator().getMetricName() : metric, **p}
  for p, metric in zip(paramGrid, cv_model.avgMetrics)
])

Unnamed: 0,areaUnderROC,RandomForestClassifier_26af3a258eb8__numTrees,RandomForestClassifier_26af3a258eb8__maxDepth
0,0.821948,10,4
1,0.829782,10,8
2,0.809189,10,12
3,0.834467,100,4
4,0.840185,100,8
5,0.827187,100,12


#### Prediction

In [58]:
cv_preds = best_model.transform(testset)

#### Evaluation

In [59]:
prec_tuned_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_name, 
    predictionCol='prediction', 
    metricName='weightedPrecision'
)
print('Tuned Test Precision = {:.4f}'.format(prec_tuned_evaluator.evaluate(cv_preds)))

Tuned Test Precision = 0.8070


In [60]:
rec_tuned_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_name, 
    predictionCol='prediction', 
    metricName='weightedRecall'
)
print('Tuned Test Recall = {:.4f}'.format(rec_tuned_evaluator.evaluate(cv_preds)))

Tuned Test Recall = 0.8168


In [61]:
f1_tuned_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_name, 
    predictionCol='prediction', 
    metricName='f1'
)
print('Tuned Test F1 = {:.4f}'.format(f1_tuned_evaluator.evaluate(cv_preds)))

Tuned Test F1 = 0.8079


In [62]:
auc_tuned_evaluator = BinaryClassificationEvaluator(
    labelCol=target_name, 
    rawPredictionCol='rawPrediction', 
    metricName='areaUnderROC'
)
print('Test AUC = {:.4f}'.format(auc_tuned_evaluator.evaluate(cv_preds)))

Test AUC = 0.8567
