In [1]:
import pyspark.sql.functions as F
from pyspark.sql import types as tp
from pyspark.sql import SparkSession
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor, LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline, Transformer

In [2]:
# Build (or reuse) the Spark session used by every cell below.
# The cluster manager is selected with `.master(...)`, which sets `spark.master`;
# a plain "master" config key is not the key Spark reads for the master URL.
spark = (
    SparkSession.builder
    .master("yarn")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)
spark

In [3]:
# Both files are read with their first row as column names and with
# Spark inferring the column types from the data.
csv_options = {"header": True, "inferSchema": True}
train = spark.read.csv("ml/train.csv", **csv_options)
test = spark.read.csv("ml/test.csv", **csv_options)

In [4]:
# Show the column names and the types Spark inferred for the training CSV.
train.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: integer (nullable = true)



### Stages: Draft
stage0: find na and drop null values 

stage1 : label encode Age 

stage2:  label encode Gender 

stage3: Label encode Stay_In_Current_City_Years 

stage4: Label encode City_Category 

stage5: one-hot encode Gender, City_Category, Occupation 

stage6: vectorize the columns-
Gender_ohe,Age_le,Occupation_ohe,City_Category_le,Stay_In_Current_City_Years_le,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3

stage7: Baseline ML model (DecisionTreeRegressor) 

stage8: Model improvement with GridSearchCV

In [5]:
class removeNulls(Transformer):
    """Pipeline stage that drops rows containing nulls in selected columns.

    Parameters
    ----------
    x : optional
        Kept for backward compatibility. It is stored on ``self.dataset`` and
        is not used by the transformation itself.
    subset : str or list of str, optional
        Column name(s) checked for nulls. When omitted, the columns
        ``Product_Category_2`` and ``Product_Category_3`` are used.
    """

    DEFAULT_SUBSET = ("Product_Category_2", "Product_Category_3")

    def __init__(self, x=None, subset=None):
        # Initialise the base Transformer/Params state (uid, param maps).
        # Without this call the stage cannot be copied, printed or used
        # inside tools such as CrossValidator that copy pipeline stages.
        super().__init__()
        self.dataset = x
        if subset is None:
            self.subset = list(self.DEFAULT_SUBSET)
        elif isinstance(subset, str):
            # A bare string would otherwise be split into single characters.
            self.subset = [subset]
        else:
            self.subset = list(subset)

    def _transform(self, dataset):
        """Return ``dataset`` without rows that have a null in any ``subset`` column."""
        return dataset.dropna(subset=self.subset)

In [6]:
# Columns combined into the model's feature vector, in this order.
# NOTE(review): the OneHotEncoder stage also produces City_Category_ohe, which
# is not listed here — confirm the label-encoded City_Category_le is intended.
selected_features = [
    "Gender_ohe",
    "Age_le",
    "Occupation_ohe",
    "City_Category_le",
    "Stay_In_Current_City_Years_le",
    "Marital_Status",
    "Product_Category_1",
    "Product_Category_2",
    "Product_Category_3",
]

In [7]:
# Stage 1: drop rows with nulls (custom transformer defined above).
stage1 = removeNulls()

# Stages 2-5: index the string-typed categorical columns.
stage2 = StringIndexer(
    inputCol="Age",
    outputCol="Age_le",
)
stage3 = StringIndexer(
    inputCol="Gender",
    outputCol="Gender_le",
)
stage4 = StringIndexer(
    inputCol="Stay_In_Current_City_Years",
    outputCol="Stay_In_Current_City_Years_le",
)
stage5 = StringIndexer(
    inputCol="City_Category",
    outputCol="City_Category_le",
)

# Stage 6: one-hot encode the indexed Gender / City_Category columns and the
# integer Occupation column.
stage6 = OneHotEncoder(
    inputCols=["Gender_le", "City_Category_le", "Occupation"],
    outputCols=["Gender_ohe", "City_Category_ohe", "Occupation_ohe"],
)

# Stage 7: assemble the selected columns into a single feature vector.
stage7 = VectorAssembler(
    inputCols=selected_features,
    outputCol="feature_vector",
)

# Stage 8: regression model predicting Purchase from the feature vector.
stage8 = LinearRegression(
    featuresCol="feature_vector",
    labelCol="Purchase",
)

In [9]:
# Run the eight stages in order: null removal, indexing, one-hot encoding,
# vector assembly and finally the regression model.
pipeline_stages = [stage1, stage2, stage3, stage4, stage5, stage6, stage7, stage8]
pipeline = Pipeline(stages=pipeline_stages)

# Fit every stage on the training data, then apply the fitted pipeline to the
# same training data to obtain the engineered columns and predictions.
pipeline_model = pipeline.fit(train)
final_data = pipeline_model.transform(train)

In [13]:
# Inspect the columns the fitted pipeline added to the training data.
final_data.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: integer (nullable = true)
 |-- Age_le: double (nullable = false)
 |-- Gender_le: double (nullable = false)
 |-- Stay_In_Current_City_Years_le: double (nullable = false)
 |-- City_Category_le: double (nullable = false)
 |-- Gender_ohe: vector (nullable = true)
 |-- City_Category_ohe: vector (nullable = true)
 |-- Occupation_ohe: vector (nullable = true)
 |-- feature_vector: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [15]:
# Apply the pipeline that was fitted on `train` to the test set (no re-fitting).
# Note: the removeNulls stage is part of this pipeline, so test rows with a null
# Product_Category_2 or Product_Category_3 are dropped from test_data as well.
test_data = pipeline_model.transform(test)
test_data.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Age_le: double (nullable = false)
 |-- Gender_le: double (nullable = false)
 |-- Stay_In_Current_City_Years_le: double (nullable = false)
 |-- City_Category_le: double (nullable = false)
 |-- Gender_ohe: vector (nullable = true)
 |-- City_Category_ohe: vector (nullable = true)
 |-- Occupation_ohe: vector (nullable = true)
 |-- feature_vector: vector (nullable = true)
 |-- prediction: double (nullable = false)

