In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

import sklearn.metrics as metrics

import category_encoders as ce

In [2]:
data = pd.read_csv('train_oSwQCTC/train.csv')
data.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


# Data Exploration

In [3]:
data.describe(include='all')

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068,550068,550068,550068.0,550068,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
unique,,3631,2,7,,3,5.0,,,,,
top,,P00265242,M,26-35,,B,1.0,,,,,
freq,,1880,414259,219587,,231173,193821.0,,,,,
mean,1003029.0,,,,8.076707,,,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,,,,6.52266,,,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,,,,0.0,,,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,,,,2.0,,,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,,,,7.0,,,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,,,,14.0,,,1.0,8.0,15.0,16.0,12054.0


In [None]:
sns.distplot(data['Purchase'])

In [None]:
print('Skew of target variable: ', np.round(data['Purchase'].skew(),2))
print('Kurtosis of target variable: ', np.round(data['Purchase'].kurt(),2))

### Univariate Analysis

In [None]:
sns.countplot(data['Occupation'])

In [None]:
sns.countplot(data['Marital_Status'])

In [None]:
sns.countplot(data['Product_Category_1'])

In [None]:
sns.countplot(data['Product_Category_2'])

In [None]:
sns.countplot(data['Product_Category_3'])

In [None]:
sns.countplot(data['Gender'])

In [None]:
sns.countplot(data['Age'])

In [None]:
sns.countplot(data['City_Category'])

In [None]:
sns.countplot(data['Stay_In_Current_City_Years'])

### Bi-variate Analysis

In [None]:
data.groupby('Occupation')['Purchase'].sum().plot(kind='bar')

In [None]:
def purchase_analysis_plots(column, aggr_func):
    plt.figure(figsize=(10,6))
    if aggr_func == 'mean':
        data.groupby(column)['Purchase'].mean().plot(kind='bar')
    elif aggr_func == 'sum':
        data.groupby(column)['Purchase'].sum().plot(kind='bar')
    elif aggr_func == 'count':
        data.groupby(column)['Purchase'].count().plot(kind='bar')
    else:
        print('Error')
    
    
    plt.xlabel(column)
    plt.ylabel('Purchase')
    plt.xticks(rotation=0)

In [None]:
purchase_analysis_plots('Occupation', 'mean')

In [None]:
purchase_analysis_plots('Marital_Status', 'mean')

In [None]:
data.groupby('Stay_In_Current_City_Years')['Stay_In_Current_City_Years'].count().plot(kind='bar')

### Split the dataset into Train and Test

In [68]:
import random
data_index = data.index.tolist()
random.shuffle(data_index)
split = 0.2
test_size = int(round(len(data_index)*split,0))

train_index = data_index[:-test_size]
test_index = data_index[-test_size:]

data_train = data.loc[train_index, :]
data_test = data.loc[test_index, :]

print("Train dataset size: ",data_train.shape)
print("Test dataset size: ",data_test.shape)

Train dataset size:  (440054, 12)
Test dataset size:  (110014, 12)


#### Save the Test dataset

In [69]:
import pickle

with open('test_data.pickle', 'wb') as f:
    pickle.dump(data_test, f)

In [6]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440054 entries, 357406 to 234799
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     440054 non-null  int64  
 1   Product_ID                  440054 non-null  object 
 2   Gender                      440054 non-null  object 
 3   Age                         440054 non-null  object 
 4   Occupation                  440054 non-null  int64  
 5   City_Category               440054 non-null  object 
 6   Stay_In_Current_City_Years  440054 non-null  object 
 7   Marital_Status              440054 non-null  int64  
 8   Product_Category_1          440054 non-null  int64  
 9   Product_Category_2          301258 non-null  float64
 10  Product_Category_3          133617 non-null  float64
 11  Purchase                    440054 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 43.6+ MB


## Pre-process the data

### Product Category Column

In [97]:
'''
P1_dict = {1:'A',  2:'B',  3:'C',  4:'D',  5:'E',  6:'F',  7:'G',  8:'H',  9:'I', 10:'J', 11:'K', 12:'L',13:'M', 14:'N', 15:'O', 16:'P', 17:'Q', 18:'R', 19:'S', 20:'T'}
P2_dict = {2.:'a', 3.:'b', 4.:'c', 5.:'d', 6.:'e', 7.:'f', 8.:'g', 9.:'h', 10.:'i', 11.:'j', 12.:'k', 13.:'l', 14.:'m', 15.:'n', 16.:'o', 17.:'p', 18.:'q', 0.:''}
P3_dict = {3.:'A', 4.:'B', 5.:'C', 6.:'D', 8.:'E', 9.:'F', 10.:'G', 11.:'H', 12.:'I', 13.:'J', 14.:'K', 15.:'L', 16.:'M', 17.:'N', 18.:'O', 0.:''}

def process_product_category(df):
    # Fill the empty cells of Product Category 1 & 2 with 0
    df['Product_Category_2'] = df['Product_Category_2'].fillna(value=0)
    df['Product_Category_3'] = df['Product_Category_3'].fillna(value=0)
    
    # Map the product category with Alphabetical value
    df['Product_Category_1'] = df['Product_Category_1'].map(P1_dict)
    df['Product_Category_2'] = df['Product_Category_2'].map(P2_dict)
    df['Product_Category_3'] = df['Product_Category_3'].map(P3_dict)
    
    # Concatenate the Product column values to make it one value
    df['Product_Category'] = df['Product_Category_1'] + df['Product_Category_2'] + df['Product_Category_3']
    
    df.drop(['Product_Category_1', 'Product_Category_2', 'Product_Category_3'], axis=1, inplace=True)
    
    return df
'''

def process_product_category(df, train_flag):
    df['Product_Category_2'] = df['Product_Category_2'].fillna(-2.0).astype('float')
    df['Product_Category_3'] = df['Product_Category_3'].fillna(-2.0).astype('float')
    
    if (train_flag == 1):
        # Remove product category 19 & 20
        condition = df.index[df['Product_Category_1'].isin([19,20])]
        df = df.drop(condition)
    
    return df

In [None]:
# https://medium.com/diogo-menezes-borges/project-3-analytics-vidhya-hackaton-black-friday-f6c6bf3da86f

### Process Age Col

In [8]:
def process_age(df):
    flag = 2
    if flag == 1:
        # split the 55+, value
        age_col = df['Age'].apply(lambda x: x.replace('+','-0'))
    
        # split the values into start & end age values
        df['Start_Age'] = age_col.apply(lambda x: x.split('-')[0])
        df['End_Age'] = age_col.apply(lambda x: x.split('-')[1])
    
        # change the datatype to int
        df['Start_Age'] = df['Start_Age'].astype('int64')
        df['End_Age'] = df['End_Age'].astype('int64')
    
        # Create a new column with the mean age value
        df['Age_Value'] = (df['Start_Age'] + df['End_Age'])/2
    
        # Replace the begining range with the ending value
        zero_age_index = df[df['Start_Age']==0].index
        df.loc[zero_age_index, 'Age_Value'] = df.loc[zero_age_index, 'End_Age']
        
        # Replace the ending range with the begining value
        end_age_index = df[df['End_Age']==0].index
        df.loc[end_age_index, 'Age_Value'] = df.loc[end_age_index, 'Start_Age']
    
    # OR
    if flag == 0:
        df = pd.get_dummies(df, prefix=['Age'], columns=['Age'], drop_first=True)
    
    if flag == 2:
        age_dict = {'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6}
        df['Age_Value'] = df['Age'].apply(lambda x: age_dict[x])
    
    return df

### Process the gender column

In [9]:
def create_label_encoder_gender(df):
    encoder = LabelEncoder()
    encoder.fit(df['Gender'])
    return encoder

In [10]:
def encode_gender(encoder, df):
    df['Gender'] = encoder.transform(df['Gender'])
    df['Gender'] = df['Gender'].astype('int64')
    return df

### Process the stay_in_current_city column

In [11]:
def process_stay_in_current_city(df):
    flag = 1
    if flag == 0:
        df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].apply(lambda x: x.replace('+',''))
        df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].astype('int64')
    
    if flag == 1:
        df = pd.get_dummies(df, prefix=['Stay_In_Current_City_Years'], columns=['Stay_In_Current_City_Years'],
                            drop_first=False)
    return df

### Encode the product category data

In [None]:
def product_category_encoder(df):
    encoder = ce.BinaryEncoder(cols=['Product_Category'])
    encoder.fit(df['Product_Category'])
    return encoder

In [None]:
def encode_product_category(encoder, df):
    dfbin = encoder.transform(df['Product_Category'])
    df = pd.concat([df, dfbin], axis=1)
    return df

### Encode the Occupation column

In [None]:
def occupation_encoder(df):
    encoder = ce.BinaryEncoder(cols=['Occupation'])
    encoder.fit(df['Occupation'])
    return encoder

In [None]:
def encode_occupation(encoder, df):
    dfbin = encoder.transform(df['Occupation'])
    df = pd.concat([df, dfbin], axis=1)
    return df

### Encode City column

In [12]:
def create_label_encoder_city(df):
    encoder = LabelEncoder()
    encoder.fit(df['City_Category'])
    return encoder

def encode_city(df, encoder):
    flag = 1
    if flag == 0:
        df = pd.get_dummies(df, prefix=['City'], columns=['City_Category'], drop_first=True)
    if flag == 1:
        df['City_Category'] = encoder.transform(df['City_Category'])
        df['City_Category'] = df['City_Category'].astype('int64')
    return df

## Get the frequency count of unique values for each categorical column

In [13]:
def getFrequencyCount(df, column):
    value_count_dict = {}
    # get the frequency count of each unique value of the column
    df_groups = df.groupby(column)
    for value, group in df_groups:
        value_count_dict[value] = group.shape[0]
    
    value_count_list = []
    # create a series in line with the df indexes with the count of values against each value
    for index, row in df.iterrows():
        value = row[column]
        value_count_list.append(value_count_dict.get(value, 0))
    
    return value_count_list

In [17]:
def Create_Count_Columns(df):
    df['Product_ID_Count'] = getFrequencyCount(df, 'Product_ID')
    df['Gender_Count'] = getFrequencyCount(df, 'Gender')
    df['Age_Count'] = getFrequencyCount(df, 'Age')
    df['Occupation_Count'] = getFrequencyCount(df, 'Occupation')
    df['City_Category_Count'] = getFrequencyCount(df, 'City_Category')
    df['Stay_In_Current_City_Years_Count'] = getFrequencyCount(df, 'Stay_In_Current_City_Years')
    df['Marital_Status_Count'] = getFrequencyCount(df, 'Marital_Status')
    df['Product_Category_1_Count'] = getFrequencyCount(df, 'Product_Category_1')
    df['Product_Category_2_Count'] = getFrequencyCount(df, 'Product_Category_2')
    df['Product_Category_3_Count'] = getFrequencyCount(df, 'Product_Category_3')
    return df

### Model Evaluation

#### Running ML Algorithm

In [15]:
def train_predict(train_data, target, algorithm):
    model = algorithm
    model.fit(train_data, target)
    train_prediction = model.predict(train_data)
    return train_prediction

In [16]:
def test_predict(test_data, algorithm):
    model = algorithm
    test_prediction = model.predict(test_data)
    return test_prediction

In [18]:
def model_evaluation(actuals, predictions, algorithm):    
    # Mean absolute error
    mae = round(metrics.mean_absolute_error(actuals, predictions),2)
    print('Mean Absolute Error: {}'.format(mae))
    
    # Mean Squared Error
    mse = round(metrics.mean_squared_error(actuals, predictions),2)
    print('Mean Squared Error: {}'.format(mse))
    
    # Root Mean Square Error
    rmse = round(np.sqrt(mse),2)
    print('Root Mean Squared Error: {}'.format(rmse))
    
    # R2 score
    r2_score = round(metrics.r2_score(actuals, predictions),2)
    print('R2 score: {}'.format(r2_score))
    
    alg = algorithm
    df = pd.DataFrame(data=[{'Algorithm': alg, 'MAE':mae, 'MSE':mse, 'RMSE':rmse, 'R2_score':r2_score}])
        
    return df[['Algorithm', 'MAE', 'MSE', 'RMSE', 'R2_score']]

In [19]:
results = pd.DataFrame(columns=['Algorithm','MAE', 'MSE', 'RMSE', 'R2_score'])

## Preparing the train data

In [71]:
# Process the Product category column
data_train = process_product_category(data_train, 1)
data_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
139976,1003621,P00111142,M,18-25,4,B,3,0,1,15.0,16.0,11888
451399,1003539,P00137642,F,26-35,4,A,1,1,3,4.0,12.0,7915
50723,1001749,P00273742,M,26-35,2,B,1,1,1,17.0,-2.0,11875
161113,1000936,P00307442,M,18-25,4,C,4+,0,5,8.0,-2.0,3727
537190,1004674,P00196542,F,36-45,7,C,1,1,5,8.0,14.0,5172


In [72]:
# Create Frequency Count Columns
data_train = Create_Count_Columns(data_train)
data_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,...,Product_ID_Count,Gender_Count,Age_Count,Occupation_Count,City_Category_Count,Stay_In_Current_City_Years_Count,Marital_Status_Count,Product_Category_1_Count,Product_Category_2_Count,Product_Category_3_Count
139976,1003621,P00111142,M,18-25,4,B,3,0,1,15.0,...,822,329092,79157,57359,184221,75710,257752,112210,30209,26134
451399,1003539,P00137642,F,26-35,4,A,1,1,3,4.0,...,22,107663,174320,57359,117542,153693,179003,16202,20542,7410
50723,1001749,P00273742,M,26-35,2,B,1,1,1,17.0,...,307,329092,174320,21210,184221,153693,179003,112210,10724,303503
161113,1000936,P00307442,M,18-25,4,C,4+,0,5,8.0,...,80,329092,79157,57359,134992,67192,257752,120708,51233,303503
537190,1004674,P00196542,F,36-45,7,C,1,1,5,8.0,...,301,107663,87569,46984,134992,153693,179003,120708,51233,14717


In [73]:
# Encode the Gender column
gender_encoder = create_label_encoder_gender(data_train)
data_train = encode_gender(gender_encoder, data_train)
#data_train.head(10)

In [74]:
# Process the Age column
data_train = process_age(data_train)
data_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,...,Gender_Count,Age_Count,Occupation_Count,City_Category_Count,Stay_In_Current_City_Years_Count,Marital_Status_Count,Product_Category_1_Count,Product_Category_2_Count,Product_Category_3_Count,Age_Value
139976,1003621,P00111142,1,18-25,4,B,3,0,1,15.0,...,329092,79157,57359,184221,75710,257752,112210,30209,26134,1
451399,1003539,P00137642,0,26-35,4,A,1,1,3,4.0,...,107663,174320,57359,117542,153693,179003,16202,20542,7410,2
50723,1001749,P00273742,1,26-35,2,B,1,1,1,17.0,...,329092,174320,21210,184221,153693,179003,112210,10724,303503,2
161113,1000936,P00307442,1,18-25,4,C,4+,0,5,8.0,...,329092,79157,57359,134992,67192,257752,120708,51233,303503,1
537190,1004674,P00196542,0,36-45,7,C,1,1,5,8.0,...,107663,87569,46984,134992,153693,179003,120708,51233,14717,3


In [None]:
# Encode Occupation column
#occupation_encoder = occupation_encoder(data_train)
#data_train = encode_occupation(occupation_encoder, data_train)
#data_train.head()

In [75]:
# Encode City cateogry column
city_encoder = create_label_encoder_city(data_train)
data_train = encode_city(data_train, city_encoder)
data_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,...,Gender_Count,Age_Count,Occupation_Count,City_Category_Count,Stay_In_Current_City_Years_Count,Marital_Status_Count,Product_Category_1_Count,Product_Category_2_Count,Product_Category_3_Count,Age_Value
139976,1003621,P00111142,1,18-25,4,1,3,0,1,15.0,...,329092,79157,57359,184221,75710,257752,112210,30209,26134,1
451399,1003539,P00137642,0,26-35,4,0,1,1,3,4.0,...,107663,174320,57359,117542,153693,179003,16202,20542,7410,2
50723,1001749,P00273742,1,26-35,2,1,1,1,1,17.0,...,329092,174320,21210,184221,153693,179003,112210,10724,303503,2
161113,1000936,P00307442,1,18-25,4,2,4+,0,5,8.0,...,329092,79157,57359,134992,67192,257752,120708,51233,303503,1
537190,1004674,P00196542,0,36-45,7,2,1,1,5,8.0,...,107663,87569,46984,134992,153693,179003,120708,51233,14717,3


In [76]:
# Process Stay_In_Currenty_City_Years
data_train = process_stay_in_current_city(data_train)
data_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,...,Marital_Status_Count,Product_Category_1_Count,Product_Category_2_Count,Product_Category_3_Count,Age_Value,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
139976,1003621,P00111142,1,18-25,4,1,0,1,15.0,16.0,...,257752,112210,30209,26134,1,0,0,0,1,0
451399,1003539,P00137642,0,26-35,4,0,1,3,4.0,12.0,...,179003,16202,20542,7410,2,0,1,0,0,0
50723,1001749,P00273742,1,26-35,2,1,1,1,17.0,-2.0,...,179003,112210,10724,303503,2,0,1,0,0,0
161113,1000936,P00307442,1,18-25,4,2,0,5,8.0,-2.0,...,257752,120708,51233,303503,1,0,0,0,0,1
537190,1004674,P00196542,0,36-45,7,2,1,5,8.0,14.0,...,179003,120708,51233,14717,3,0,1,0,0,0


In [None]:
# Encode the product category column
#pc_encoder = product_category_encoder(data_train)
#data_train = encode_product_category(pc_encoder, data_train)
#data_train.head()

In [26]:
data_train.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Marital_Status', 'Product_Category_1', 'Product_Category_2',
       'Product_Category_3', 'Purchase', 'Product_ID_Count', 'Gender_Count',
       'Age_Count', 'Occupation_Count', 'City_Category_Count',
       'Stay_In_Current_City_Years_Count', 'Marital_Status_Count',
       'Product_Category_1_Count', 'Product_Category_2_Count',
       'Product_Category_3_Count', 'Age_Value', 'Stay_In_Current_City_Years_0',
       'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2',
       'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4+'],
      dtype='object')

In [28]:

columns = ['Gender', 'Age_Value', 'Occupation', 'City_Category', 'Marital_Status', 'Product_Category_1',
           'Product_Category_2', 'Product_Category_3', 'Purchase','Stay_In_Current_City_Years_0',
           'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2','Stay_In_Current_City_Years_3',
           'Stay_In_Current_City_Years_4+', 'Age_Count', 'Occupation_Count','Product_Category_1_Count',
           'Product_Category_2_Count', 'Product_Category_3_Count','Product_ID_Count']
'''
columns = ['Gender', 'Age_18-25','Age_26-35', 'Age_36-45', 'Age_46-50', 'Age_51-55', 'Age_55+',
           'Stay_In_Current_City_Years', 'Marital_Status',
           'Product_Category_0', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3',
           'Product_Category_4', 'Product_Category_5', 'Product_Category_6', 'Product_Category_7',
           'Product_Category_8', 'Occupation_0', 'Occupation_1', 'Occupation_2', 'Occupation_3',
           'Occupation_4', 'Occupation_5', 'City_B', 'City_C', 'Purchase']
'''

"\ncolumns = ['Gender', 'Age_18-25','Age_26-35', 'Age_36-45', 'Age_46-50', 'Age_51-55', 'Age_55+',\n           'Stay_In_Current_City_Years', 'Marital_Status',\n           'Product_Category_0', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3',\n           'Product_Category_4', 'Product_Category_5', 'Product_Category_6', 'Product_Category_7',\n           'Product_Category_8', 'Occupation_0', 'Occupation_1', 'Occupation_2', 'Occupation_3',\n           'Occupation_4', 'Occupation_5', 'City_B', 'City_C', 'Purchase']\n"

In [77]:
data_train = data_train[columns]

In [78]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 436755 entries, 139976 to 537345
Data columns (total 20 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Gender                         436755 non-null  int64  
 1   Age_Value                      436755 non-null  int64  
 2   Occupation                     436755 non-null  int64  
 3   City_Category                  436755 non-null  int64  
 4   Marital_Status                 436755 non-null  int64  
 5   Product_Category_1             436755 non-null  int64  
 6   Product_Category_2             436755 non-null  float64
 7   Product_Category_3             436755 non-null  float64
 8   Purchase                       436755 non-null  int64  
 9   Stay_In_Current_City_Years_0   436755 non-null  uint8  
 10  Stay_In_Current_City_Years_1   436755 non-null  uint8  
 11  Stay_In_Current_City_Years_2   436755 non-null  uint8  
 12  Stay_In_Current_City_Year

In [31]:
data_train.isnull().sum().any()

False

### Feature Scaling

In [79]:
# Create the scaler object
def scaler_object(df, num_columns):
    df1 = df.loc[:, num_columns].values
    sc = StandardScaler()
    sc.fit(df1)
    return sc

# Scale the data
def scale_data(scaler, df, cat_columns, num_columns):
    df1 = df.loc[:, cat_columns].values
    df2 = df.loc[:, num_columns].values
    scaler.transform(df2)
    df_scaled = np.concatenate((df1, df2), axis=1)
    return df_scaled

#### Scale the train dataset

In [80]:
cat_columns = []

num_columns = ['Gender', 'Age_Value', 'Occupation', 'City_Category', 'Marital_Status', 'Product_Category_1',
           'Product_Category_2', 'Product_Category_3','Stay_In_Current_City_Years_0',
           'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2','Stay_In_Current_City_Years_3',
           'Stay_In_Current_City_Years_4+', 'Age_Count', 'Occupation_Count','Product_Category_1_Count',
           'Product_Category_2_Count', 'Product_Category_3_Count','Product_ID_Count']

scaler = scaler_object(data_train, num_columns)
data_train_scaled = scale_data(scaler, data_train, cat_columns, num_columns)
print(data_train_scaled.shape)

(436755, 19)


In [None]:
data_train_scaled

# Prepare dataset for ML algorithm

### Prepare the train dataset

In [81]:
X_train = data_train.loc[:, data_train.columns!='Purchase']
y_train = data_train.loc[:, data_train.columns=='Purchase']
X_train_scaled = data_train_scaled
y_train_scaled = data_train.loc[:, data_train.columns=='Purchase'].values
print(X_train.shape)
print(y_train.shape)
print(X_train_scaled.shape)
print(y_train_scaled.shape)

(436755, 19)
(436755, 1)
(436755, 19)
(436755, 1)


### Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression

In [None]:
X = X_train_scaled
y = y_train_scaled.ravel()

test = SelectKBest(score_func=f_regression, k=10)
fit = test.fit(X, y)

In [None]:
# Summarizing the scores
np.set_printoptions(precision=3)
print(fit.scores_)

In [None]:
# Creating list of feature names
feat_names = data_train.columns[data_train.columns!='Purchase']
feat_names

In [None]:
# Sorting the sores by descending order
for score, name in sorted(zip(fit.scores_, feat_names), reverse=True):
    print('Feature Score of', name, ':', round(score,2))

In [66]:
data_train.to_csv('mod_train.csv')

#### Re-work the columns to be included for ML

In [None]:
columns = ['Gender', 'Marital_Status', 'Age_18-25','Age_26-35', 'Age_36-45', 'Age_46-50', 'Age_51-55', 'Age_55+',
           'Product_Category_0', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3',
           'Product_Category_4', 'Product_Category_5', 'Product_Category_6', 'Product_Category_7',
           'Product_Category_8', 'Occupation_0', 'Occupation_1', 'Occupation_2', 'Occupation_3',
           'Occupation_4', 'Occupation_5', 'City_B', 'City_C', 'Purchase']
data_train = data_train[columns]

cat_columns = ['Gender','Age_18-25','Age_26-35', 'Age_36-45', 'Age_46-50', 'Age_51-55', 'Age_55+',
               'Marital_Status', 'Product_Category_0', 'Product_Category_1', 'Product_Category_2', 
               'Product_Category_3','Product_Category_4', 'Product_Category_5', 'Product_Category_6',
               'Product_Category_7', 'Product_Category_8', 'Occupation_0', 'Occupation_1', 'Occupation_2', 
               'Occupation_3', 'Occupation_4', 'Occupation_5', 'City_B', 'City_C']
num_columns = []

scaler = scaler_object(data_train, cat_columns)
data_train_scaled = scale_data(scaler, data_train, cat_columns, num_columns)
print(data_train_scaled.shape)

X_train = data_train.loc[:, data_train.columns!='Purchase']
y_train = data_train.loc[:, data_train.columns=='Purchase']
X_train_scaled = data_train_scaled
y_train_scaled = data_train.loc[:, data_train.columns=='Purchase'].values
print(X_train.shape)
print(y_train.shape)
print(X_train_scaled.shape)
print(y_train_scaled.shape)

# ML Algorithm

In [82]:
linreg = LinearRegression()
y_pred_train = train_predict(X_train_scaled, y_train_scaled, linreg)
results_linreg = model_evaluation(y_train_scaled, y_pred_train, 'Linear Regression Train')
results = results.append(results_linreg, ignore_index=True)
results

Mean Absolute Error: 3289.85
Mean Squared Error: 18829831.88
Root Mean Squared Error: 4339.34
R2 score: 0.24


Unnamed: 0,Algorithm,MAE,MSE,RMSE,R2_score
0,Linear Regression Train,3288.6,18948466.11,4352.98,0.25
1,Dtree Regression Train,318.99,897932.44,947.59,0.96
2,Linear Regression Test,3398.28,20729899.88,4553.01,0.17
3,Dtree Regression Test,129.71,339708.94,582.85,0.99
4,Dtree Regression Train,1993.15,7187414.49,2680.94,0.72
5,Linear Regression Test,3398.28,20729899.88,4553.01,0.17
6,Dtree Regression Test,2013.19,7318857.42,2705.34,0.71
7,Linear Regression Train,3289.85,18829831.88,4339.34,0.24


In [83]:
dtree = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100)
y_pred_train = train_predict(X_train, y_train, dtree)
results_dtree = model_evaluation(y_train, y_pred_train, 'Dtree Regression Train')
results = results.append(results_dtree, ignore_index=True)
results

Mean Absolute Error: 2008.64
Mean Squared Error: 7280162.74
Root Mean Squared Error: 2698.18
R2 score: 0.71


Unnamed: 0,Algorithm,MAE,MSE,RMSE,R2_score
0,Linear Regression Train,3288.6,18948466.11,4352.98,0.25
1,Dtree Regression Train,318.99,897932.44,947.59,0.96
2,Linear Regression Test,3398.28,20729899.88,4553.01,0.17
3,Dtree Regression Test,129.71,339708.94,582.85,0.99
4,Dtree Regression Train,1993.15,7187414.49,2680.94,0.72
5,Linear Regression Test,3398.28,20729899.88,4553.01,0.17
6,Dtree Regression Test,2013.19,7318857.42,2705.34,0.71
7,Linear Regression Train,3289.85,18829831.88,4339.34,0.24
8,Dtree Regression Train,2008.64,7280162.74,2698.18,0.71


# Prepare the test data

In [98]:
with open('test_data.pickle', 'rb') as f:
    data_test = pickle.load(f)

print("Test dataset size: ",data_test.shape)

Test dataset size:  (110014, 12)


## Prepare the test data

In [99]:
# Product Catgory
data_test = process_product_category(data_test, 0)
#data_test = encode_product_category(pc_encoder, data_test)

# Frequency count
data_test = Create_Count_Columns(data_test)

# Gender Column
data_test = encode_gender(gender_encoder, data_test)

# Age column
data_test = process_age(data_test)

# Occupation column
#data_test = encode_occupation(occupation_encoder, data_test)

# City Column
city_encoder = create_label_encoder_city(data_test)
data_test = encode_city(data_test, city_encoder)

# Stay_In_Current_City_Years
data_test = process_stay_in_current_city(data_test)

data_test = data_test[columns]
data_test.head()

Unnamed: 0,Gender,Age_Value,Occupation,City_Category,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+,Age_Count,Occupation_Count,Product_Category_1_Count,Product_Category_2_Count,Product_Category_3_Count,Product_ID_Count
478650,1,1,15,2,0,1,11.0,15.0,11444,0,0,1,0,0,19895,2383,28168,2815,5617,228
177786,1,1,17,0,0,1,5.0,18.0,15870,0,0,1,0,0,19895,8086,28168,5210,934,85
424024,1,2,20,1,0,5,8.0,14.0,5145,0,0,1,0,0,44120,6577,30225,12855,3711,108
77196,1,2,20,0,0,5,-2.0,-2.0,6891,0,0,1,0,0,44120,6577,30225,34635,76445,37
431394,0,4,1,1,1,5,-2.0,-2.0,1741,1,0,0,0,0,9187,9348,30225,34635,76445,163


#### Scale the test dataset

In [100]:
data_test_scaled = scale_data(scaler, data_test, cat_columns, num_columns)
print(data_test_scaled.shape)

(110014, 19)


## Prepare the test dataset for prediction

In [101]:
X_test = data_test.loc[:, data_test.columns!='Purchase']
y_test = data_test.loc[:, data_test.columns=='Purchase']
X_test_scaled = data_test_scaled
y_test_scaled = data_test.loc[:, data_test.columns=='Purchase'].values
print(X_test.shape)
print(y_test.shape)
print(X_test_scaled.shape)
print(y_test_scaled.shape)

(110014, 19)
(110014, 1)
(110014, 19)
(110014, 1)


### Predict for test data

In [102]:
# Linear Regression
y_pred_test = test_predict(X_test_scaled, linreg)
results_linreg = model_evaluation(y_test_scaled, y_pred_test, 'Linear Regression Test')
results = results.append(results_linreg, ignore_index=True)
results

Mean Absolute Error: 3407.21
Mean Squared Error: 20573908.59
Root Mean Squared Error: 4535.85
R2 score: 0.18


Unnamed: 0,Algorithm,MAE,MSE,RMSE,R2_score
0,Linear Regression Train,3288.6,18948466.11,4352.98,0.25
1,Dtree Regression Train,318.99,897932.44,947.59,0.96
2,Linear Regression Test,3398.28,20729899.88,4553.01,0.17
3,Dtree Regression Test,129.71,339708.94,582.85,0.99
4,Dtree Regression Train,1993.15,7187414.49,2680.94,0.72
5,Linear Regression Test,3398.28,20729899.88,4553.01,0.17
6,Dtree Regression Test,2013.19,7318857.42,2705.34,0.71
7,Linear Regression Train,3289.85,18829831.88,4339.34,0.24
8,Dtree Regression Train,2008.64,7280162.74,2698.18,0.71
9,Linear Regression Test,3416.51,20697971.67,4549.5,0.17


In [103]:
# Decision Tree regression
y_pred_test = train_predict(X_test, y_test, dtree)
results_dtree = model_evaluation(y_test, y_pred_test, 'Dtree Regression Test')
results = results.append(results_dtree, ignore_index=True)
results

Mean Absolute Error: 2017.58
Mean Squared Error: 7390354.86
Root Mean Squared Error: 2718.52
R2 score: 0.71


Unnamed: 0,Algorithm,MAE,MSE,RMSE,R2_score
0,Linear Regression Train,3288.6,18948466.11,4352.98,0.25
1,Dtree Regression Train,318.99,897932.44,947.59,0.96
2,Linear Regression Test,3398.28,20729899.88,4553.01,0.17
3,Dtree Regression Test,129.71,339708.94,582.85,0.99
4,Dtree Regression Train,1993.15,7187414.49,2680.94,0.72
5,Linear Regression Test,3398.28,20729899.88,4553.01,0.17
6,Dtree Regression Test,2013.19,7318857.42,2705.34,0.71
7,Linear Regression Train,3289.85,18829831.88,4339.34,0.24
8,Dtree Regression Train,2008.64,7280162.74,2698.18,0.71
9,Linear Regression Test,3416.51,20697971.67,4549.5,0.17


# Predicting the Actual Test data

In [111]:
df_actual_test = pd.read_csv('test_HujdGe7/test.csv')
df_actual_test.shape

(233599, 11)

### Prepare the actual test data

In [105]:
df_test = df_actual_test.copy()

# Product Catgory
df_test = process_product_category(df_test, 0)
#df_test = encode_product_category(pc_encoder, df_test)


# Frequency count
df_test = Create_Count_Columns(df_test)

# Gender Column
df_test = encode_gender(gender_encoder, df_test)

# Age column
df_test = process_age(df_test)

# Occupation column
#df_test = encode_occupation(occupation_encoder, df_test)

# City Column
city_encoder = create_label_encoder_city(df_test)
df_test = encode_city(df_test, city_encoder)
#df_test = encode_city(df_test)

# Stay_In_Current_City_Years
df_test = process_stay_in_current_city(df_test)

# Product Catgory
#df_test = process_product_category(df_test)
#df_test = encode_product_category(pc_encoder, df_test)

df_columns = columns.copy()
df_columns.remove('Purchase')

df_test = df_test[df_columns]
df_test.head()

Unnamed: 0,Gender,Age_Value,Occupation,City_Category,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+,Age_Count,Occupation_Count,Product_Category_1_Count,Product_Category_2_Count,Product_Category_3_Count,Product_ID_Count
0,1,4,7,1,1,1,11.0,-2.0,0,0,1,0,0,19577,24994,60321,6096,162562,397
1,1,2,17,2,0,3,5.0,-2.0,1,0,0,0,0,93428,17375,8578,10930,162562,117
2,0,3,1,1,1,5,14.0,-2.0,0,0,0,0,1,46711,20261,65017,23726,162562,75
3,0,3,1,1,1,4,9.0,-2.0,0,0,0,0,1,46711,20261,5003,2484,162562,8
4,0,2,1,2,0,4,5.0,12.0,0,1,0,0,0,93428,20261,5003,10930,3869,214


In [106]:
df_test.shape

(233599, 19)

#### Scale the dataset

In [107]:
df_test_scaled = scale_data(scaler, df_test, cat_columns, num_columns)
print(df_test_scaled.shape)

(233599, 19)


### Prepare the prediction for upload

In [108]:
y_pred_act_test = test_predict(df_test_scaled, linreg)
y_pred_act_test = y_pred_act_test.reshape(-1,1)
prediction_arr = np.concatenate((y_pred_act_test, df_actual_test[['User_ID', 'Product_ID']].values), axis=1)
predictions_linreg = pd.DataFrame(prediction_arr, columns=['Purchase', 'User_ID', 'Product_ID'])

In [109]:
y_pred_act_test = test_predict(df_test, dtree)
y_pred_act_test = y_pred_act_test.reshape(-1,1)
prediction_arr = np.concatenate((y_pred_act_test, df_actual_test[['User_ID', 'Product_ID']].values), axis=1)
predictions_dtree = pd.DataFrame(prediction_arr, columns=['Purchase', 'User_ID', 'Product_ID'])

In [110]:
predictions_dtree.to_csv('Predictions.csv', index=False)