In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
df = pd.read_csv('BlackFriday.csv')
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [3]:
df.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [4]:
prod_cat = df[['Product_Category_1', 'Product_Category_2', 'Product_Category_3']]
print(df.shape[0])
prod_cat.isnull().sum()

537577


Product_Category_1         0
Product_Category_2    166986
Product_Category_3    373299
dtype: int64

In [5]:
df = df.drop(['Product_Category_3', 'Product_Category_2'], axis=1)

In [8]:
#need dummies for gender, age, city, stay in current city, 
gender = pd.get_dummies(df['Gender'])
age = pd.get_dummies(df['Age'])
occupation = pd.get_dummies(df['Occupation'], prefix='Occupation')
city = pd.get_dummies(df['City_Category'], prefix='City')
stay = pd.get_dummies(df['Stay_In_Current_City_Years'], prefix='Years_Stayed')
prod_cat1 = pd.get_dummies(df['Product_Category_1'], prefix='Primary_category')

In [60]:
X = df['Marital_Status']
Y = df['Product_Category_1'].astype('category')
X = pd.concat([age, gender, occupation, city, stay, X, df['Purchase']], axis=1)
#predict product category bawsed on purchase info and sociological factors

In [14]:
X.head()

Unnamed: 0,0-17,18-25,26-35,36-45,46-50,51-55,55+,F,M,Occupation_0,...,City_A,City_B,City_C,Years_Stayed_0,Years_Stayed_1,Years_Stayed_2,Years_Stayed_3,Years_Stayed_4+,Marital_Status,Purchase
0,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,8370
1,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,15200
2,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,1422
3,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,1057
4,0,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,0,0,1,0,7969


# Neural net

In [71]:
len(X.columns)

40

In [61]:
from sklearn.neural_network import MLPClassifier

neural_classifier = MLPClassifier(hidden_layer_sizes=(50), 
                                 alpha=.01, 
                                 learning_rate='adaptive', 
                                 max_iter=500, 
                                 random_state=68,
                                 tol=.0001,
                                 early_stopping=True)

In [62]:
neural_classifier.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=50, learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=68, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [63]:
cross_val_score(neural_classifier, X, Y, cv=5)

array([0.47520949, 0.48055729, 0.50738037, 0.48533187, 0.49327492])

## Evolution of NN model
__alpha__ = 2, __learning_rate__ = adaptive
* Ver 1: hidden layers size: (10); CV: array([0.5357093 , 0.49383841, 0.47368274, 0.53736327, 0.470951  ])

* Ver 2: hidden layers size: (25); CV: array([0.4113615 , 0.27640182, 0.38413245, 0.37940881, 0.42611713])

* Ver 3: hidden layers size: (50); CV: array([0.44824732, 0.35677682, 0.45937776, 0.45763264, 0.49272612]) - good candidate

* Ver 4: hidden layers size: (75); CV: array([0.38360878, 0.47551641, 0.40499465, 0.48240196, 0.44724114])

* Ver 5: hidden layers size: (100); CV: array([0.46341645, 0.27644832, 0.45683858, 0.27896421, 0.36707036])


hidden layer size=50
* Ver 1 alpha(20) : array([0.49483357, 0.49674947, 0.49259173, 0.28806087, 0.36267999])

* ver2 alpha (50) : array([0.55862576, 0.3728946 , 0.4769288 , 0.5339497 , 0.44159504])

* ver2 alpha (.0001) : array([0.48296612, 0.49038792, 0.43652514, 0.48746186, 0.48042936])

* ver2 alpha (.01) : array([0.47520949, 0.48055729, 0.50738037, 0.48533187, 0.49327492])

# Random Forest comparison

In [64]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3)

In [75]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100, max_depth=6, max_features=20)

In [76]:
rfc.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features=20, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [77]:
cross_val_score(rfc, X, Y, cv=10)

array([0.69435506, 0.69740538, 0.69694208, 0.70524554, 0.70428587,
       0.69419228, 0.699401  , 0.6992912 , 0.70179714, 0.69994419])

## Random forest iterations
__trees__: 100, depth=4
* max_feats=sqrt: [0.48883102, 0.49061657, 0.49354563, 0.4890067 , 0.49118271, 0.49192648, 0.49049408, 0.48784254, 0.4910887 , 0.48898605]

* max_feats=20: [0.58173533, 0.5827397 , 0.59134705, 0.59038318, 0.5780163, 0.62056329, 0.58506957, 0.5777724 , 0.58183882, 0.58055814])

* depth=6: [0.69435506, 0.69740538, 0.69694208, 0.70524554, 0.70428587, 0.69419228, 0.699401  , 0.6992912 ,0.70179714, 0.69994419]