# Black Friday Sales Prediction

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

### Import the Data

In [2]:
train = pd.read_csv('/DATA SETS/Black Friday/train.csv')
test = pd.read_csv('/DATA SETS/Black Friday/test.csv')

train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


### Taking an Overview of the data 

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [4]:
train.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [5]:
#Checking for unique values

print('The number of unique Users are:',train['User_ID'].nunique())
print('The number of unique Products are:',train['Product_ID'].nunique())

The number of unique Users are: 5891
The number of unique Products are: 3631


### Create a copy of the data to work on.

In [6]:
#For Train Data
data = train.copy()
data = data.drop(columns = ['Marital_Status'])

#For Test Data
data2 = test.copy()
data2 = data2.drop(columns = ['Marital_Status'])


### Dealing with the Null values in our data.

In [7]:
#For Train Data

# First do One Hot encoding for Product Category 1
df_oneHot = pd.get_dummies(data, columns = ['Product_Category_1'], prefix = ['P'])
#Fill NaN values with Zeros
df_oneHot = df_oneHot.fillna(0)
for i in range(1, 15):
    df_oneHot.loc[df_oneHot.Product_Category_2 == i,'P_'+ str(i)]= 1
    df_oneHot.loc[df_oneHot.Product_Category_3 == i,'P_'+ str(i)]= 1

#For Test Data    

# First do One Hot encoding for Product Category 1
df2_oneHot = pd.get_dummies(data2, columns = ['Product_Category_1'], prefix = ['P'])
#Fill NaN values with Zeros
df2_oneHot = df2_oneHot.fillna(0)
for i in range(1, 15):
    df2_oneHot.loc[df2_oneHot.Product_Category_2 == i,'P_'+ str(i)]= 1
    df2_oneHot.loc[df2_oneHot.Product_Category_3 == i,'P_'+ str(i)]= 1


### Two Columns missing in test data, so we add it manually. 

In [8]:
df2_oneHot['P_19'] = np.zeros([233599,1])
df2_oneHot['P_20'] = np.zeros([233599,1])

### Drop unnecessary columns

In [9]:
#For train data
df_oneHot = df_oneHot.drop(columns = ['Product_Category_2', 'Product_Category_3'])

#For test data
df2_oneHot = df2_oneHot.drop(columns = ['Product_Category_2', 'Product_Category_3'])


### Making Product ID usable for the model

In [10]:
#For train Data
df_oneHot['Product_ID'] = df_oneHot['Product_ID'].str.replace('P00', '')

#For Test data
df2_oneHot['Product_ID'] = df2_oneHot['Product_ID'].str.replace('P00', '')

### Separarting Target and Features

In [11]:
target = data.Purchase
df_oneHot = df_oneHot.drop(columns = ['Purchase'])

### Encode Gender Column 

In [12]:
#For train Data
df_oneHot = df_oneHot.replace({'Gender': {'M': 1, 'F':0}})

#For test Data
df2_oneHot = df2_oneHot.replace({'Gender': {'M': 1, 'F':0}})

### Encoding rest of the categorical variables

In [13]:
#For train Data
data_df_onehot = pd.get_dummies(df_oneHot, columns=['Age',"Occupation", 
                                               'City_Category','Stay_In_Current_City_Years'], 
                                 prefix = ['Age',"Occupation", 
                                               'City','Stay'])


#For test Data
data2_df_onehot = pd.get_dummies(df2_oneHot, columns=['Age',"Occupation", 
                                               'City_Category','Stay_In_Current_City_Years'], 
                                 prefix = ['Age',"Occupation", 
                                               'City','Stay'])


### Scaling Data for effective model  building

In [14]:
scaler = StandardScaler()

#For Train Data
data_df_onehot['Product_ID'] = scaler.fit_transform(data_df_onehot['Product_ID'].values.reshape(-1, 1))
data_df_onehot['User_ID'] = scaler.transform(data_df_onehot['User_ID'].values.reshape(-1, 1))

#For Test Data
data2_df_onehot['Product_ID'] = scaler.transform(data2_df_onehot['Product_ID'].values.reshape(-1, 1))
data2_df_onehot['User_ID'] = scaler.transform(data2_df_onehot['User_ID'].values.reshape(-1, 1))


### Splitting Data into test and train dataset

In [15]:
data2_df_onehot.shape
train_data, test_data, train_labels, test_labels = train_test_split(data_df_onehot, target, test_size=0.2, random_state=42)


### Converting Data into DMatrix for XGBoost 

In [16]:
#For Train Data
dtrain = xgb.DMatrix(train_data, label=train_labels)
dtest = xgb.DMatrix(test_data, label=test_labels)

#For Test Data
dpred = xgb.DMatrix(data2_df_onehot)

### Initializing the basic parameter dictionary

In [17]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}


params['eval_metric'] = "rmse"
num_boost_round = 999

### Basic Model 

In [18]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
    
)

[0]	Test-rmse:7753.65625
Will train until Test-rmse hasn't improved in 10 rounds.
[1]	Test-rmse:5930.35205
[2]	Test-rmse:4769.00244
[3]	Test-rmse:4027.15063
[4]	Test-rmse:3586.85400
[5]	Test-rmse:3348.09839
[6]	Test-rmse:3185.80688
[7]	Test-rmse:3085.66431
[8]	Test-rmse:3035.15942
[9]	Test-rmse:3002.64429
[10]	Test-rmse:2969.64380
[11]	Test-rmse:2954.43018
[12]	Test-rmse:2940.31592
[13]	Test-rmse:2928.56982
[14]	Test-rmse:2923.85156
[15]	Test-rmse:2912.66113
[16]	Test-rmse:2908.83594
[17]	Test-rmse:2900.41846
[18]	Test-rmse:2896.76709
[19]	Test-rmse:2874.67773
[20]	Test-rmse:2870.76929
[21]	Test-rmse:2866.29443
[22]	Test-rmse:2861.17773
[23]	Test-rmse:2860.06372
[24]	Test-rmse:2851.43481
[25]	Test-rmse:2850.10791
[26]	Test-rmse:2842.15869
[27]	Test-rmse:2836.49390
[28]	Test-rmse:2832.87061
[29]	Test-rmse:2831.68188
[30]	Test-rmse:2825.16406
[31]	Test-rmse:2821.50781
[32]	Test-rmse:2818.22144
[33]	Test-rmse:2813.50757
[34]	Test-rmse:2809.32959
[35]	Test-rmse:2808.22412
[36]	Test-rmse:28

[306]	Test-rmse:2559.48071
[307]	Test-rmse:2559.15845
[308]	Test-rmse:2559.02417
[309]	Test-rmse:2558.67456
[310]	Test-rmse:2557.94775
[311]	Test-rmse:2557.97192
[312]	Test-rmse:2557.73975
[313]	Test-rmse:2557.64819
[314]	Test-rmse:2557.68970
[315]	Test-rmse:2557.48755
[316]	Test-rmse:2557.52270
[317]	Test-rmse:2557.03369
[318]	Test-rmse:2556.60034
[319]	Test-rmse:2556.49951
[320]	Test-rmse:2556.16455
[321]	Test-rmse:2555.59863
[322]	Test-rmse:2555.44800
[323]	Test-rmse:2555.47778
[324]	Test-rmse:2554.99536
[325]	Test-rmse:2554.66772
[326]	Test-rmse:2554.58569
[327]	Test-rmse:2554.47852
[328]	Test-rmse:2554.38208
[329]	Test-rmse:2554.16553
[330]	Test-rmse:2554.01733
[331]	Test-rmse:2553.97876
[332]	Test-rmse:2553.73218
[333]	Test-rmse:2553.72876
[334]	Test-rmse:2553.51392
[335]	Test-rmse:2553.28442
[336]	Test-rmse:2552.99512
[337]	Test-rmse:2552.59375
[338]	Test-rmse:2552.42895
[339]	Test-rmse:2552.25830
[340]	Test-rmse:2552.02319
[341]	Test-rmse:2551.92651
[342]	Test-rmse:2551.81445
[

[610]	Test-rmse:2519.82324
[611]	Test-rmse:2519.79248
[612]	Test-rmse:2519.76343
[613]	Test-rmse:2519.65869
[614]	Test-rmse:2519.67969
[615]	Test-rmse:2519.63769
[616]	Test-rmse:2519.68408
[617]	Test-rmse:2519.59790
[618]	Test-rmse:2519.55786
[619]	Test-rmse:2519.51343
[620]	Test-rmse:2519.41382
[621]	Test-rmse:2519.51001
[622]	Test-rmse:2519.31274
[623]	Test-rmse:2519.27832
[624]	Test-rmse:2519.19580
[625]	Test-rmse:2519.07837
[626]	Test-rmse:2518.85132
[627]	Test-rmse:2518.57666
[628]	Test-rmse:2518.52319
[629]	Test-rmse:2518.55786
[630]	Test-rmse:2518.61035
[631]	Test-rmse:2518.42627
[632]	Test-rmse:2518.31812
[633]	Test-rmse:2518.18140
[634]	Test-rmse:2518.09668
[635]	Test-rmse:2518.03369
[636]	Test-rmse:2517.95850
[637]	Test-rmse:2517.93677
[638]	Test-rmse:2517.86108
[639]	Test-rmse:2517.57690
[640]	Test-rmse:2517.49805
[641]	Test-rmse:2517.46875
[642]	Test-rmse:2517.44141
[643]	Test-rmse:2517.33276
[644]	Test-rmse:2517.33325
[645]	Test-rmse:2517.30566
[646]	Test-rmse:2517.26758
[

## Hyperparameter Tuning

### 1. Maximum Depth and Minimum Child Weight 

In [19]:
#Takes a lot of time


#Select a range of values for different parameters
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8) #TRY GREATER VALUES > 60
]
#Initialize minimum rmse and the best parameters
min_rmse  = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=5,
        verbose_eval = True
    )
    # Update best RMSE
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)

In [20]:
params['max_depth'] = 9
params['min_child_weight'] = 7

### 2. Subsample and Colsample_bytree 

In [21]:
#Takes a lot of time


#Select a range of values for different parameters
gridsearch_params = [(subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]
#Initialize minimum rmse and the best parameters
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)

In [22]:
params['subsample'] = 1
params['colsample_bytree'] = 0.7

### 3. ETA (Learning Rate) 

In [23]:
#Takes a lot of time


min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['rmse'],
            early_stopping_rounds=10
          )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}".format(best_params))

### Finalized Parameter Dictionary

In [25]:
params = {'colsample_bytree': 0.7,
 'eta': 0.2,
 'eval_metric': 'rmse',
 'max_depth': 9,
 'min_child_weight': 66,
 'objective': 'reg:squarederror',
 'subsample': 1}

### Training Tuned Model

In [26]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:8656.07715
Will train until Test-rmse hasn't improved in 10 rounds.
[1]	Test-rmse:7285.44141
[2]	Test-rmse:6153.22754
[3]	Test-rmse:5282.83594
[4]	Test-rmse:4647.38623
[5]	Test-rmse:4171.44238
[6]	Test-rmse:3781.28027
[7]	Test-rmse:3524.07202
[8]	Test-rmse:3331.38403
[9]	Test-rmse:3211.36353
[10]	Test-rmse:3126.53491
[11]	Test-rmse:3042.86768
[12]	Test-rmse:2976.89160
[13]	Test-rmse:2935.31445
[14]	Test-rmse:2901.69336
[15]	Test-rmse:2882.50024
[16]	Test-rmse:2871.74438
[17]	Test-rmse:2853.92310
[18]	Test-rmse:2847.78809
[19]	Test-rmse:2831.66284
[20]	Test-rmse:2820.27270
[21]	Test-rmse:2806.54492
[22]	Test-rmse:2801.78369
[23]	Test-rmse:2797.25732
[24]	Test-rmse:2792.64014
[25]	Test-rmse:2786.22461
[26]	Test-rmse:2782.07251
[27]	Test-rmse:2780.16797
[28]	Test-rmse:2777.12598
[29]	Test-rmse:2770.90771
[30]	Test-rmse:2764.07666
[31]	Test-rmse:2758.95459
[32]	Test-rmse:2756.15869
[33]	Test-rmse:2752.81396
[34]	Test-rmse:2751.79272
[35]	Test-rmse:2749.02319
[36]	Test-rmse:27

[306]	Test-rmse:2525.01465
[307]	Test-rmse:2524.95337
[308]	Test-rmse:2524.62354
[309]	Test-rmse:2524.27734
[310]	Test-rmse:2523.99536
[311]	Test-rmse:2523.95264
[312]	Test-rmse:2523.97900
[313]	Test-rmse:2523.82324
[314]	Test-rmse:2523.56128
[315]	Test-rmse:2523.49268
[316]	Test-rmse:2523.16455
[317]	Test-rmse:2522.76099
[318]	Test-rmse:2522.65894
[319]	Test-rmse:2522.54590
[320]	Test-rmse:2522.55078
[321]	Test-rmse:2522.37451
[322]	Test-rmse:2522.27270
[323]	Test-rmse:2522.10254
[324]	Test-rmse:2522.05103
[325]	Test-rmse:2521.96387
[326]	Test-rmse:2521.80444
[327]	Test-rmse:2521.78589
[328]	Test-rmse:2521.52783
[329]	Test-rmse:2521.37305
[330]	Test-rmse:2521.34644
[331]	Test-rmse:2521.35498
[332]	Test-rmse:2521.26269
[333]	Test-rmse:2521.28247
[334]	Test-rmse:2521.19849
[335]	Test-rmse:2520.97290
[336]	Test-rmse:2520.91260
[337]	Test-rmse:2520.85254
[338]	Test-rmse:2520.80249
[339]	Test-rmse:2520.81763
[340]	Test-rmse:2520.79565
[341]	Test-rmse:2520.73926
[342]	Test-rmse:2520.72852
[

### Predicting Values for Test data 

In [27]:
y_pred = model.predict(dpred)

### Preparing Data for Submission

In [28]:
submission = pd.DataFrame(y_pred, columns = ['Purchase'])
submission['User_ID'] = test['User_ID']
submission['Product_ID'] = test['Product_ID']
submission.head()

submission.to_csv('XGboost_submission.csv') 


Unnamed: 0,Purchase,User_ID,Product_ID
0,14928.404297,1000004,P00128942
1,10792.964844,1000009,P00113442
2,7175.628418,1000010,P00288442
3,4489.056641,1000010,P00145342
4,2342.98999,1000011,P00053842
