# Problem Statement
A retail company “ABC Private Limited” wants to understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared purchase summary of various customers for selected high volume products from last month.
The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.

Now, they want to build a model to predict the purchase amount of customer against various products which will help them to create personalized offer for customers against different products.



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
#train data
train=pd.read_csv('/content/blackFriday_train.csv')

In [5]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [6]:
#test data
test=pd.read_csv('/content/blackFriday_test.csv')

In [7]:
test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [8]:
train.shape

(550068, 12)

In [9]:
test.shape

(233599, 11)

In [10]:
train.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [12]:
#dropping unnecessory columns
train=train.drop('User_ID',axis=1)

In [16]:
test=test.drop('User_ID',axis=1)

In [13]:
train.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,F,0-17,10,A,2,0,3,,,8370
1,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,P00087842,F,0-17,10,A,2,0,12,,,1422
3,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,P00285442,M,55+,16,C,4+,0,8,,,7969


In [14]:
# converting categorical data to numerical data
train['Gender']=train["Gender"].map({'F':0,'M':1})

In [15]:
test['Gender']=test['Gender'].map({'F':0,'M':1})

In [17]:
train['Gender'].head()

0    0
1    0
2    0
3    0
4    1
Name: Gender, dtype: int64

In [18]:
test['Gender'].head()

0    1
1    1
2    0
3    0
4    0
Name: Gender, dtype: int64

In [19]:
#unique values to the range of age
train.Age.unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [21]:
test.Age.unique()

array(['46-50', '26-35', '36-45', '18-25', '51-55', '55+', '0-17'],
      dtype=object)

In [22]:
#mapping the unique values to the age
train['Age']=train['Age'].map({'0-17':0, '55+':6, '26-35':2, '46-50':4, '51-55':5, '36-45':3, '18-25':1})

In [23]:
test['Age']=test['Age'].map({'0-17':0, '55+':6, '26-35':2, '46-50':4, '51-55':5, '36-45':3, '18-25':1})

In [25]:
train.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,0,0,10,A,2,0,3,,,8370
1,P00248942,0,0,10,A,2,0,1,6.0,14.0,15200
2,P00087842,0,0,10,A,2,0,12,,,1422
3,P00085442,0,0,10,A,2,0,12,14.0,,1057
4,P00285442,1,6,16,C,4+,0,8,,,7969


In [26]:
#conversion of city category
train.City_Category.unique()

array(['A', 'C', 'B'], dtype=object)

In [31]:
#one hot encoding
city_train=pd.get_dummies(train['City_Category'],drop_first=True)

In [32]:
city_test=pd.get_dummies(test['City_Category'],drop_first=True)

In [39]:
train=pd.concat([train,city_train],axis=1)

In [37]:
train=train.drop(['C'],axis=1)

In [42]:
train.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,B,C
0,P00069042,0,0,10,2,0,3,,,8370,0,0
1,P00248942,0,0,10,2,0,1,6.0,14.0,15200,0,0
2,P00087842,0,0,10,2,0,12,,,1422,0,0
3,P00085442,0,0,10,2,0,12,14.0,,1057,0,0
4,P00285442,1,6,16,4+,0,8,,,7969,0,1


In [41]:
train=train.drop(['City_Category'],axis=1)

In [44]:
test=pd.concat([test,city_test],axis=1)

In [47]:
test=test.drop(['City_Category'],axis=1)

In [48]:
test.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,B,C
0,P00128942,1,4,7,2,1,1,11.0,,1,0
1,P00113442,1,2,17,0,0,3,5.0,,0,1
2,P00288442,0,3,1,4+,1,5,14.0,,1,0
3,P00145342,0,3,1,4+,1,4,9.0,,1,0
4,P00053842,0,2,1,1,0,4,5.0,12.0,0,1


In [49]:
#handling the Nan values
train.isnull().sum()

Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
B                                  0
C                                  0
dtype: int64

In [50]:
test.isnull().sum()

Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2             72344
Product_Category_3            162562
B                                  0
C                                  0
dtype: int64

In [51]:
#calculate mode 
train.Product_Category_2.mode()

0    8.0
dtype: float64

In [52]:
train.Product_Category_3.mode()

0    16.0
dtype: float64

In [53]:
test.Product_Category_2.mode()

0    8.0
dtype: float64

In [54]:
test.Product_Category_3.mode()

0    16.0
dtype: float64

In [55]:
train['Product_Category_2']=train['Product_Category_2'].fillna(train['Product_Category_2'].mode()[0])

In [56]:
train['Product_Category_3']=train['Product_Category_3'].fillna(train['Product_Category_3'].mode()[0])

In [57]:
train.isnull().sum()

Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
B                             0
C                             0
dtype: int64

In [58]:
test['Product_Category_2']=test['Product_Category_2'].fillna(test['Product_Category_2'].mode()[0])

In [59]:
test['Product_Category_3']=test['Product_Category_3'].fillna(test['Product_Category_3'].mode()[0])

In [60]:
train.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,B,C
0,P00069042,0,0,10,2,0,3,8.0,16.0,8370,0,0
1,P00248942,0,0,10,2,0,1,6.0,14.0,15200,0,0
2,P00087842,0,0,10,2,0,12,8.0,16.0,1422,0,0
3,P00085442,0,0,10,2,0,12,14.0,16.0,1057,0,0
4,P00285442,1,6,16,4+,0,8,8.0,16.0,7969,0,1


In [61]:
#handing the str value of stay in current ccity
train.Stay_In_Current_City_Years.unique()

array(['2', '4+', '3', '1', '0'], dtype=object)

In [62]:
train['Stay_In_Current_City_Years']=train['Stay_In_Current_City_Years'].str.replace('+','')

In [63]:
train['Stay_In_Current_City_Years'].head()

0    2
1    2
2    2
3    2
4    4
Name: Stay_In_Current_City_Years, dtype: object

In [64]:
test['Stay_In_Current_City_Years']=test['Stay_In_Current_City_Years'].str.replace('+','')

In [65]:
test['Stay_In_Current_City_Years'].head()

0    2
1    0
2    4
3    4
4    1
Name: Stay_In_Current_City_Years, dtype: object

In [66]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  550068 non-null  object 
 1   Gender                      550068 non-null  int64  
 2   Age                         550068 non-null  int64  
 3   Occupation                  550068 non-null  int64  
 4   Stay_In_Current_City_Years  550068 non-null  object 
 5   Marital_Status              550068 non-null  int64  
 6   Product_Category_1          550068 non-null  int64  
 7   Product_Category_2          550068 non-null  float64
 8   Product_Category_3          550068 non-null  float64
 9   Purchase                    550068 non-null  int64  
 10  B                           550068 non-null  uint8  
 11  C                           550068 non-null  uint8  
dtypes: float64(2), int64(6), object(2), uint8(2)
memory usage: 43.0+ MB


In [67]:
#changint the Dtype of object to int
train['Stay_In_Current_City_Years']=train['Stay_In_Current_City_Years'].astype(int)
train['B']=train['B'].astype(int)
train['C']=train['C'].astype(int)


In [68]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  550068 non-null  object 
 1   Gender                      550068 non-null  int64  
 2   Age                         550068 non-null  int64  
 3   Occupation                  550068 non-null  int64  
 4   Stay_In_Current_City_Years  550068 non-null  int64  
 5   Marital_Status              550068 non-null  int64  
 6   Product_Category_1          550068 non-null  int64  
 7   Product_Category_2          550068 non-null  float64
 8   Product_Category_3          550068 non-null  float64
 9   Purchase                    550068 non-null  int64  
 10  B                           550068 non-null  int64  
 11  C                           550068 non-null  int64  
dtypes: float64(2), int64(9), object(1)
memory usage: 50.4+ MB


In [69]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  233599 non-null  object 
 1   Gender                      233599 non-null  int64  
 2   Age                         233599 non-null  int64  
 3   Occupation                  233599 non-null  int64  
 4   Stay_In_Current_City_Years  233599 non-null  object 
 5   Marital_Status              233599 non-null  int64  
 6   Product_Category_1          233599 non-null  int64  
 7   Product_Category_2          233599 non-null  float64
 8   Product_Category_3          233599 non-null  float64
 9   B                           233599 non-null  uint8  
 10  C                           233599 non-null  uint8  
dtypes: float64(2), int64(5), object(2), uint8(2)
memory usage: 16.5+ MB


In [70]:
test['Stay_In_Current_City_Years']=test['Stay_In_Current_City_Years'].astype(int)
test['B']=test['B'].astype(int)
test['C']=test['C'].astype(int)


In [71]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  233599 non-null  object 
 1   Gender                      233599 non-null  int64  
 2   Age                         233599 non-null  int64  
 3   Occupation                  233599 non-null  int64  
 4   Stay_In_Current_City_Years  233599 non-null  int64  
 5   Marital_Status              233599 non-null  int64  
 6   Product_Category_1          233599 non-null  int64  
 7   Product_Category_2          233599 non-null  float64
 8   Product_Category_3          233599 non-null  float64
 9   B                           233599 non-null  int64  
 10  C                           233599 non-null  int64  
dtypes: float64(2), int64(8), object(1)
memory usage: 19.6+ MB


In [72]:
# dropping the features not required
train=train.drop(['Product_ID'],axis=1)
test=test.drop(['Product_ID'],axis=1)

In [73]:
train_y=train['Purchase']

In [74]:
train=train.drop(['Purchase'],axis=1)


In [75]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(train,train_y,test_size=0.25,random_state=42)

In [78]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [87]:
from sklearn.ensemble import RandomForestRegressor

rc=RandomForestRegressor(n_estimators=200)
rc.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [88]:
y_pred=rc.predict(x_test)

In [89]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred,squared=False)

3052.54125928508