#### Black Friday Dataset

Cleaning and preparing the data for model training

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


#### Problem Statement

A retail company, 'ABC Private Limited', wants to understand customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared a purchase summary of various customers for selected high-volume products from the last month. The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product_category) and the total purchase_amount from last month.

Now, they want to build a model to predict the purchase amount of a customer against various products, which will help them create personalized offers for customers against different products.

In [24]:
# Load the training split; its preview includes the Purchase target column
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_train.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [25]:
# Load the test split; same feature columns as train but without Purchase
df_test = pd.read_csv(filepath_or_buffer='test.csv')
df_test.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [21]:
## Stack train and test row-wise so every cleaning step is applied to both at once.
## ignore_index=True renumbers the rows from 0; rows coming from test have NaN in Purchase.
df = pd.concat((df_train, df_test), axis='index', ignore_index=True)
df

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370.0
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422.0
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057.0
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969.0
...,...,...,...,...,...,...,...,...,...,...,...,...
783662,1006036,P00118942,F,26-35,15,B,4+,1,8,,,
783663,1006036,P00254642,F,26-35,15,B,4+,1,5,8.0,,
783664,1006036,P00031842,F,26-35,15,B,4+,1,1,5.0,12.0,
783665,1006037,P00124742,F,46-50,1,C,4+,0,10,16.0,,


In [46]:
# Column dtypes and non-null counts of the combined frame (shows which columns have missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783667 entries, 0 to 783666
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  783667 non-null  object 
 1   Gender                      783667 non-null  object 
 2   Age                         783667 non-null  object 
 3   Occupation                  783667 non-null  int64  
 4   City_Category               783667 non-null  object 
 5   Stay_In_Current_City_Years  783667 non-null  object 
 6   Marital_Status              783667 non-null  int64  
 7   Product_Category_1          783667 non-null  int64  
 8   Product_Category_2          537685 non-null  float64
 9   Product_Category_3          237858 non-null  float64
 10  Purchase                    550068 non-null  float64
dtypes: float64(3), int64(3), object(5)
memory usage: 65.8+ MB


In [45]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,783667.0,783667.0,783667.0,537685.0,237858.0,550068.0
mean,8.0793,0.409777,5.366196,9.844506,12.668605,9263.968713
std,6.522206,0.491793,3.87816,5.089093,4.12551,5023.065394
min,0.0,0.0,1.0,2.0,3.0,12.0
25%,2.0,0.0,1.0,5.0,9.0,5823.0
50%,7.0,0.0,5.0,9.0,14.0,8047.0
75%,14.0,1.0,8.0,15.0,16.0,12054.0
max,20.0,1.0,20.0,18.0,18.0,23961.0


In [None]:
# User_ID is not useful to predict anything, so drop it.
# The previous call passed inplace='True' (a string); pandas validates `inplace` as a
# real bool and raises ValueError for a str, so the column was never dropped by this cell.
# Reassigning the result avoids `inplace` entirely, and errors='ignore' makes the cell
# safe to re-run after the column is already gone.
df = df.drop(columns='User_ID', errors='ignore')

In [50]:
# Preview the first rows to confirm User_ID is no longer a column
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,F,0-17,10,A,2,0,3,,,8370.0
1,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,P00087842,F,0-17,10,A,2,0,12,,,1422.0
3,P00085442,F,0-17,10,A,2,0,12,14.0,,1057.0
4,P00285442,M,55+,16,C,4+,0,8,,,7969.0


In [51]:
# Categorical features - Gender, Age, Occupation, City_Category, Stay_In_Current_City_Years and so on
# Let's convert the categorical variables into numerical ones

In [53]:
# Preview a one-hot encoding of Gender; the result is only displayed, not assigned back to df
pd.get_dummies(df['Gender'])

Unnamed: 0,F,M
0,True,False
1,True,False
2,True,False
3,True,False
4,False,True
...,...,...
783662,True,False
783663,True,False
783664,True,False
783665,True,False


In [54]:
# Handing categorical feature gender
df['Gender']=df['Gender'].map({'F':0,'M':1})
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,0,0-17,10,A,2,0,3,,,8370.0
1,P00248942,0,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,P00087842,0,0-17,10,A,2,0,12,,,1422.0
3,P00085442,0,0-17,10,A,2,0,12,14.0,,1057.0
4,P00285442,1,55+,16,C,4+,0,8,,,7969.0


In [55]:
# Handling categorical feature Age: list the distinct age brackets before encoding them
df['Age'].unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [56]:
# Ordinal-encode the age brackets, youngest (1) to oldest (7), so the order is preserved.
# Values not listed here would become NaN, as would a second run of this cell.
age_brackets = ['0-17','18-25','26-35','36-45','46-50','51-55','55+']
age_codes = {bracket: rank for rank, bracket in enumerate(age_brackets, start=1)}
df['Age'] = df['Age'].map(age_codes)
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,0,1,10,A,2,0,3,,,8370.0
1,P00248942,0,1,10,A,2,0,1,6.0,14.0,15200.0
2,P00087842,0,1,10,A,2,0,12,,,1422.0
3,P00085442,0,1,10,A,2,0,12,14.0,,1057.0
4,P00285442,1,7,16,C,4+,0,8,,,7969.0


In [57]:
# Handling City category


array([10, 16, 15,  7, 20,  9,  1, 12, 17,  0,  3,  4, 11,  8, 19,  2, 18,
        5, 14, 13,  6], dtype=int64)