In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import the libraries and the data

In [None]:
#Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Load the training and test sets (in that order) from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/black-friday/train.csv",
        "../input/black-friday/test.csv",
    )
)

In [None]:
# Checkpoint: keep an independent copy of the freshly loaded training frame
# under `df`, taken before the preprocessing cells below modify train_df.
df=train_df.copy()

In [None]:
# Summarise the training frame (columns, dtypes, non-null counts)
train_df.info()

In [None]:
# Same summary for the test frame
test_df.info()

In [None]:
# Preview the first 5 rows of the training data
train_df.head()

In [None]:
# Preview the first 5 rows of the test data
test_df.head()

## Data Exploration and PreProcessing

In [None]:
# Drop the User_ID column from both frames, modifying each frame in place.
for frame in (train_df, test_df):
    frame.drop(columns='User_ID', inplace=True)

In [None]:
# (rows, columns) of the training frame after dropping User_ID
train_df.shape

In [None]:
# Summary statistics for the training columns
train_df.describe()

In [None]:
# Distinct values of 'Age' in the training data (inspected before encoding)
train_df['Age'].unique()

In [None]:
# Distinct values of 'Age' in the test data, to compare against the training values
test_df['Age'].unique()

In [None]:
# Encode the Age bracket labels as ordinal integers: '0-17' -> 0 ... '55+' -> 6.
# Series.map yields NaN for any label that is not a key of the mapping.
age_codes = {
    label: code
    for code, label in enumerate(
        ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
    )
}
train_df['Age'] = train_df['Age'].map(age_codes)
test_df['Age'] = test_df['Age'].map(age_codes)


In [None]:
# Confirm the Age column now holds the encoded values
train_df['Age'].unique()

In [None]:
# Distinct values of 'Gender' in the training data (inspected before encoding)
train_df['Gender'].unique()

In [None]:
# Distinct values of 'Gender' in the test data
test_df['Gender'].unique()

In [None]:
# Binary-encode Gender in both frames: 'F' -> 0, 'M' -> 1.
gender_codes = {'F': 0, 'M': 1}
for frame in (train_df, test_df):
    frame['Gender'] = frame['Gender'].map(gender_codes)

In [None]:
# Confirm the Gender column now holds the encoded values
train_df['Gender'].unique()

In [None]:
# Distinct values of 'Marital_Status' in the training data
train_df['Marital_Status'].unique()

In [None]:
# Distinct city categories in the training data (one-hot encoded in the next cells)
train_df['City_Category'].unique()

In [None]:
# One-hot encode City_Category for the training data.
# drop_first=True omits the indicator column of the first category; later cells
# refer to the remaining indicator columns as 'B' and 'C'.
city=pd.get_dummies(train_df['City_Category'],drop_first=True)

In [None]:
# Inspect the generated indicator columns
city

In [None]:
# Append the indicator columns to train_df (column-wise, aligned on the index)
train_df=pd.concat([train_df,city],axis=1)

In [None]:
# Drop the original City_Category column now that it is encoded
train_df.drop('City_Category',axis=1,inplace=True)

In [None]:
# Preview the training frame with the new indicator columns
train_df.head()

In [None]:
# One-hot encode City_Category for the test data, the same way as for training
city_test=pd.get_dummies(test_df['City_Category'],drop_first=True)

In [None]:
# Append the indicator columns to test_df.
# Note: the original City_Category column is NOT dropped from test_df here;
# it is removed in a later cell, before the EDA section.
test_df=pd.concat([test_df,city_test],axis=1)

In [None]:
# Remove Product_ID from the training data
train_df.drop('Product_ID',axis=1,inplace=True)

In [None]:
# Remove Product_ID from the test data
test_df.drop('Product_ID',axis=1,inplace=True)

In [None]:
# Summarise the test frame after the changes above
test_df.info()

## Missing values imputation

In [None]:
# Fraction of missing values per training column (3 decimals), largest first.
train_nulls = train_df.isna()
percent_missing = (train_nulls.sum() / train_nulls.count()).round(3)
percent_missing.sort_values(ascending=False)

In [None]:
#Only Product_Category_2 (almost 32%) and Product_Category_3 (almost 70%) have null values

In [None]:
# Drop Product_Category_3 from the training data
train_df.drop('Product_Category_3',axis=1,inplace=True)

In [None]:
# Frequency of each Product_Category_2 value (NaN is excluded by value_counts by default)
train_df['Product_Category_2'].value_counts()

In [None]:
# Fill missing Product_Category_2 values with the column's mode;
# mode() returns a Series, so [0] takes its first entry.
train_df['Product_Category_2']=train_df['Product_Category_2'].fillna(train_df['Product_Category_2'].mode()[0])

In [None]:
# Count the missing values remaining in Product_Category_2 after the fill
train_df['Product_Category_2'].isna().sum()

In [None]:
# Fraction of missing values per test column (3 decimals), largest first.
test_nulls = test_df.isna()
percent_missing = (test_nulls.sum() / test_nulls.count()).round(3)
percent_missing.sort_values(ascending=False)

In [None]:
#Only Product_Category_2 (31%) and Product_Category_3 (almost 70%) have null values

In [None]:
# Drop Product_Category_3 from the test data
test_df.drop('Product_Category_3',axis=1,inplace=True)

In [None]:
# Fill missing Product_Category_2 values in the test data with the mode of the
# TRAINING column (not the test column), so both frames use the same fill value.
test_df['Product_Category_2']=test_df['Product_Category_2'].fillna(train_df['Product_Category_2'].mode()[0])

### Check the data types

In [None]:
# Review column dtypes of the training frame after imputation
train_df.info()

In [None]:
# Distinct values of Stay_In_Current_City_Years (the next cells strip '+' and cast it to int)
train_df['Stay_In_Current_City_Years'].unique()

In [None]:
# Remove '+' characters so the column can be cast to int in a later cell.
# regex=False treats '+' as a literal character: as a regular expression a bare
# '+' is invalid (a quantifier with nothing to repeat), and the default value of
# `regex` differs between pandas versions.
train_df['Stay_In_Current_City_Years']=train_df['Stay_In_Current_City_Years'].str.replace('+','',regex=False)

In [None]:
# Remove '+' characters so the column can be cast to int in a later cell.
# regex=False treats '+' as a literal character: as a regular expression a bare
# '+' is invalid (a quantifier with nothing to repeat), and the default value of
# `regex` differs between pandas versions.
test_df['Stay_In_Current_City_Years']=test_df['Stay_In_Current_City_Years'].str.replace('+','',regex=False)

In [None]:
# Cast the stay-duration column and the two city indicator columns to int.
for int_col in ('Stay_In_Current_City_Years', 'B', 'C'):
    train_df[int_col] = train_df[int_col].astype(int)

In [None]:
# Cast the stay-duration column and the two city indicator columns to int.
for int_col in ('Stay_In_Current_City_Years', 'B', 'C'):
    test_df[int_col] = test_df[int_col].astype(int)

In [None]:
# Inspect Product_Category_2 after imputation
train_df['Product_Category_2']

In [None]:
# Cast Product_Category_2 to float.
# NOTE(review): the column is presumably already float after the NaN fill above;
# confirm whether this cast is needed.
train_df['Product_Category_2']=train_df['Product_Category_2'].astype(float)

In [None]:
# Summarise the test frame to compare its columns with the training frame
test_df.info()

In [None]:
# City_Category is still present in test_df; it was already dropped from
# train_df right after one-hot encoding.
test_df['City_Category']

In [None]:
# Drop City_Category from test_df as well
test_df.drop('City_Category',axis=1,inplace=True)

# Exploratory Data Analysis

### Gender- Who purchases more?

In [None]:
# Mean Purchase per Gender code (0 = 'F', 1 = 'M' after the encoding above).
# x/y are passed by keyword: seaborn 0.12+ accepts only `data` positionally, so
# the positional form raises TypeError there.
sns.barplot(x='Gender',y='Purchase',data=train_df)

### Although the difference is not large, men spend more per purchase on average than women.

### Age- Which age Category is purchasing more

In [None]:
# Mean Purchase per encoded Age bracket (0 = '0-17' ... 6 = '55+').
# x/y are passed by keyword: seaborn 0.12+ accepts only `data` positionally, so
# the positional form raises TypeError there.
sns.barplot(x='Age',y='Purchase',data=train_df)

#### There is not much difference in average purchase amount between the age categories.

## Marital Status - Do married people purchase more than single people?

In [None]:
# Mean Purchase per Marital_Status value.
# x/y are passed by keyword: seaborn 0.12+ accepts only `data` positionally, so
# the positional form raises TypeError there.
sns.barplot(x='Marital_Status',y='Purchase',data=train_df)

### There is not much difference in Purchase between Married and Single People

In [None]:
# Mean Purchase per Occupation code.
# x/y are passed by keyword: seaborn 0.12+ accepts only `data` positionally, so
# the positional form raises TypeError there.
sns.barplot(x='Occupation',y='Purchase',data=train_df)

# Model Building

In [None]:
# Separate the target column (y) from the predictor columns (X).
target_col = 'Purchase'
y = train_df[target_col]
X = train_df.drop(columns=target_col)

### Splitting into Train and Validation Set

In [None]:
# Hold out half of the rows for validation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.5, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

### Random Forest Regressor

In [None]:
# Fit a 150-tree random forest on the training split and predict the validation split.
# random_state fixes the forest's internal sampling so the reported RMSE is
# reproducible across runs (same seed as the train/validation split).
rfr=RandomForestRegressor(n_estimators=150,random_state=42)
rfr.fit(X_train,y_train)
rfrpredict=rfr.predict(X_valid)

### Gradient Boosting Regressor

In [None]:
# Fit a gradient-boosting regressor with default hyper-parameters on the
# training split and predict the validation split.
# random_state makes the fit repeatable; this model also produces the final
# test-set predictions at the end of the notebook.
gbr=GradientBoostingRegressor(random_state=42)
gbr.fit(X_train,y_train)
gbrpredict= gbr.predict(X_valid)

### XGB Regressor

In [None]:
# Fit an XGBoost regressor with default hyper-parameters and predict the validation split.
# The model must be fitted on y_train, the labels that belong to X_train.
# Fitting on y_valid pairs every training row with an unrelated validation
# label (and leaks the validation targets), which invalidates the RMSE below.
xgr=XGBRegressor()
xgr.fit(X_train,y_train)
xgrpredict=xgr.predict(X_valid)

### Compare the RMSE scores

In [None]:
# Report the validation RMSE (square root of the mean squared error) of each model.
for label, preds in (
    ("RMSE score for Random_Forest : ", rfrpredict),
    ("RMSE score for Gradient Boosting : ", gbrpredict),
    ("RMSE score for XG Boosting : ", xgrpredict),
):
    print(label, np.sqrt(mean_squared_error(y_valid, preds)))

In [None]:
# Predict purchases for the test set with the gradient-boosting model.
# Selecting X.columns guarantees the test features are exactly the training
# features, in the training order; train_df and test_df were preprocessed in
# separate cells. A missing column raises a KeyError here.
finalpredict=gbr.predict(test_df[X.columns])

In [None]:
# Display the predictions computed for the test set
finalpredict