In [None]:
import numpy as np
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

**Analytics Vidhya: Black Friday Sales Prediction**

A retail company “ABC Private Limited” wants to understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared purchase summary of various customers for selected high volume products from last month. The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.

Now, they want to build a model to predict the purchase amount of customer against various products which will help them to create personalized offer for customers against different products.

**Variables Definition**

User_ID- User ID

Product_ID - Product ID

Gender - Sex of User

Age - Age in bins

Occupation - Occupation (Masked)

City_Category - Category of the City (A,B,C)

Stay_In_Current_City_Years - Number of years stay in current city

Marital_Status - Marital Status

Product_Category_1 - Product Category (Masked)

Product_Category_2 - Product may also belong to another category (Masked)

Product_Category_3 - Product may also belong to another category (Masked)

Purchase - Purchase Amount (Target Variable)

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-darkgrid')
np.random.seed(42)

from scipy import stats
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
# Read the train/test CSVs, then normalise the headers of both frames:
# trim whitespace, lower-case, and replace spaces with underscores.
train_data = pd.read_csv('../input/black-friday-sales-prediction/train_oSwQCTC (1)/train.csv')
test_data = pd.read_csv('../input/black-friday-sales-prediction/test_HujdGe7 (1)/test.csv')
for frame in (train_data, test_data):
    frame.columns = frame.columns.str.strip().str.lower().str.replace(' ', '_')
print('Train Data shape: ', train_data.shape)
train_data.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Summary statistics of the training frame.
train_data.describe()

**Checking for Null values**

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

In [None]:
# Missing values per training column, as an absolute count and as a
# percentage of rows, each sorted from most to least missing.
null_counts = train_data.isnull().sum()
rows_per_column = train_data.isnull().count()

Total = null_counts.sort_values(ascending=False)
Percent = (null_counts * 100 / rows_per_column).sort_values(ascending=False)

missing_data = pd.concat(
    [Total, Percent],
    axis=1,
    keys=['Total', 'Percentage of Missing Values'],
)
missing_data

In [None]:
# Tag every row with the frame it came from, then stack train and test
# into one frame for exploratory analysis.
for frame, origin in ((train_data, 'train'), (test_data, 'test')):
    frame['type'] = origin

master_data = pd.concat([train_data, test_data], axis=0)
master_data.head()

**Exploratory Data Analysis**

In [None]:
# Purchase amount: distribution on the left, box plot on the right.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, num=1, figsize=(16, 6))
sns.distplot(master_data["purchase"], ax=ax_dist);
master_data["purchase"].plot.box(ax=ax_box)
plt.show()

**Purchase has unequal distribution**

In [None]:
# Gender: print the raw counts, then chart each value's share of rows.
fig, ax = plt.subplots(figsize=(10, 5))
print(master_data["gender"].value_counts())
gender_share = master_data['gender'].value_counts(normalize=True)
gender_share.plot.bar(title='Gender', ax=ax)
**There are more males than females**

In [None]:
# Age bracket: print the raw counts, then chart each bracket's share of rows.
fig, ax = plt.subplots(figsize=(10, 5))
print(master_data["age"].value_counts())
age_share = master_data['age'].value_counts(normalize=True)
age_share.plot.bar(title='Age', ax=ax)

**The 26-35 age group has the highest count**

In [None]:
# Years in current city: print the raw counts, then chart each value's share.
fig, ax = plt.subplots(figsize=(10, 5))
print(master_data["stay_in_current_city_years"].value_counts())
stay_share = master_data['stay_in_current_city_years'].value_counts(normalize=True)
stay_share.plot.bar(title='Stay in current city - Years', ax=ax)

**Most people have stayed in their current city for 1 year**

In [None]:
# Occupation code: print the raw counts, then chart each code's share of rows.
fig, ax = plt.subplots(figsize=(10, 5))
print(master_data["occupation"].value_counts())
occupation_share = master_data['occupation'].value_counts(normalize=True)
occupation_share.plot.bar(title='Occupation', ax=ax)

In [None]:
# City category: print the raw counts, then chart each category's share.
fig, ax = plt.subplots(figsize=(10, 5))
print(master_data["city_category"].value_counts())
city_share = master_data['city_category'].value_counts(normalize=True)
city_share.plot.bar(title='City Category', ax=ax)

**Category B city has the highest count**

In [None]:
# Marital status: print the raw counts, then chart each value's share of rows.
# The chart title previously read "Martial Status"; the column is marital_status.
plt.figure(figsize = (10, 5))
print(master_data["marital_status"].value_counts())
master_data['marital_status'].value_counts(normalize = True).plot.bar(title = 'Marital Status')

**Most people are not married**

In [None]:
# Product category 1: print the raw counts, then chart each category's share.
fig, ax = plt.subplots(figsize=(10, 5))
print(master_data["product_category_1"].value_counts())
category_1_share = master_data['product_category_1'].value_counts(normalize=True)
category_1_share.plot.bar(title='Product Category 1', ax=ax)

In [None]:
# Product category 2: print the raw counts, then chart each category's share.
# value_counts() skips missing values, so shares are relative to non-null rows.
fig, ax = plt.subplots(figsize=(10, 5))
print(master_data["product_category_2"].value_counts())
category_2_share = master_data['product_category_2'].value_counts(normalize=True)
category_2_share.plot.bar(title='Product Category 2', ax=ax)

In [None]:
# Product category 3: print the raw counts, then chart each category's share.
# value_counts() skips missing values, so shares are relative to non-null rows.
fig, ax = plt.subplots(figsize=(10, 5))
print(master_data["product_category_3"].value_counts())
category_3_share = master_data['product_category_3'].value_counts(normalize=True)
category_3_share.plot.bar(title='Product Category 3', ax=ax)

In [None]:
# Count the distinct product IDs inside each product_category_1 value and
# show the result as a bar chart.
prod_by_cat = master_data.groupby('product_category_1')['product_id'].nunique()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=prod_by_cat.index, y=prod_by_cat.values, ax=ax)
ax.set_title('Number of Unique Items per Category')
plt.show()

In [None]:
# Purchase distribution per city category, split by marital status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(data=master_data, x='city_category', y='purchase',
               hue='marital_status', ax=ax)

**Almost all the city categories have equal proportion of males and females**

In [None]:
# Row counts per gender, split by age bracket.
# Columns are passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, so passing the gender Series positionally
# no longer plots it on the x axis.
plt.figure(figsize = (10, 5))
sns.countplot(x = "gender", hue = "age", data = master_data).set_title("Age & Gender")

**The 26-35 age group has the highest count for both males and females.**

In [None]:
# Mean purchase amount per gender.
# Only the purchase column is aggregated: averaging the whole frame fails on
# its string columns in recent pandas, and it was computed twice before.
plt.figure(figsize = (10, 5))
mean_purchase_by_gender = master_data.groupby("gender")["purchase"].mean()
x = mean_purchase_by_gender.index
y = mean_purchase_by_gender.values
plt.plot(x, y, "ro")
# groupby sorts its keys, so the index order is F then M. Derive each tick
# label from the code it belongs to instead of hard-coding an order (the
# previous ["male", "female"] list labelled the points the wrong way round).
# Assumes the gender codes are 'F'/'M'; unknown codes are shown unchanged.
gender_names = {"F": "female", "M": "male"}
plt.xticks(x, [gender_names.get(code, code) for code in x])
plt.title("Mean purchase of different gender")
sns.despine()

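**Males have higher mean purchases compared to females** (the grouped index is sorted as `F`, `M`, so the first point in the plot is the female mean and the second is the male mean).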

In [None]:
# Share of the total purchase amount contributed by each city category.
# The title previously read "City Categry".
# explode has three entries, so this assumes exactly three city categories.
plt.figure(figsize = (10, 5))
master_data.groupby("city_category")["purchase"].sum().plot.pie(title = "City Category", 
                                                               startangle = 90, explode = (0.1, 0, 0), 
                                                               autopct = "%1.1f%%", shadow = True)

In [None]:
# Mean purchase amount per city category.
# Only the purchase column is aggregated: averaging the whole frame fails on
# its string columns in recent pandas, and it was computed twice before.
plt.figure(figsize = (10, 5))
mean_purchase_by_city = master_data.groupby("city_category")["purchase"].mean()
x = mean_purchase_by_city.index
y = mean_purchase_by_city.values
plt.plot(x, y, "ro")
plt.title("Mean purchase of different city categories")

**City category C has the highest mean purchase.**

In [None]:
# Share of rows for each "years in current city" value.
# explode has five entries, so this assumes exactly five distinct values.
stay_counts = master_data["stay_in_current_city_years"].value_counts()
fig, ax = plt.subplots(figsize=(10, 5))
stay_counts.plot.pie(
    ax=ax,
    title="Years of staying in the city",
    explode=(0.1, 0, 0, 0, 0),
    autopct="%1.1f%%",
    shadow=True,
)

**Data Preprocessing**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# Replace each user_id with a dense integer code.
# The encoder is fitted on the IDs of both frames so that a user who appears
# only in the test set does not make transform() raise. When every test user
# is also in train, the codes are identical to fitting on train alone.
# The former "- 1000000" shift was dropped: LabelEncoder ranks the sorted
# unique values, so a constant offset never changed the resulting codes.
enc = LabelEncoder()
enc.fit(pd.concat([train_data['user_id'], test_data['user_id']]))
train_data['user_id'] = enc.transform(train_data['user_id'])
test_data['user_id'] = enc.transform(test_data['user_id'])

In [None]:
# Turn product IDs such as "P00069042" into numbers and standardise them.
# The 'P00' marker is removed as a literal string, and the remaining digits
# are converted explicitly: handing strings to StandardScaler relied on
# implicit coercion, whereas to_numeric fails loudly on an ID that is not
# numeric after the marker is removed.
train_data['product_id'] = pd.to_numeric(train_data['product_id'].str.replace('P00', '', regex=False))
test_data['product_id'] = pd.to_numeric(test_data['product_id'].str.replace('P00', '', regex=False))

# Fit the scaler on train only and reuse its statistics for test.
scaler = StandardScaler()
train_data['product_id'] = scaler.fit_transform(train_data['product_id'].values.reshape(-1, 1))
test_data['product_id'] = scaler.transform(test_data['product_id'].values.reshape(-1, 1))

In [None]:
# Columns that are label-encoded further down.
categorical_col = ['gender', 'city_category']
# Columns that are standardised further down; 'age' and
# 'stay_in_current_city_years' start out as text and are mapped to numbers
# by the cells that follow before any scaling happens.
numerical_col = ['age', 'occupation', 'stay_in_current_city_years', 'product_category_1', 'product_category_2', 'product_category_3']

In [None]:
# Replace each age-bracket label with the bracket's upper bound
# (60 stands in for the open-ended '55+' bracket).
# One mapping replaces seven separate replace() calls, and to_numeric makes
# the column numeric explicitly; it raises if an unmapped label is left over.
age_upper_bound = {
    '0-17': 17,
    '18-25': 25,
    '26-35': 35,
    '36-45': 45,
    '46-50': 50,
    '51-55': 55,
    '55+': 60,
}
train_data['age'] = pd.to_numeric(train_data['age'].replace(age_upper_bound))

In [None]:
# Replace each age-bracket label in the test frame with the bracket's upper
# bound (60 stands in for the open-ended '55+' bracket).
# One mapping replaces seven separate replace() calls, and to_numeric makes
# the column numeric explicitly; it raises if an unmapped label is left over.
age_upper_bound = {
    '0-17': 17,
    '18-25': 25,
    '26-35': 35,
    '36-45': 45,
    '46-50': 50,
    '51-55': 55,
    '55+': 60,
}
test_data['age'] = pd.to_numeric(test_data['age'].replace(age_upper_bound))

In [None]:
# Cap the open-ended '4+' label at 4 and make the column numeric.
# replace() alone left integer 4 mixed in with the other values as text;
# to_numeric converts them all and raises on any other non-numeric label.
train_data['stay_in_current_city_years'] = pd.to_numeric(train_data['stay_in_current_city_years'].replace('4+', 4))
test_data['stay_in_current_city_years'] = pd.to_numeric(test_data['stay_in_current_city_years'].replace('4+', 4))

**Imputing null values**

In [None]:
# Fill every missing value in both frames with 0.
train_data, test_data = (frame.fillna(0) for frame in (train_data, test_data))

**Encoding categorical columns**

In [None]:
# Label-encode each categorical column: the encoder learns the classes from
# train, and the same mapping is then applied to test.
encoder = LabelEncoder()

for column in categorical_col:
    encoder.fit(train_data[column])
    train_data[column] = encoder.transform(train_data[column])
    test_data[column] = encoder.transform(test_data[column])

**Scaling numerical columns**

In [None]:
# Standardise each numerical column: the scaler is refitted on the train
# values of that column, then applied unchanged to the test values.
scaler = StandardScaler()

for feature in numerical_col:
    train_values = train_data[feature].to_numpy().reshape(-1, 1)
    test_values = test_data[feature].to_numpy().reshape(-1, 1)
    train_data[feature] = scaler.fit_transform(train_values)
    test_data[feature] = scaler.transform(test_values)

In [None]:
# Inspect the training frame after imputation, encoding and scaling.
train_data

In [None]:
# Preview the feature columns of the training frame.
# X_train is only created by the train/validation split further down, so
# referencing it here raised NameError on a fresh top-to-bottom run; the
# preview is taken from train_data, the frame that exists at this point.
train_data.drop(columns=['purchase', 'type']).head()

In [None]:
# Preview the feature columns of the test frame.
# X_val is only created by the train/validation split further down, so
# referencing it here raised NameError on a fresh top-to-bottom run.
# NOTE(review): test_data is shown as the held-out frame available at this
# point — confirm that is the intended preview.
test_data.drop(columns=['type']).head()

**Splitting the dataset**

In [None]:
# Features are every training column except the target and the 'type' tag.
X = train_data.drop(columns=['purchase', 'type'])
y = train_data[['purchase']]
# Select exactly the training feature columns, in the same order, so X_test
# does not carry the string 'type' column and can be passed to a model fitted
# on X. This raises KeyError if test_data lacks any of those columns.
X_test = test_data[X.columns]
# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)

In [None]:
# Column names of the preprocessed training frame.
train_data.columns

In [None]:
# Show the training frame without the 'type' tag (train_data is not modified).
train_data.drop(columns=['type'])

In [None]:
# Target values of the training split.
y_train

In [None]:
# Feature names of the validation split.
X_val.columns

In [None]:
# Preview the validation features.
# 'type' was already dropped when X was built, so dropping it again here
# raised KeyError.
X_val.head()

**Model Building**

In [None]:
from sklearn import linear_model

# Baseline: ordinary least squares fitted on the training split, then used
# to predict the validation split.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
lm_model = reg  # fit() returns the estimator itself
pred = lm_model.predict(X_val)

In [None]:
# Validation RMSE of the linear regression baseline.
np.sqrt(mean_squared_error(y_val,pred))

In [None]:
# Gradient-boosted trees with hand-picked hyperparameters.
# `random_state` replaces the deprecated `seed` alias of the scikit-learn
# wrapper; the value (0) is unchanged.
xgb_reg = XGBRegressor(learning_rate=1.0, max_depth=6, min_child_weight=40, random_state=0)

xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
# Arguments follow scikit-learn's (y_true, y_pred) order; the MSE value is
# the same either way.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(xgb_reg)

In [None]:
# Validation RMSE of the XGBoost model.
rmse

**We are getting a root mean square error of 2591.85.**

**The XGBoost model with the above hyperparameters performs better than Linear Regression.**