### A retail company “ABC Private Limited” wants to understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared purchase summary of various customers for selected high volume products from last month.
### The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.

### Now, they want to build a model to predict the purchase amount of customers against various products, which will help them create personalized offers for customers against different products.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Import libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# 'metrics' from sklearn is used for evaluating the model performance
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
#read train data
train_test = pd.read_csv('/kaggle/input/black-friday/train.csv')

#read validate data
df_validate = pd.read_csv('/kaggle/input/black-friday/test.csv')

In [None]:
train_test.head()

In [None]:
df_validate.head()

# Working on train_test data

In [None]:
df = train_test.copy()
df.head(1)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()

### EDA

##### User_ID 


In [None]:

df['User_ID'].nunique()
#there are 5891 unique values

In [None]:
df['User_ID'].value_counts()

In [None]:
plt.figure(figsize=(10,7))
df['User_ID'].value_counts().head(10).plot(kind='bar')
plt.xlabel('User_id')
plt.ylabel('Count')

#User_ID 1001680 has purchased the largest number of products

In [None]:
df.groupby(by='User_ID')['Purchase'].sum().nlargest(5)

##### Product_id

In [None]:
df['Product_ID'].value_counts()

In [None]:
df['Product_ID'].nunique()

##### Gender

In [None]:
df['Gender'].value_counts()

In [None]:
# Total purchase amount per gender.
# The string alias 'sum' replaces np.sum: passing NumPy callables as aggfunc
# is deprecated in recent pandas releases.
pd.pivot_table(df, values='Purchase', index='Gender', aggfunc='sum')

#Male customers have purchased about 3 times more than female customers

In [None]:
# Average purchase amount per gender ('mean' alias instead of the deprecated np.mean callable)
pd.pivot_table(df, values='Purchase', index='Gender', aggfunc='mean')
#On average, male customers purchase more

##### Age

In [None]:
df['Age'].nunique()

In [None]:
df['Age'].unique()

In [None]:
# Average purchase amount per age band ('mean' alias instead of the deprecated np.mean callable)
pd.pivot_table(df, values='Purchase', index='Age', aggfunc='mean')

In [None]:
# Total purchase amount per age band ('sum' alias instead of the deprecated np.sum callable)
pd.pivot_table(df, values='Purchase', index='Age', aggfunc='sum')

##### Occupation

In [None]:
df['Occupation'].nunique()

In [None]:
df['Occupation'].unique()

##### City_Category

In [None]:
df['City_Category'].unique()

##### Stay_In_Current_City_Years

In [None]:
df['Stay_In_Current_City_Years'].unique()

##### Purchase

In [None]:
# Box plot of the target. The series is passed explicitly as `x` rather than
# relying on how a bare positional argument is interpreted by the installed
# seaborn version.
sns.boxplot(x=df['Purchase'])
#there are outliers

In [None]:
# Distribution of the target. sns.distplot is deprecated; a density-scaled
# histogram with a KDE overlay is its replacement.
sns.histplot(x=df['Purchase'], kde=True, stat='density')

### Data pre-processing

In [None]:
df.info()

#### Duplicate records

In [None]:
df.duplicated().value_counts()

#There are no duplicate values

#### Outlier analysis


In [None]:

# Quartiles of the target and the 1.5 * IQR fences used below to flag outliers
q1, q3 = (df['Purchase'].quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1

ll = q1 - 1.5*iqr   # lower limit
ul = q3 + 1.5*iqr   # upper limit

In [None]:
df[(df['Purchase']<ll) | (df['Purchase']>ul)].shape

In [None]:
# Percentage of rows outside the IQR fences, computed from the data instead of
# hard-coded counts so it stays correct if the input file changes.
outlier_mask = (df['Purchase'] < ll) | (df['Purchase'] > ul)
outlier_mask.mean() * 100
#Roughly 0.49% of the rows are outliers (2677 of 550068 in the original run)

In [None]:
df = df[~((df['Purchase']<ll) | (df['Purchase']>ul))]

In [None]:
df.shape

#### Null value

In [None]:
df.isnull().sum()

In [None]:
# there are null values in Product_Category_2 & Product_Category_3

In [None]:
df['Product_Category_2'].unique()
#A product may also belong to other categories

In [None]:
#we impute the null values in Product_Category_2 & Product_Category_3 with 0, because those products do not belong to any additional category

In [None]:
df = df.fillna(0)

In [None]:
df.isnull().sum()

####  Split the data

In [None]:
y = df['Purchase']

X = df.drop('Purchase', axis=1)

#### One-hot encoding

In [None]:
X.info()

In [None]:
#Product_ID is insignificant, so we remove that variable

X.drop('Product_ID', axis=1, inplace=True)

In [None]:
X.drop('User_ID', axis=1, inplace=True)

In [None]:
#changing datatype of Product_Category_2 & Product_Category_3 to int

X[['Product_Category_2', 'Product_Category_3']] = X[['Product_Category_2', 'Product_Category_3']].astype('int')

In [None]:
X.info()

In [None]:
cat_cols = list(X.select_dtypes(exclude='number').columns)
cat_cols

In [None]:
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [None]:
X.shape

### train test split with 30% of test data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=4)

In [None]:
X_train.shape, X_test.shape

# Model building

## 1. Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import  mean_squared_error

In [None]:
lir = LinearRegression()
lir.fit(X_train, y_train)

In [None]:
y_test_pred = lir.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

#### 4495.2294

## 1.1 Removing user_id columns

In [None]:
X_train.columns

In [None]:
lir = LinearRegression()
lir.fit(X_train, y_train)

In [None]:
y_test_pred = lir.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

#### 4495.2885

## 1.2 Binning user_id columns

In [None]:
def user_bin(x):
    """Return the 1-based bin (1-6) that a User_ID value falls into.

    Bins 1-5 are bounded above (exclusively) by 1001000, 1002000, 1003000,
    1004000 and 1005000; anything not below the last bound lands in bin 6.
    """
    upper_bounds = (1001000, 1002000, 1003000, 1004000, 1005000)
    for bin_label, bound in enumerate(upper_bounds, start=1):
        if x < bound:
            return bin_label
    # Not below any bound -> overflow bin
    return len(upper_bounds) + 1

In [None]:
#X_train['User_ID'] = X_train['User_ID'].apply(lambda x : user_bin(x))

#X_test['User_ID'] = X_test['User_ID'].apply(lambda x : user_bin(x))


In [None]:
lir = LinearRegression()
lir.fit(X_train, y_train)

In [None]:
y_test_pred = lir.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

#### 4495.2325
##### There is no effect with user_id column, so we remove that

## 1.3 One-hot encode ==> occupation, product category

In [None]:
#X.drop('User_ID', axis=1, inplace=True)

In [None]:
X.shape

In [None]:
X.columns

In [None]:
X1 = pd.get_dummies(X, columns=['Occupation', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3'], drop_first=True)

In [None]:
X1.shape

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,y, test_size=0.3, random_state=4)

In [None]:
lir = LinearRegression()
lir.fit(X_train1, y_train1)

In [None]:
y_test_pred = lir.predict(X_test1)

mse = mean_squared_error(y_test1, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

#### 2950.96, this method worked a bit

## 1.4 Adding new column, total_amount

In [None]:
df.head()

In [None]:
X['User_ID'] = df['User_ID']

In [None]:
y.head()

In [None]:
# Total purchase amount per user ('sum' alias instead of the deprecated np.sum callable)
user_sum = pd.pivot_table(df, values='Purchase', index='User_ID', aggfunc='sum')
user_sum.head(5)

In [None]:
user_sum.columns = ['Total_purchase']

In [None]:
user_sum['User_ID'] = user_sum.index

user_sum.reset_index(drop=True, inplace=True)

In [None]:
user_sum.head(5)

In [None]:
#join this to main dataframe
# Look up each row's user total with map() instead of an inner merge: merge
# resets the index and, on older pandas versions, may reorder the rows, which
# would silently misalign X_new with y when both are passed positionally to
# train_test_split. map() keeps X's row order and index untouched.
# NOTE(review): Total_purchase is the per-user sum of the target over all rows
# of df, so this feature leaks Purchase into X - confirm that this is intended.
user_totals = user_sum.set_index('User_ID')['Total_purchase']
X_new = X.assign(Total_purchase=X['User_ID'].map(user_totals))

In [None]:
X_new['Total_purchase'].nunique()

In [None]:
X_new['User_ID'].nunique()

In [None]:
X_new.drop('User_ID', axis=1, inplace=True)

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_new,y, test_size=0.3, random_state=4)

In [None]:
lir = LinearRegression()
lir.fit(X_train1, y_train1)

In [None]:
y_test_pred = lir.predict(X_test1)

mse = mean_squared_error(y_test1, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

#### 4933.8718, not at all useful

## 1.5 RFE

In [None]:
from sklearn.feature_selection import RFE, RFECV

In [None]:
X.drop('User_ID', axis=1, inplace=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=4)

In [None]:
lir = LinearRegression()

#RUN RFECV to find out the best number of features to be selected
# The selector is fitted on the training split only, so the held-out rows used
# for the RMSE afterwards do not influence which features are kept.
rfe_n = RFECV(estimator=lir, cv=3, scoring='neg_root_mean_squared_error', verbose=2, n_jobs=-1)
rfe_n.fit(X_train, y_train)

In [None]:
#Number
print('Number of features selected :', rfe_n.n_features_)

In [None]:
#Selected features
selected = list(X.columns[rfe_n.support_])
print('\nSelected features :',selected)

In [None]:
#selecting only features from RFE in both train & test dataset

X_train_sel = X_train[selected]
X_test_sel = X_test[selected]

In [None]:
X.shape

In [None]:
lir = LinearRegression()
lir.fit(X_train_sel, y_train)

In [None]:
y_test_pred = lir.predict(X_test_sel)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

#### 4495.2841, we can try RFE on 1.3

In [None]:
X1 = pd.get_dummies(X, columns=['Occupation', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3'], drop_first=True)

In [None]:
X1.shape

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,y, test_size=0.3, random_state=4)

In [None]:
lir = LinearRegression()

#RUN RFECV to find out the best number of features to be selected
# The selector is fitted on the training split of the one-hot encoded data only,
# so the held-out rows used for the RMSE afterwards do not influence the selection.
rfe_n = RFECV(estimator=lir, cv=3, scoring='neg_root_mean_squared_error', verbose=2, n_jobs=-1)
rfe_n.fit(X_train1, y_train1)

In [None]:
#Number
print('Number of features selected :', rfe_n.n_features_)

In [None]:
#Selected features
selected = list(X1.columns[rfe_n.support_])
print('\nSelected features :',selected)

In [None]:
#selecting only features from RFE in both train & test dataset

X_train_sel = X_train1[selected]
X_test_sel = X_test1[selected]

In [None]:
lir = LinearRegression()
lir.fit(X_train_sel, y_train1)

In [None]:
y_test_pred = lir.predict(X_test_sel)

mse = mean_squared_error(y_test1, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

#### 2950.9553, this is not effective. We will consider 1.3 as final linear regression

### Final Linear regression

In [None]:
X.shape

In [None]:
X1 = pd.get_dummies(X, columns=['Occupation', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3'], drop_first=True)

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,y, test_size=0.3, random_state=4)

In [None]:
lir = LinearRegression()
lir.fit(X_train1, y_train1)

In [None]:
y_test_pred = lir.predict(X_test1)

mse = mean_squared_error(y_test1, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)


In [None]:
#remove product and user id, null imputation, label encode
X1.shape

In [None]:
#read validate data
df_validate = pd.read_csv('/kaggle/input/black-friday/test.csv')

df_validate.drop(['User_ID', 'Product_ID'], axis=1, inplace=True)

In [None]:
df_validate_copy = pd.read_csv('/kaggle/input/black-friday/test.csv')

In [None]:
df_validate.isnull().sum()

In [None]:
df_validate = df_validate.fillna(0)

In [None]:
df_validate[['Product_Category_1', 'Product_Category_2', 'Product_Category_3']] = df_validate[['Product_Category_1', 'Product_Category_2', 'Product_Category_3']].astype('int')

In [None]:
df_validate.info()

In [None]:
df_validate = pd.get_dummies(df_validate, columns=df_validate.columns, drop_first=True)

In [None]:
X1_cols = list(X1.columns)
val_cols = list(df_validate.columns)
for i in X1_cols:
    if i not in val_cols:
        print(i)

In [None]:
# Align the validation matrix with the training design matrix X1.
#  - The get_dummies call on df_validate encoded every column, including
#    Marital_Status, which X1 keeps as a plain column; restore the training name.
#    (assumes Marital_Status only takes the values 0/1 - TODO confirm)
#  - reindex adds dummy columns that only occur in training (e.g.
#    Product_Category_1_19 / Product_Category_1_20) filled with 0, drops any
#    validation-only columns, and puts the columns in the order X1 uses.
df_validate = df_validate.rename(columns={'Marital_Status_1': 'Marital_Status'})
df_validate = df_validate.reindex(columns=X1.columns, fill_value=0)

In [None]:
df_validate.shape

In [None]:
val_pred = lir.predict(df_validate)

In [None]:
type(val_pred)

In [None]:
submission_1 = pd.DataFrame(val_pred, columns=['Purchase'])

In [None]:
submission_1['User_ID'] = df_validate_copy['User_ID']

In [None]:
submission_1['Product_ID'] = df_validate_copy['Product_ID']

In [None]:
# Write only Purchase, User_ID and Product_ID; without index=False the row
# index would be saved as an extra unnamed first column.
submission_1.to_csv('submission_1.csv', index=False)

In [None]:
submission_1.head()

In [None]:
#without those label encoding

In [None]:
lir = LinearRegression()
lir.fit(X_train, y_train)

In [None]:
y_test_pred = lir.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

In [None]:
X_train.shape

In [None]:
df_validate = pd.get_dummies(df_validate, columns=df_validate.select_dtypes(exclude='number').columns, drop_first=True)

In [None]:
#val_pred = lir.predict(df_validate)

In [None]:
#submission_1_1 = pd.DataFrame(val_pred, columns=['Purchase'] )

#submission_1_1['User_ID'] = df_validate_copy['User_ID']

#submission_1_1['Product_ID'] = df_validate_copy['Product_ID']

#submission_1_1.to_csv('submission_1_1.csv')

# 2. Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
dtc = DecisionTreeRegressor()
dtc.fit(X_train, y_train)

In [None]:
y_test_pred = dtc.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

In [None]:
#val_pred = dtc.predict(df_validate)

#submission_dtc = pd.DataFrame(val_pred, columns=['Purchase'] )

#submission_dtc['User_ID'] = df_validate_copy['User_ID']

#submission_dtc['Product_ID'] = df_validate_copy['Product_ID']

#submission_dtc.to_csv('submission_dtc.csv')

### 3316.0395

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from scipy.stats import randint as sp_randint

In [None]:
# GridSearchCV & RandomizedSearchCV results were almost similar
# We are considering RandomizedSearchCV for Hyper-parameter tuning

dtc = DecisionTreeRegressor(random_state=4)

# Search space: tree depth and minimum leaf size
params = {'max_depth' : sp_randint(2,10),
         'min_samples_leaf' : sp_randint(1,12)}

rsearch = RandomizedSearchCV(dtc, param_distributions=params, n_iter=25, n_jobs=-1, 
                             cv=3, scoring='neg_root_mean_squared_error', random_state=4)

# Tune on the training split only, so X_test / y_test stay unseen for the
# evaluation of the tuned model.
rsearch.fit(X_train, y_train)

In [None]:
#Best parameters
print(rsearch.best_params_)

In [None]:
dtc = DecisionTreeRegressor(**rsearch.best_params_, random_state=4)
dtc.fit(X_train, y_train)

y_test_pred = dtc.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)


### 2931.1061

In [None]:
#val_pred = dtc.predict(df_validate)

In [None]:
#submission_dtc_hp = pd.DataFrame(val_pred, columns=['Purchase'] )

#submission_dtc_hp['User_ID'] = df_validate_copy['User_ID']

#submission_dtc_hp['Product_ID'] = df_validate_copy['Product_ID']

#submission_dtc_hp.to_csv('submission_dtc_hp.csv')

# RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rfr = RandomForestRegressor(random_state=4)
rfr.fit(X_train, y_train)

y_test_pred = rfr.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

In [None]:
X

In [None]:
#val_pred = rfr.predict(df_validate)

#submission_rfr = pd.DataFrame(val_pred, columns=['Purchase'] )

#submission_rfr['User_ID'] = df_validate_copy['User_ID']

#submission_rfr['Product_ID'] = df_validate_copy['Product_ID']

#submission_rfr.to_csv('submission_rfr.csv')

In [None]:
rfr = RandomForestRegressor(random_state=4)
rfr.fit(X_train1, y_train1)

y_test_pred = rfr.predict(X_test1)

mse = mean_squared_error(y_test1, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

### RandomForest Hyperparameter tuning

In [None]:
rfr = RandomForestRegressor(random_state=4)


# Search space: forest size, features tried per split, leaf size and depth
params = {'n_estimators': sp_randint(50,200),
         'max_features': sp_randint(1,15),
         'min_samples_leaf' : sp_randint(1,25),
          'max_depth' : sp_randint(1,10)}

rsearch = RandomizedSearchCV(rfr, param_distributions=params, cv=3, n_iter=30, verbose=2, 
                             scoring='neg_root_mean_squared_error', random_state=4, n_jobs=-1)
# Tune on the training split only, so X_test / y_test stay unseen for the
# evaluation of the tuned model.
rsearch.fit(X_train, y_train)

In [None]:
#Best parameters
print(rsearch.best_params_)

In [None]:
rfr = RandomForestRegressor(**rsearch.best_params_, random_state=4)
rfr.fit(X_train, y_train)

y_test_pred = rfr.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)


In [None]:
pd.DataFrame(rsearch.cv_results_).head(1)

In [None]:
#val_pred = rfr.predict(df_validate)

#submission_rfr = pd.DataFrame(val_pred, columns=['Purchase'] )

#submission_rfr['User_ID'] = df_validate_copy['User_ID']

#submission_rfr['Product_ID'] = df_validate_copy['Product_ID']

#submission_rfr.to_csv('submission_rfr_hp_old.csv')

# LGBMRegressor

In [None]:
import lightgbm as lgb

In [None]:
lgbc = lgb.LGBMRegressor()
lgbc.fit(X_train, y_train)

In [None]:
y_test_pred = lgbc.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

## LGBMRegressor with Hyper-parameter tuning

In [None]:
from scipy.stats import uniform as sp_uniform

In [None]:
lgbc = lgb.LGBMRegressor()

# Search space: number of boosting rounds, tree depth and learning rate
params = {'n_estimators':sp_randint(50,250),
         'max_depth' : sp_randint(1,50),
         'learning_rate' : sp_uniform(0,0.5)}

rsearch = RandomizedSearchCV(lgbc, param_distributions=params, scoring='neg_root_mean_squared_error', cv=3, n_iter=50,
                             n_jobs=-1, random_state=4)
# Tune on the training split only, so X_test / y_test stay unseen for the
# evaluation of the tuned model.
rsearch.fit(X_train, y_train)

In [None]:
#Best parameters
print(rsearch.best_params_)

In [None]:
lgbc = lgb.LGBMRegressor(**rsearch.best_params_, random_state=4)
lgbc.fit(X_train, y_train)

In [None]:
y_test_pred = lgbc.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

In [None]:
#val_pred = lgbc.predict(df_validate)

#submission = pd.DataFrame(val_pred, columns=['Purchase'] )
#
#submission['User_ID'] = df_validate_copy['User_ID']

#submission['Product_ID'] = df_validate_copy['Product_ID']

#submission.to_csv('submission_lgbm_hp_old.csv')

In [None]:
#df_validate = pd.get_dummies(df_validate, columns=['Gender','Age', 'City_Category', 'Stay_In_Current_City_Years' ], drop_first=True)

# KNeighborsRegressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knn=KNeighborsRegressor()
knn.fit(X_train, y_train)

y_test_pred = knn.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

In [None]:
#val_pred = knn.predict(df_validate)

#submission = pd.DataFrame(val_pred, columns=['Purchase'] )

#submission['User_ID'] = df_validate_copy['User_ID']

#submission['Product_ID'] = df_validate_copy['Product_ID']

#submission.to_csv('submission_knn_old.csv')

## hp tuning

In [None]:
knn=KNeighborsRegressor()


# Search space: number of neighbours and the Minkowski distance power p
params={'n_neighbors':sp_randint(1,50),'p':sp_randint(1,7)}

rsearch = RandomizedSearchCV(knn, param_distributions=params, cv=3,verbose=2,n_iter=30,
                             scoring='neg_root_mean_squared_error', random_state=4, n_jobs=-1)
# Tune on the training split only, so X_test / y_test stay unseen for the
# evaluation of the tuned model.
rsearch.fit(X_train, y_train)

In [None]:
#Best parameters
print(rsearch.best_params_)

In [None]:
# KNeighborsRegressor takes no random_state argument (passing one raises a
# TypeError), so only the tuned parameters are forwarded.
knn=KNeighborsRegressor(**rsearch.best_params_)
# Fit on the training split: fitting on the full X would put every X_test row
# into the neighbour index and make the RMSE below meaningless.
knn.fit(X_train, y_train)

y_test_pred = knn.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

# GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

y_test_pred = gbr.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)
rmse = round(np.sqrt(mse), 4)

print(rmse)

In [None]:
%%time

#val_pred = gbr.predict(df_validate)

#submission = pd.DataFrame(val_pred, columns=['Purchase'] )

#submission['User_ID'] = df_validate_copy['User_ID']

#submission['Product_ID'] = df_validate_copy['Product_ID']

#submission.to_csv('submission_gbr.csv')

In [None]:
## XY full
# Experiment: fit the default GradientBoostingRegressor on the complete X / y
# instead of the training split, then score it on X_test.
# NOTE(review): X_test was split off from X, so the RMSE printed here is measured
# on rows the model was trained on and is not comparable with held-out scores.

gbr = GradientBoostingRegressor()
gbr.fit(X, y)

y_test_pred = gbr.predict(X_test)

# Root mean squared error, rounded to 4 decimals
mse = mean_squared_error(y_test, y_test_pred)
rmse = round(np.sqrt(mse), 4)

print(rmse)

### tuning

In [None]:
gbr = GradientBoostingRegressor()

params={'n_estimators':sp_randint(50,250),
        'max_depth' : sp_randint(1,50),
        'learning_rate' : sp_uniform(0,0.5),
       'min_samples_leaf' : sp_randint(1,25)}

In [None]:
rsearch = RandomizedSearchCV(gbr, param_distributions=params, cv=3,verbose=2,n_iter=10,
                             scoring='neg_root_mean_squared_error', random_state=4, n_jobs=-1)
# Tune on the training split only, so X_test / y_test stay unseen for the
# evaluation of the tuned model.
rsearch.fit(X_train, y_train)

In [None]:
#Best parameters
print(rsearch.best_params_)

In [None]:
gbr = GradientBoostingRegressor(**rsearch.best_params_, random_state=4)
gbr.fit(X_train, y_train)

y_test_pred = gbr.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)

rmse = round(np.sqrt(mse), 4)

print(rmse)

# LGBM with hyper-parameter tuning gave a better RMSE than the other models