# Black Friday 

# Problem: Predict purchase amount.

## Step 1 : Import important libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Step 2 : Import Data file

In [None]:
# Load the training data once; keep an untouched copy of the raw frame so the
# original values remain available after `df` is cleaned further down.
df = pd.read_csv('../input/black-friday/train.csv')
df_backup = df.copy()
df.head()

## Step 3 : Data Analysis

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

We saw that only 2 columns have NaN values.

## (i) Gender

In [None]:
%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot

plt.pyplot.hist(df["Gender"])

It is found that men made about 3 times as many purchases as women.

In [None]:
# Total purchase amount per gender.
gender_purchase = df[['Gender', 'Purchase']]
dftemp = gender_purchase.groupby('Gender', as_index=False)
dfG = dftemp.sum()
dfG

In [None]:
# Bar chart of total purchase per gender. Plot against the 'Gender' column so
# the ticks show the gender labels rather than the positional index, and set
# the axis label on the returned Axes (the earlier `plt.xlabel = (...)` only
# assigned an attribute and never labelled the plot).
ax = dfG.plot(kind='bar', x='Gender', y='Purchase')
ax.set_xlabel('Gender')

<u>Thus, in terms of gender, more men than women make purchases on Black Friday, and men also tend to buy more expensive products on that day. As a result, men's total purchase amount is almost 4 times that of women.</u>

## (ii) Age

In [None]:
# Number of transactions per age bracket.
pyplot.hist(df["Age"])

People aged 26-35 buy the most products on this day.

## (iii) Occupation

In [None]:
# Distribution of transactions across occupation codes.
df['Occupation'].plot.hist(figsize=(20, 5), bins=20)

In [None]:
# Total purchase amount per occupation code.
occupation_purchase = df[['Occupation', 'Purchase']]
dftemp = occupation_purchase.groupby('Occupation', as_index=False)
dfO = dftemp.sum()
dfO

In [None]:
# Bar chart of total purchase per occupation. 'Occupation' is numeric, so it
# must be named as the x axis explicitly; otherwise it is drawn as a second
# bar series next to 'Purchase'.
dfO.plot(kind='bar', x='Occupation', y='Purchase')

We found that occupations 0 and 4 are the most common among the buyers. Thus their total purchase (in terms of occupation) is the highest.
I don't think we should use occupation in the final training data.

## (iv) City_Category

In [None]:
# Number of transactions per city category.
pyplot.hist(df["City_Category"])

In [None]:
# Total purchase amount per city category.
city_purchase = df[['City_Category', 'Purchase']]
dftemp = city_purchase.groupby('City_Category', as_index=False)
dfCity = dftemp.sum()
dfCity

In [None]:
# Bar chart of total purchase per city category; use the category column as
# the x axis so the ticks show the category labels instead of row positions.
dfCity.plot(kind='bar', x='City_Category', y='Purchase')

We found that category B has the highest frequency and therefore the highest total purchase.

## (v) Stay_In_Current_City_Years

In [None]:
# Number of transactions per length of stay in the current city.
pyplot.hist(df["Stay_In_Current_City_Years"])

In [None]:
# Total purchase amount per length of stay in the current city.
stay_purchase = df[['Stay_In_Current_City_Years', 'Purchase']]
dftemp = stay_purchase.groupby('Stay_In_Current_City_Years', as_index=False)
dfStay = dftemp.sum()
dfStay

In [None]:
# Bar chart of total purchase per length of stay; use the stay column as the
# x axis so the ticks show its labels instead of row positions.
dfStay.plot(kind='bar', x='Stay_In_Current_City_Years', y='Purchase')

## (vi) Marital_Status 

In [None]:
# Number of transactions per marital-status value.
pyplot.hist(df["Marital_Status"], bins=20)

In [None]:
# Total purchase amount per marital-status value.
marital_purchase = df[['Marital_Status', 'Purchase']]
dftemp = marital_purchase.groupby('Marital_Status', as_index=False)
dfMarried = dftemp.sum()
dfMarried

It is found that unmarried people account for most of the purchases on Black Friday.

## (vii) Product_Category_1

In [None]:
# Distribution of transactions across Product_Category_1 values.
pyplot.hist(df["Product_Category_1"], bins=20)

## (viii) Product_Category_2 

In [None]:
# Distinct values present in Product_Category_2.
df['Product_Category_2'].unique()

In [None]:
# Distribution of transactions across Product_Category_2 values.
pyplot.hist(df["Product_Category_2"], bins=20)

In [None]:
# Frequency of each Product_Category_2 value, most common first.
df["Product_Category_2"].value_counts()

We found that a few categories have the highest frequencies.

In [None]:
# Number of missing Product_Category_2 entries.
df['Product_Category_2'].isnull().sum()

Now, we will fill the NaN values with the three most frequently occurring values.

In [None]:
# Fill missing Product_Category_2 values with the most frequently occurring
# categories, allocated in row order: the first 70,001 missing rows get 8,
# the next 60,000 get 14 and every remaining missing row gets 2.
#
# This is a vectorised replacement for a row-by-row loop that used chained
# assignment (`df[col][i] = ...`), wrote string values into a numeric column
# and printed one line per filled row.
missing = df['Product_Category_2'].isnull()
# Zero-based position of each missing row among all missing rows.
order = missing.cumsum()[missing] - 1
df.loc[missing, 'Product_Category_2'] = np.select(
    [order <= 70000, order <= 130000],
    [8.0, 14.0],
    default=2.0,
)
# Report how many values were filled.
cnt = int(missing.sum())
print(cnt)
                
            

In [None]:
# Inspect the column after filling.
df['Product_Category_2']

In [None]:
# Count the missing Product_Category_2 entries again after the fill.
df['Product_Category_2'].isnull().sum()

Now there are no NaN values remaining. Changing the data type of the column is necessary for uniform training.

In [None]:
# Convert the category codes to integers. Go through float first so that
# values stored as strings such as '8.0' are parsed instead of raising in
# int(); any remaining NaN still raises here rather than passing silently.
df['Product_Category_2'] = df['Product_Category_2'].astype(float).astype(int)
df['Product_Category_2'].dtype

## (ix) Product_Category_3

In [None]:
# Distinct values present in Product_Category_3.
df['Product_Category_3'].unique()

For this column we will perform steps similar to those performed for the previous column.

In [None]:
# Frequency of each Product_Category_3 value, most common first.
df["Product_Category_3"].value_counts()

In [None]:
# Number of missing Product_Category_3 entries.
df['Product_Category_3'].isnull().sum()

In [None]:
# Fill missing Product_Category_3 values, allocated in row order: the first
# 125,001 missing rows get 16, the next 115,000 get 15, the next 60,000 get
# 14, the next 45,000 get 17 and every remaining missing row gets 5.
#
# This is a vectorised replacement for a row-by-row loop that used chained
# assignment (`df[col][i] = ...`), wrote string values into a numeric column
# and printed one line per filled row.
missing = df['Product_Category_3'].isnull()
# Zero-based position of each missing row among all missing rows.
order = missing.cumsum()[missing] - 1
df.loc[missing, 'Product_Category_3'] = np.select(
    [order <= 125000, order <= 240000, order <= 300000, order <= 345000],
    [16.0, 15.0, 14.0, 17.0],
    default=5.0,
)
# Report how many values were filled.
cnt = int(missing.sum())
print(cnt)

In [None]:
# Count the missing Product_Category_3 entries again after the fill.
df['Product_Category_3'].isnull().sum()

In [None]:
# Convert the category codes to integers. Go through float first so that
# values stored as strings such as '16.0' are parsed instead of raising in
# int(); any remaining NaN still raises here rather than passing silently.
df['Product_Category_3'] = df['Product_Category_3'].astype(float).astype(int)
df['Product_Category_3'].dtype

Now, no nan values remaining.

In [None]:
# Column dtypes and non-null counts after the missing-value handling.
df.info()

## Step 4 : Preparing Data for model

In [None]:
# Work on copies so the cleaned `df` stays intact: `x_data` starts as the
# full frame and `y_data` holds the purchase amount to predict.
x_data = df.copy()
y_data = df.loc[:, 'Purchase'].copy()

In [None]:
# Column dtypes and non-null counts of the feature frame.
x_data.info()

Removing unnecessary columns.

In [None]:
# Remove the target and the two identifier columns from the features.
x_data = x_data.drop(columns=['Purchase', 'User_ID', 'Product_ID'])

Converting category columns to labels is necessary.

In [None]:

from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes. The single encoder is
# refitted for every column in turn.
categorical_column = ['Gender','Age','City_Category','Stay_In_Current_City_Years']
le = LabelEncoder()
for column_name in categorical_column:
    column_codes = le.fit(x_data[column_name]).transform(x_data[column_name])
    x_data[column_name] = column_codes
x_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.20,
    random_state=1,
)

## Step 5 : Trying different regression technique

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit an ordinary least-squares baseline and score it on the held-out rows.
lm = LinearRegression()
lm.fit(x_train, y_train)

test_y_hat = lm.predict(x_test)
residuals = test_y_hat - y_test
print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))

# r2_score expects (y_true, y_pred); the score is not symmetric, so the
# order matters.
print("R2-score: %.2f" % r2_score(y_test, test_y_hat))

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Fit a decision-tree regressor with default settings and score it on the
# held-out rows.
model = DecisionTreeRegressor()
model.fit(x_train, y_train)

test_y_hat = model.predict(x_test)
residuals = test_y_hat - y_test
print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))

# r2_score expects (y_true, y_pred); the score is not symmetric, so the
# order matters.
print("R2-score: %.2f" % r2_score(y_test, test_y_hat))

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# Gradient-boosted trees with a squared-error objective.
# 'reg:squarederror' is the current name of the deprecated 'reg:linear'.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, objective='reg:squarederror')
my_model.fit(x_train, y_train)
predictions = my_model.predict(x_test)

residuals = predictions - y_test
print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))
# For a regressor, score() returns the R^2 coefficient of determination.
print("Accuracy of train dataset is : ",my_model.score(x_train,y_train))
print("Accuracy of test dataset is : ",my_model.score(x_test,y_test))

# r2_score expects (y_true, y_pred); the score is not symmetric, so the
# order matters.
print("R2-score: %.2f" % r2_score(y_test, predictions))

Trying different ways to increase accuracy.

In [None]:
# Reduced feature set: the same features without the stay, marital-status
# and occupation columns. `drop` returns a new frame, so `x_data` is untouched.
x_data_new = x_data.drop(columns=['Stay_In_Current_City_Years', 'Marital_Status', 'Occupation'])

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as before, applied to the reduced feature set.
x_train_n, x_test_n, y_train_n, y_test_n = train_test_split(
    x_data_new,
    y_data,
    test_size=0.20,
    random_state=1,
)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Decision-tree regressor on the reduced feature set.
model = DecisionTreeRegressor()
model.fit(x_train_n, y_train_n)

test_y_hat = model.predict(x_test_n)
residuals = test_y_hat - y_test_n
print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))

# r2_score expects (y_true, y_pred); the score is not symmetric, so the
# order matters.
print("R2-score: %.2f" % r2_score(y_test_n, test_y_hat))

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# Gradient-boosted trees on the reduced feature set.
# 'reg:squarederror' is the current name of the deprecated 'reg:linear'.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, objective='reg:squarederror')
my_model.fit(x_train_n, y_train_n)
predictions = my_model.predict(x_test_n)

residuals = predictions - y_test_n
print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))
# For a regressor, score() returns the R^2 coefficient of determination.
print("Accuracy of train dataset is : ",my_model.score(x_train_n,y_train_n))
print("Accuracy of test dataset is : ",my_model.score(x_test_n,y_test_n))

# r2_score expects (y_true, y_pred); the score is not symmetric, so the
# order matters.
print("R2-score: %.2f" % r2_score(y_test_n, predictions))

## Step 6 : Deriving result for test dataset.csv 

In [None]:
# The final model is trained on every labelled row, so take fresh copies of
# the complete feature frame and target.
x_train_final, y_train_final = x_data.copy(), y_data.copy()

In [None]:
# Load the test file whose purchase amounts are to be predicted.
test_csv_path = '../input/black-friday/test.csv'
x_test_final = pd.read_csv(test_csv_path)
x_test_final.head()

In [None]:
# Column dtypes and non-null counts of the test frame.
x_test_final.info()

We need to deal with the missing values in the test DataFrame.

In [None]:
# Frequency of each Product_Category_2 value in the test frame.
x_test_final["Product_Category_2"].value_counts()

In [None]:
# Number of missing Product_Category_2 entries in the test frame.
x_test_final["Product_Category_2"].isnull().sum()

In [None]:
# Fill missing Product_Category_2 values in the test frame with the most
# frequently occurring categories, allocated in row order: the first 35,001
# missing rows get 8, the next 25,000 get 14 and every remaining missing row
# gets 2.
#
# This is a vectorised replacement for a row-by-row loop that used chained
# assignment (`frame[col][i] = ...`), wrote string values into a numeric
# column and printed one line per filled row.
missing = x_test_final['Product_Category_2'].isnull()
# Zero-based position of each missing row among all missing rows.
order = missing.cumsum()[missing] - 1
x_test_final.loc[missing, 'Product_Category_2'] = np.select(
    [order <= 35000, order <= 60000],
    [8.0, 14.0],
    default=2.0,
)
# Report how many values were filled.
cnt = int(missing.sum())
print(cnt)
                

In [None]:
# Convert the category codes to integers. Go through float first so that
# values stored as strings such as '8.0' are parsed instead of raising in
# int(); any remaining NaN still raises here rather than passing silently.
x_test_final['Product_Category_2'] = x_test_final['Product_Category_2'].astype(float).astype(int)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the test categoricals with the mapping learned from the training
# data, so a given label always receives the same integer code the model was
# trained on. `df_backup` still holds the raw training values. A label that
# never occurs in training makes `transform` raise instead of being silently
# mapped to an unrelated code.
categorical_column = ['Gender','Age','City_Category','Stay_In_Current_City_Years']
le = LabelEncoder()
for i in categorical_column:
    le.fit(df_backup[i])
    x_test_final[i] = le.transform(x_test_final[i])
x_test_final.head()

In [None]:
# Remove the two identifier columns from the test features.
x_test_final = x_test_final.drop(columns=['User_ID', 'Product_ID'])

Due to lots of missing values, I decided to drop this column.

In [None]:
# Drop Product_Category_3 from both frames so that the training and test
# features keep the same columns.
x_train_final = x_train_final.drop(columns='Product_Category_3')
x_test_final = x_test_final.drop(columns='Product_Category_3')
x_test_final.info()

In [None]:
# Column dtypes and non-null counts of the final training features.
x_train_final.info()

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

# Train the final booster on all labelled rows and predict the test rows.
xgb_params = {'n_estimators': 1000, 'learning_rate': 0.05}
my_model = XGBRegressor(**xgb_params)
my_model.fit(x_train_final, y_train_final)
predictions = my_model.predict(x_test_final)
print(predictions)

Preparing Dataframe for submission

In [None]:
# Re-read only the identifier columns of the test file; they form the key of
# each submission row.
col_list = ['User_ID', 'Product_ID']
submission_source = '../input/black-friday/test.csv'
df_submission = pd.read_csv(submission_source, usecols=col_list)
df_submission.head()

In [None]:
# Attach the predicted purchase amount to each identifier row.
df_submission = df_submission.assign(Purchase=predictions)
df_submission.head()

In [None]:
# Move 'Purchase' into the index.
# NOTE(review): if the frame is written with its index included, 'Purchase'
# becomes the first CSV column - confirm this matches the expected
# submission layout.
df_submission = df_submission.set_index('Purchase')
df_submission.head()

## Saving our dataframe to file

In [None]:
# Write the submission frame (index included) to the working directory.
submission_path = "submission.csv"
df_submission.to_csv(submission_path)

# END