In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# The Kaggle dataset ships pre-split into a train file and a test file.
train_path = "../input/black-friday/train.csv"
test_path = "../input/black-friday/test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
# Show the (rows, columns) shape of the train and test frames side by side.
train.shape,test.shape

In [None]:
# Identifier columns and the prediction target; none of these are model features.
User_id = 'User_ID'
Product_ID = 'Product_ID'
TARGET_COL = 'Purchase'
_non_feature_cols = {User_id, Product_ID, TARGET_COL}

# Every remaining training column is a candidate feature (original column order kept).
features = [col for col in train.columns if col not in _non_feature_cols]

# Columns treated as categorical in this notebook.
cat_cols = [
    'Gender',
    'Age',
    'City_Category',
    'Stay_In_Current_City_Years',
]

# Whatever is not listed as categorical is treated as numerical.
num_cols = [col for col in features if col not in cat_cols]

In [None]:
# Print the training frame's column names, non-null counts and dtypes.
train.info()

In [None]:
# Print the test frame's column names, non-null counts and dtypes.
test.info()

### Purchase is our target variable

###  Exploratory Data Analysis (EDA)

#### Univariate Analysis

In [None]:
# Distribution of the target variable: Purchase.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the equivalent histogram-plus-density figure.
sns.histplot(train["Purchase"], bins=25, kde=True, stat="density")

It seems like our target variable has an almost Gaussian distribution.

In [None]:
# Shape statistics of the target: skewness and kurtosis of Purchase.
purchase_skew = train.Purchase.skew()
purchase_kurt = train.Purchase.kurt()
print(f"Skew is: {purchase_skew}")
print(f"Kurtosis: {purchase_kurt:f}")

### Numerical Predictors

In [None]:
# Keep only the numeric-dtype columns of the training frame and list their dtypes.
numeric_features = train.select_dtypes(include="number")
numeric_features.dtypes

In [None]:
# Distribution of the variable Occupation.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn and is bound to `data` rather than `x` in newer releases.
sns.countplot(x="Occupation", data=train)

##### As seen in the beginning, Occupation has at least 20 different values. Since we do not know which occupation each number corresponds to, it is difficult to make any analysis. Furthermore, it seems we have no alternative but to use it as is, since there is no way to reduce the number of levels as we did in the Bigmart project with Item_Type.

In [None]:
# Distribution of the variable Marital_Status.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn and is bound to `data` rather than `x` in newer releases.
sns.countplot(x="Marital_Status", data=train)

##### As expected there are more single people buying products on Black Friday than married people, but do they spend more?

In [None]:
# Distribution of the variable Product_Category_1.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn and is bound to `data` rather than `x` in newer releases.
sns.countplot(x="Product_Category_1", data=train)

#### From the distribution for products from category one, it is clear that three products stand out, number 1, 5 and 8. Unfortunately, we do not know which product each number represents.

In [None]:
# Distribution of the variable Product_Category_2.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn and is bound to `data` rather than `x` in newer releases.
sns.countplot(x="Product_Category_2", data=train)

In [None]:
# Distribution of the variable Product_Category_3.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn and is bound to `data` rather than `x` in newer releases.
sns.countplot(x="Product_Category_3", data=train)

### Correlation between Numerical Predictors and Target variable

In [None]:
# Pairwise correlation matrix of the numeric columns, drawn as an annotated heatmap.
corr = numeric_features.corr()
f, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(
    corr,
    vmax=.8,
    annot=True,
    annot_kws={'size': 20},
    ax=ax,  # the axes created just above (the current axes the original drew on implicitly)
);

##### There seems to be no strong multicollinearity among our predictors, which is a good thing, although there is some correlation among the product categories. Are categories 2 and 3 necessary? Can we dispose of them?

### Categorical Predictors

In [None]:
# Select the object-dtype (string) columns of the training frame.
# `np.object` was removed in NumPy 1.24; it was only an alias for the builtin
# `object`, which selects exactly the same columns on every NumPy version.
cat_features = train.select_dtypes(include=[object])
cat_features.dtypes

In [None]:
# Distribution of the variable Gender.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn and is bound to `data` rather than `x` in newer releases.
sns.countplot(x="Gender", data=train)

#### Most of the buyers are male, but who spends more on each purchase: men or women?

In [None]:
# Distribution of the variable Age.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn and is bound to `data` rather than `x` in newer releases.
sns.countplot(x="Age", data=train)

#### As expected, most purchases are made by people between 18 to 45 years old.

In [None]:
# Distribution of the variable City_Category.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn and is bound to `data` rather than `x` in newer releases.
sns.countplot(x="City_Category", data=train)

##### Supposing ‘A’ represents the biggest city and ‘C’ the smallest, it is curious to see that the medium-sized cities ‘B’ had higher sales than the others. But do they also spend more?

In [None]:
# Distribution of the variable Stay_In_Current_City_Years.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn and is bound to `data` rather than `x` in newer releases.
sns.countplot(x="Stay_In_Current_City_Years", data=train)

## Bivariate Analysis

####  Numerical Variables

In [None]:
# Mean Purchase per Occupation code, shown as a bar chart.
Occupation_pivot = train.pivot_table(
    index='Occupation',
    values="Purchase",
    aggfunc="mean",
)
occupation_ax = Occupation_pivot.plot(kind='bar', color='blue', figsize=(10, 5))
occupation_ax.set_xlabel("Occupation")
occupation_ax.set_ylabel("Purchase")
occupation_ax.set_title("Occupation and Purchase Analysis")

In [None]:
# Marital_Status and Purchase analysis: mean Purchase per marital status.
# The variable name `martial_pivot` is kept as-is so other cells that may
# reference it keep working; only the misspelled plot labels are corrected.
martial_pivot = train.pivot_table(index='Marital_Status', values="Purchase", aggfunc="mean")
martial_pivot.plot(kind='bar', color='blue',figsize=(10,5))
plt.xlabel("Marital Status")
plt.ylabel("Purchase")
plt.title("Marital Status and Purchase Analysis")

In [None]:
# Mean Purchase per Product_Category_1 code, shown as a bar chart.
Product_category_1_pivot = train.pivot_table(
    index='Product_Category_1',
    values="Purchase",
    aggfunc="mean",
)
product_cat1_ax = Product_category_1_pivot.plot(kind='bar', color='blue', figsize=(10, 5))
product_cat1_ax.set_xlabel("Product_Category_1")
product_cat1_ax.set_ylabel("Purchase")
product_cat1_ax.set_title("Product_Category_1 and Purchase Analysis")

###  Categorical Variables

In [None]:
# Mean Purchase per Gender, shown as a bar chart.
gender_pivot = train.pivot_table(
    index='Gender',
    values="Purchase",
    aggfunc="mean",
)
gender_ax = gender_pivot.plot(kind='bar', color='blue', figsize=(10, 5))
gender_ax.set_xlabel("Gender")
gender_ax.set_ylabel("Purchase")
gender_ax.set_title("Gender and Purchase Analysis")

In [None]:
# Age and Purchase analysis: mean Purchase per age bracket.
# The x-label and title previously said "Gender" (copy-paste slip); they now
# describe the Age variable that is actually plotted.
age_pivot = train.pivot_table(index='Age', values="Purchase", aggfunc="mean")
age_pivot.plot(kind='bar', color='blue',figsize=(10,5))
plt.xlabel("Age")
plt.ylabel("Purchase")
plt.title("Age and Purchase Analysis")

In [None]:
# City_Category and Purchase analysis: mean Purchase per city category.
# The title previously said "Gender" (copy-paste slip); it now names the
# City_Category variable that is actually plotted.
City_Category_pivot = train.pivot_table(index='City_Category', values="Purchase", aggfunc="mean")
City_Category_pivot.plot(kind='bar', color='blue',figsize=(10,5))
plt.xlabel("City_Category")
plt.ylabel("Purchase")
plt.title("City_Category and Purchase Analysis")

In [None]:
# Mean Purchase per Stay_In_Current_City_Years level, shown as a bar chart.
Stay_In_Current_City_Years_pivot = train.pivot_table(
    index='Stay_In_Current_City_Years',
    values="Purchase",
    aggfunc="mean",
)
stay_ax = Stay_In_Current_City_Years_pivot.plot(kind='bar', color='blue', figsize=(10, 5))
stay_ax.set_xlabel("Stay_In_Current_City_Years")
stay_ax.set_ylabel("Purchase")
stay_ax.set_title("Stay_In_Current_City_Years and Purchase Analysis")

In [None]:
# Re-print the training frame's columns, non-null counts and dtypes before pre-processing.
train.info()

###  Data Pre-Processing

In [None]:
# Tag each frame with its origin, then stack them so pre-processing is applied once.
for frame, label in ((train, 'train'), (test, 'test')):
    frame['source'] = label

data = pd.concat(
    [train, test],
    ignore_index=True,
    sort=False,
)
print(train.shape, test.shape, data.shape)

#### Looking for missing values

In [None]:
# Percentage of missing values in each column of the combined frame.
data.isna().sum() / len(data) * 100

#### The only predictors having missing values are Product_Category_2 and Product_Category_3. We can either try to impute the missing values or drop these predictors. We can test both approaches to see which returns the best results.

In [None]:
# Fill the missing Product_Category_2 / Product_Category_3 values with the
# column median. The median is taken from the TRAINING rows only, so that no
# statistic derived from the test set leaks into the features.
is_train_row = data['source'] == 'train'
for col in ('Product_Category_2', 'Product_Category_3'):
    train_median = data.loc[is_train_row, col].median()
    data[col] = data[col].fillna(train_median)

In [None]:
# Percentage of missing values per column, re-checked after imputation.
data.isna().sum() / len(data) * 100

#### Dealing With Categorical Variable

In [None]:
# Preview the object-dtype (string) columns of the combined frame.
# `np.object` was removed in NumPy 1.24; it was only an alias for the builtin
# `object`, which selects exactly the same columns on every NumPy version.
cat_features = data.select_dtypes(include=[object])
cat_features.head()

In [None]:
# Encode Gender as a binary flag: "M" -> 0, "F" -> 1.
gender_codes = {"M": 0, "F": 1}
data['Gender'] = data['Gender'].replace(gender_codes)

In [None]:
# Encode City_Category as integers: "A" -> 0, "B" -> 1, "C" -> 2.
city_codes = {"A": 0, "B": 1, "C": 2}
data['City_Category'] = data['City_Category'].replace(city_codes)

In [None]:
# Encode Stay_In_Current_City_Years as integers; the open-ended "4+" bucket becomes 4.
stay_codes = {"0": 0, "1": 1, "2": 2, "3": 3, "4+": 4}
data['Stay_In_Current_City_Years'] = data['Stay_In_Current_City_Years'].replace(stay_codes)

In [None]:
# Count how many rows fall into each Age bracket before encoding it.
data['Age'].value_counts()

In [None]:
# Encode the Age brackets as ordinal integers, youngest (0) to oldest (6).
age_codes = {
    "0-17": 0,
    "18-25": 1,
    "26-35": 2,
    "36-45": 3,
    "46-50": 4,
    "51-55": 5,
    "55+": 6,
}
data['Age'] = data['Age'].replace(age_codes)

In [None]:
# Print the combined frame's columns, non-null counts and dtypes after encoding.
data.info()

In [None]:
# Remove the two identifier columns from the combined frame.
data = data.drop(columns=['User_ID', 'Product_ID'])

### Exporting Data

In [None]:
# Split the combined frame back into train and test using the 'source' tag,
# then discard the tag. A non-inplace drop returns a fresh frame; the previous
# `drop(..., inplace=True)` on a `.loc` slice triggered SettingWithCopyWarning.
train = data.loc[data['source'] == 'train'].drop(columns=['source'])
test = data.loc[data['source'] == 'test'].drop(columns=['source'])

### model build

In [None]:
# Work on independent copies so the modelling steps do not touch train/test.
train_df, test_df = train.copy(), test.copy()

In [None]:
# Print the modelling copy of the training frame: columns, non-null counts, dtypes.
train_df.info()

In [None]:
# Remove the Purchase column from the test copy.
test_df = test_df.drop(columns=['Purchase'])

In [None]:
# Print the training copy's columns, non-null counts and dtypes again for comparison with the test copy.
train_df.info()

In [None]:
# Print the test copy's columns, non-null counts and dtypes after Purchase was dropped.
test_df.info()

In [None]:
# Separate the target (Purchase) from the predictors of the training copy.
X, y = train_df.drop(columns=['Purchase']), train_df['Purchase']
# Test predictors; note this is lowercase `x_test`, a different name from `X_test`.
x_test = test_df

# Report the shape of each newly formed data set.
for label, frame in (("x", X), ("y", y), ("test data", x_test)):
    print(f"Shape of the {label} : {frame.shape}")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Report the shapes of the split pieces and of the test predictors.
shape_report = (
    ("X Train", X_train),
    ("y Train", y_train),
    ("X test", X_test),
    ("y test", y_test),
    ("test data", x_test),
)
for label, part in shape_report:
    print(f"Shape of the {label} : {part.shape}")

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to the training split, the hold-out split and the test predictors.
sc = StandardScaler().fit(X_train)
X_train, X_test, x_test = (sc.transform(part) for part in (X_train, X_test, x_test))

### Machine Learning Regressive Modelling

In [None]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from math import sqrt

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares baseline and score it on the hold-out split.
lr = LinearRegression(n_jobs=10).fit(X_train, y_train)

# The "_rfr" suffix is the original variable name and is kept unchanged;
# these are the linear-regression predictions.
y_test_pred_rfr = lr.predict(X_test)

lr_mse = mean_squared_error(y_test, y_test_pred_rfr)
print("RMSE : ", np.sqrt(lr_mse))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the bootstrap sampling, and therefore the reported
# metrics, reproducible from run to run (same seed as the train/test split).
rfr = RandomForestRegressor(max_depth=8, min_samples_leaf=150, random_state=42)
rfr.fit(X_train,y_train)

y_pred_rfr = rfr.predict(X_test)

# A regressor's .score() is the coefficient of determination (R^2), not a
# classification accuracy, so it is labelled as such.
print("Training R^2 :", rfr.score(X_train, y_train))
print("Testing R^2 :", rfr.score(X_test, y_test))

rms_rf = sqrt(mean_squared_error(y_test, y_pred_rfr))
print("The Rmse value For RandomForest is ",rms_rf)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state makes tie-breaking between equally good splits
# deterministic, so the fitted tree and its metrics are reproducible.
dt = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100, random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

# A regressor's .score() is the coefficient of determination (R^2), not a
# classification accuracy, so it is labelled as such.
print("Training R^2 :", dt.score(X_train, y_train))
print("Testing R^2 :", dt.score(X_test, y_test))

rms_dt = sqrt(mean_squared_error(y_test, y_pred_dt))
print("The Rmse value For Decision Tree is ",rms_dt)

