In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# Black-Friday-Regression-Analysis
Predicting Prices for the products to be sold on Black Friday in US using Regression Analysis, Feature Engineering, Feature Selection, Feature Extraction and Data analysis - Data Visualizations.

Description: The dataset here is a sample of the transactions made in a retail store. The store wants to better understand customer purchase behaviour across different products. Specifically, this is a regression problem in which we try to predict the dependent variable (the amount of purchase) with the help of the information contained in the other variables.

Classification problems can also be posed on this dataset, since several variables are categorical; other approaches could be "Predicting the age of the consumer" or even "Predicting the category of goods bought". This dataset is also particularly convenient for clustering, for example to find different clusters of consumers within it.

# Making Assumptions -
Lets think what factor can affect the purchase of product based on different segment. It could be so many things. So I am segregating this into different groups -

## 1 - City Level Hypotheses:
City Type and Size : Urban or Tier 1 cities should have higher sales because of the higher income levels of people there.

Population Density: Cities with densely populated areas should have higher sales because of more demand.

Younger Population : Cities with younger populations might have higher tendency to spend more on Black Friday
## 2 -Customer Level Hypotheses:
Income: People with higher income should spend more on products.

Age and Gender: Men with ages ranging from 25 to 40 should spend more on technological products.

Family Size: Families should be more contained on spendings, just buying the best offers and only needed products.

Purchase History: Customer with a purchase history should be more willing to purchase more products on this day.
## 3 - Store Level Hypotheses:
Location: Stores with a location in well moved streets should have better sales.

Size: Bigger stores with larger stock and a wider variety of products should have better sales.

Competition: Stores with no competitors near by must have the highest sales.

Marketing: Stores which spend more on marketing should have the best sales results.
## 4 - Product Level Hypotheses:
Category: Most clients should be looking to buy technological products;

Price: Customer will spend more on products with higher discounts

Advertising: More advertised products should sell more

Visibility: More visible products should sell more

Brand: Clients will invest more on already known brands
## Moreover, other questions may be interesting to follow up:

Which type of client spends more?

Which product category and store had the highest sales?

What products do families and single people usually buy?

According to age and sex what are the most bought products?

# Analysis step
Trying to identify the most important variables and defining the best regression model for predicting target variable.

Hence, this analysis will be divided into seven stages:

Exploratory data analysis (EDA);

Data Pre-processing;

Feature engineering;

Feature Transformation;

Modeling;

Hyperparameter tuning

Ensembling.

In [None]:
import os
print(os.listdir("../input"))

import pandas as pd

train = pd.read_csv('../input/black-friday/train.csv')


In [None]:
train

In [None]:
import os
print(os.listdir("../input"))

import pandas as pd

test = pd.read_csv('../input/black-friday/test.csv')

In [None]:
test

In [None]:
# Importing another libraries

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Understanding the Data

In [None]:
train.head()

# Analysis of the Data
If we analyse it individually we see that we do not have any information regarding the stores. Moreover, there is some information related to the customer such as age group, sex, occupation and marital status. On the other hand, we have data on the city’s size and how many years the customer has lived in it whereas on the product’s side there is only information regarding the categories and the amount spent. It is my belief that Gender , Age , City_Category , Product_Category_1 are the predictors that will influence more the amount spent by a customer on this day.

# Explanation of feature
Age : should be treated as numerical. It presents age groups.

City_Category: We can convert this to numerical as well, with dummy variables. Should take a look at the frequency of the values

Occupation : It seems like it has at least 16 different values, should see frequency and try to decrease this value.

Gender: There are possibly two gender, we can make this binary.

Product_ID: Should see if the string “P” means something and if there are other values.

Stay_In_Current_City_Years: We should deal with the ‘+’ symbol.

Product_Category_2 and Product_Category_3 : Have NaN values.

In [None]:
train.info()

# This data has 12 columns in total, including the DV (Purchase); Product_Category_2 and Product_Category_3 contain null values.
# 5 columns hold string data, which we need to encode.

In [None]:
train.describe()

In [None]:
train.columns

In [None]:
train.dtypes

# Check for duplicates

In [None]:
idsUnique = len(set(train.User_ID))
idsTotal = train.shape[0]
idsDupli = idsTotal - idsUnique
print("There are " + str(idsDupli) + " duplicate IDs for " + str(idsTotal) + " total entries")

# Checking the Null Values

In [None]:
# As know from information already that in product category 2, 3 having huge null values
train.isnull().sum()

In [None]:
# Checking proportion of female buyer and male buyer
train['Gender'].value_counts()

In [None]:
# Checking proportion customer based on their marital status.
train['Marital_Status'].value_counts()

In [None]:
# Checking the maximum purchase amount
train['Purchase'].max()

In [None]:
# Checking minimum range of purchase
train['Purchase'].min()

# 1 - Exploratory Data Analysis
 ## Distribution of the target variable: Purchase

In [None]:
# Distribution of the target variable: histogram of Purchase with a KDE overlay.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(12,7))
sns.distplot(train.Purchase, bins = 25)
plt.xlabel("Amount spent in Purchase")
# distplot normalises the histogram to a density when the KDE is drawn (its
# default), so the y-axis is a density rather than a count of buyers.
plt.ylabel("Density")
plt.title("Purchase amount Distribution")

### It seems like our target variable has an almost Gaussian (normal) distribution.
# Finding Skewness and Kurtosis

In [None]:
print ("Skew is:", train.Purchase.skew())
print("Kurtosis: %f" % train.Purchase.kurt())

Now that we’ve analysed our target variable, let’s consider our predictors(IV). Let’s start by seeing which of our features are numeric.

In [None]:
numeric_features = train.select_dtypes(include=[np.number])
numeric_features.dtypes

# Distribution of the variable Occupation
As seen in the beginning, Occupation has at least 20 different values. Since we do not know which occupation each number corresponds to, it is difficult to make any analysis.

In [None]:
# Pass the column as `x` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, which would not give one bar per category.
sns.countplot(x=train.Occupation)

# Distribution of the variable Marital_Status
As expected there are more single people buying products on Black Friday than married people, but do they spend more?

In [None]:
# Pass the column as `x` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, which would not give one bar per category.
sns.countplot(x=train.Marital_Status)

# Distribution of the variable Product_Category_1
From the distribution for products from category one, it is clear that three products stand out, number 1, 5 and 8. Unfortunately, we do not know which product each number represents.

In [None]:
# Pass the column as `x` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, which would not give one bar per category.
sns.countplot(x=train.Product_Category_1)
plt.xticks()

# Distribution of the variable Product_Category_2

In [None]:
# Pass the column as `x` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, which would not give one bar per category.
sns.countplot(x=train.Product_Category_2)
# Rotate the tick labels so the float category codes do not overlap.
plt.xticks(rotation=90)

# Distribution of the variable Product_Category_3

In [None]:
# Pass the column as `x` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, which would not give one bar per category.
sns.countplot(x=train.Product_Category_3)
# Rotate the tick labels so the float category codes do not overlap.
plt.xticks(rotation=90)

# Correlation between Numerical Predictor( IV) and Target variable(DV)

In [None]:
corr = numeric_features.corr()

print (corr['Purchase'].sort_values(ascending=False)[:10], '\n')
print (corr['Purchase'].sort_values(ascending=False)[-10:])

#### There does not seem to be any predictor that would have a high impact on Purchase, since the highest correlation is given by Occupation with 0.0208. On the other hand, Product_Category_1 has a negative correlation with our target, with the value -0.3437, which is somewhat odd.

In [None]:
#correlation matrix
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corr,  annot=True,annot_kws={'size': 15})

##### There seems to be no multicollinearity with our predictors which is a good thing, although there is some correlation among the product categories

In [None]:
s = corr.unstack()
s

# Categorical Predictors
## Distribution of the variable Gender
Most of the buyers are males, but who spends more on each purchase: man or woman?

In [None]:
# Pass the column as `x` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, which would not give one bar per category.
sns.countplot(x=train.Gender)

# Distribution of the variable Age

In [None]:
# Pass the column as `x` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, which would not give one bar per category.
sns.countplot(x=train.Age)

# As expected, most purchases are made by people between 18 to 45 years old.

# Distribution of the variable City_Category

In [None]:
# Pass the column as `x` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, which would not give one bar per category.
sns.countplot(x=train.City_Category)

## Distribution of the variable Stay_In_Current_City_Years
The tendency looks like the longer someone has lived in a city, the less prone they are to buy new things. Hence, if someone is new in town and needs a great number of new things for their house, they’ll take advantage of the low Black Friday prices to purchase all the things needed.

In [None]:
# Pass the column as `x` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, which would not give one bar per category.
sns.countplot(x=train.Stay_In_Current_City_Years)

# Bivariate Analysis
Now it is time to understand the relationship between our target variable and the predictors, as well as the relationships among predictors.
## 1 -Occupation and Purchase analysis
Although there are some occupations which have higher representations, it seems that the amount each user spends on average is more or less the same for all occupations. Of course, in the end, occupations with the highest representations will have the highest amounts of purchases.

In [None]:
# Mean purchase amount per occupation code.
# The string alias "mean" is used because passing NumPy callables such as
# np.mean as aggfunc is deprecated in pandas >= 2.1; the result is the same.
Occupation_pivot = train.pivot_table(index='Occupation', values="Purchase", aggfunc="mean")


In [None]:
Occupation_pivot

In [None]:
Occupation_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Occupation")
plt.ylabel("Purchase")
plt.title("Occupation and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

# 2 - Marital_Status and Purchase Analysis
We had more single customers than married ones. However, on average an individual customer tends to spend the same amount regardless of whether he/she is married or not.

In [None]:
# Mean purchase amount per marital status.
# The string alias "mean" is used because passing NumPy callables such as
# np.mean as aggfunc is deprecated in pandas >= 2.1; the result is the same.
Marital_Status_pivot = train.pivot_table(index='Marital_Status', values="Purchase", aggfunc="mean")

In [None]:
Marital_Status_pivot

In [None]:
Marital_Status_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Marital_Status")
plt.ylabel("Purchase")
plt.title("Marital_Status and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

# 3 - Product_Category_1 and Purchase analysis
If you see the value spent on average for Product_Category_1 you see that although there were more products bought for categories 1,5,8 the average amount spent for those three is not the highest. It is interesting to see other categories appearing with high purchase values despite having low impact on sales number.

In [None]:
# Mean purchase amount per Product_Category_1 value.
# The string alias "mean" is used because passing NumPy callables such as
# np.mean as aggfunc is deprecated in pandas >= 2.1; the result is the same.
Product_category_1_pivot = train.pivot_table(index='Product_Category_1', values="Purchase", aggfunc="mean")

In [None]:
Product_category_1_pivot

In [None]:
Product_category_1_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Product_Category_1")
plt.ylabel("Purchase")
plt.title("Product_Category_1 and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Total purchase amount per Product_Category_1 value.
# The string alias "sum" is used because passing NumPy callables such as
# np.sum as aggfunc is deprecated in pandas >= 2.1; the result is the same.
Product_category_1_pivot = train.pivot_table(index='Product_Category_1', values="Purchase", aggfunc="sum")
Product_category_1_pivot

In [None]:
Product_category_1_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Product_Category_1")
plt.ylabel("Purchase")
plt.title("Product_Category_1 and Purchase Analysis " "SUM")
plt.xticks(rotation=0)
plt.show()

# 4 - Product_Category_2 and Purchase Analysis

In [None]:
# Mean purchase amount per Product_Category_2 value.
# The string alias "mean" is used because passing NumPy callables such as
# np.mean as aggfunc is deprecated in pandas >= 2.1; the result is the same.
Product_category_2_pivot = train.pivot_table(index='Product_Category_2', values="Purchase", aggfunc="mean")

In [None]:
Product_category_2_pivot

In [None]:
Product_category_2_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Product_Category_2")
plt.ylabel("Purchase")
plt.title("Product_Category_2 and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Total purchase amount per Product_Category_2 value.
# The string alias "sum" is used because passing NumPy callables such as
# np.sum as aggfunc is deprecated in pandas >= 2.1; the result is the same.
Product_category_2_pivot = train.pivot_table(index='Product_Category_2', values="Purchase", aggfunc="sum")

In [None]:
Product_category_2_pivot

In [None]:
Product_category_2_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Product_Category_2")
plt.ylabel("Purchase")
plt.title("Product_Category_2 and Purchase Analysis "  "SUM")
plt.xticks(rotation=0)
plt.show()

# 5 - Product_Category_3 and Purchase Analysis

In [None]:
# Mean purchase amount per Product_Category_3 value.
# The string alias "mean" is used because passing NumPy callables such as
# np.mean as aggfunc is deprecated in pandas >= 2.1; the result is the same.
Product_category_3_pivot = train.pivot_table(index='Product_Category_3', values="Purchase", aggfunc="mean")
Product_category_3_pivot

In [None]:
Product_category_3_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Product_Category_3")
plt.ylabel("Purchase")
plt.title("Product_Category_3 and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Total purchase amount per Product_Category_3 value.
# The string alias "sum" is used because passing NumPy callables such as
# np.sum as aggfunc is deprecated in pandas >= 2.1; the result is the same.
Product_category_3_pivot = train.pivot_table(index='Product_Category_3', values="Purchase", aggfunc="sum")
Product_category_3_pivot

In [None]:
Product_category_3_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Product_Category_3")
plt.ylabel("Purchase")
plt.title("Product_Category_3 and Purchase Analysis " "SUM")
plt.xticks(rotation=0)
plt.show()

# BIVARIATE ANALYSIS based on Categorical Variables
## 6 -Gender and Purchase analysis
##### On average the male gender spends more money on purchase contrary to female, and it is possible to also observe this trend by adding the total value of purchase. This last conclusion is more reasonable since the percentage of male buyers is higher than female buyers.

In [None]:
# Mean purchase amount per gender.
# The string alias "mean" is used because passing NumPy callables such as
# np.mean as aggfunc is deprecated in pandas >= 2.1; the result is the same.
gender_pivot = train.pivot_table(index='Gender', values="Purchase", aggfunc="mean")
gender_pivot


In [None]:
gender_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Gender")
plt.ylabel("Purchase")
plt.title("Gender and Purchase Analysis " "AVERAGE")
plt.xticks(rotation=0)
plt.show()

# 7 -Age and Purchase Analysis
Total amount spent in purchase is in accordance with the number of purchases made, distributed by age.

In [None]:
# Total purchase amount per age group.
# The string alias "sum" is used because passing NumPy callables such as
# np.sum as aggfunc is deprecated in pandas >= 2.1; the result is the same.
age_pivot = train.pivot_table(index='Age', values="Purchase", aggfunc="sum")
age_pivot


In [None]:
age_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Age")
plt.ylabel("Purchase")
plt.title("Age and Purchase Analysis " "SUM")
plt.xticks(rotation=0)
plt.show()

# 8 - City_Category and Purchase analysis
We saw previously that city type ‘B’ had the highest number of purchases registered. However, the city whose buyers spend the most is city type ‘C’.

In [None]:
# Mean purchase amount per city category.
# The string alias "mean" is used because passing NumPy callables such as
# np.mean as aggfunc is deprecated in pandas >= 2.1; the result is the same.
city_pivot = train.pivot_table(index='City_Category', values="Purchase", aggfunc="mean")
city_pivot


In [None]:
city_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("City_Category")
plt.ylabel("Purchase")
plt.title("City_Category and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

# 9 - Stay_In_Current_City_Years and Purchase analysis
Again, we see the same pattern as before, which shows that on average people tend to spend the same amount on purchases regardless of their group. People who are new in the city are responsible for the higher number of purchases; however, looking at it individually, they tend to spend the same amount independently of how many years they have lived in their current city.

In [None]:
# Mean purchase amount per number of years lived in the current city.
# The string alias "mean" is used because passing NumPy callables such as
# np.mean as aggfunc is deprecated in pandas >= 2.1; the result is the same.
Stay_In_Current_City_Years_pivot = train.pivot_table(index='Stay_In_Current_City_Years', values="Purchase", aggfunc="mean")
Stay_In_Current_City_Years_pivot


In [None]:
Stay_In_Current_City_Years_pivot.plot(kind='bar', color='blue',figsize=(12,7))
plt.xlabel("Stay_in_Current_City_Years")
plt.ylabel("Purchase")
plt.title("Stay_in_Current_City_Years and Purchase Analysis")
plt.xticks(rotation=0)
plt.show()

# 2 - Data Pre-Processing
Usually, datasets for challenges such as those presented on Analytics Vidhya or Kaggle come separated as a train.csv and a test.csv. It is generally a good idea to combine both sets into one, in order to perform data cleaning and feature engineering, and later divide them again. With this step we do not have to go through the trouble of repeating the same code twice, once for each dataset. Let’s combine them into a dataframe `data` with a `source` column specifying where each observation belongs.

In [None]:
# Join Train and Test Dataset
train['source']='train'
test['source']='test'

data = pd.concat([train,test], ignore_index = True, sort = False)

print(train.shape, test.shape, data.shape)

In [None]:
#Check the percentage of null values per variable

# 31% approx in product cat 2 and 69% approx in product cat 3 data is missing

data.isnull().sum()/data.shape[0]*100

# Imputing missing values with the placeholder -2

In [None]:
data["Product_Category_2"]= data["Product_Category_2"].fillna(-2.0).astype("float")

data.Product_Category_2.value_counts().sort_index()

In [None]:
data.isnull().sum()

In [None]:
data.Product_Category_3.value_counts().sort_index()

In [None]:
data["Product_Category_3"]= \
data["Product_Category_3"].fillna(-2.0).astype("float")

In [None]:
data.Product_Category_3.value_counts().sort_index()

In [None]:
data.isnull().sum()

In [None]:
# Removing Product_Category_1 group 19 and 20 from Train as this is not in Product_Category_2 and 3

In [None]:
#Index labels of the training rows whose Product_Category_1 is 19 or 20
in_train = data.source == "train"
is_cat_19_or_20 = data.Product_Category_1.isin([19,20])
condition = data.index[is_cat_19_or_20 & in_train]

#Remove those rows; test rows are left untouched
data = data.drop(condition)

In [None]:

data.shape

In [None]:
# Categorical Values

In [None]:
#Apply function len(unique()) to every data variable

data.apply(lambda x: len(x.unique()))

In [None]:
# Frequency Analysis

In [None]:
#Names of all string-typed (object) columns, excluding the train/test 'source' marker
object_cols = data.select_dtypes(include=['object'])
category_cols = object_cols.columns.drop(["source"])

#Print how often each distinct value occurs in every categorical column
for col in category_cols:
    print("\nThis is the frequency distribution for " + col + ":")
    frequency = data[col].value_counts()
    print(frequency)

# 3. Feature Engineering
## 1. Converting gender to binary

In [None]:
#Turn gender binary
gender_dict = {'F':0, 'M':1}
data["Gender"] = data["Gender"].apply(lambda line: gender_dict[line])

data["Gender"].value_counts()

# 2. Converting Age to numeric values

In [None]:
# Giving Age Numerical values
age_dict = {'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6}
data["Age"] = data["Age"].apply(lambda line: age_dict[line])

data["Age"].value_counts()

# 3. Converting city_category to numeric values

In [None]:
city_dict = {'A':0, 'B':1, 'C':2}
data["City_Category"] = data["City_Category"].apply(lambda line: city_dict[line])

data["City_Category"].value_counts()

# 4. Converting Stay_In_Current_City_Years to dummy variables

In [None]:
#Import library:
from sklearn.preprocessing import LabelEncoder

#Replace the stay-duration labels with integer codes
le = LabelEncoder()
stay_codes = le.fit_transform(data['Stay_In_Current_City_Years'])
data['Stay_In_Current_City_Years'] = stay_codes

#Dummy Variables: one indicator column per encoded stay-duration code
data = pd.get_dummies(data, columns=['Stay_In_Current_City_Years'])

data.dtypes

# 5. Function to create count features

In [None]:
# feature representing the count of each user
def getCountVar(compute_df, count_df, var_name):
    """Count, for each row of ``compute_df``, how often its ``var_name`` value occurs in ``count_df``.

    Parameters
    ----------
    compute_df : pandas.DataFrame
        Frame whose rows receive a count; must contain the column ``var_name``.
    count_df : pandas.DataFrame
        Frame in which the occurrences are counted; must contain ``var_name``.
    var_name : str
        Name of the column to count on.

    Returns
    -------
    list of int
        One count per row of ``compute_df``, in row order. Values that never
        occur in ``count_df`` (including missing values) get a count of 0.
    """
    # value_counts() ignores NaN, so missing values fall through to the
    # 0 default applied below.
    counts = count_df[var_name].value_counts()
    # Vectorised lookup: avoids a Python-level loop over every row, which is
    # very slow on frames with hundreds of thousands of rows.
    return compute_df[var_name].map(counts).fillna(0).astype(int).tolist()

In [None]:
#data["User_ID_Count"]  = getCountVar(data, data, "User_ID")
data["Age_Count"]  =getCountVar(data, data, "Age")
data["Occupation_Count"]  =getCountVar(data, data, "Occupation")
data["Product_Category_1_Count"]  =getCountVar(data, data,"Product_Category_1")
data["Product_Category_2_Count"]  =getCountVar(data, data, "Product_Category_2")
data["Product_Category_3_Count"]  =getCountVar(data, data,"Product_Category_3")
data["Product_ID_Count"]  =getCountVar(data, data, "Product_ID")

# 6.Exporting Data

In [None]:
#Divide into test and train and drop the helper 'source' column.
#.drop() returns a new frame, so the slices of `data` are never modified in
#place (in-place drops on a slice trigger pandas' SettingWithCopyWarning).
train = data.loc[data['source']=="train"].drop(columns=['source'])
test = data.loc[data['source']=="test"].drop(columns=['source'])

#Export files as modified versions:
train.to_csv("train_modified.csv",index=False)
test.to_csv("test_modified.csv",index=False)

In [None]:
# 4. Model

In [None]:
train_df = pd.read_csv('train_modified.csv')
test_df = pd.read_csv('test_modified.csv')

In [None]:
#Define target and ID columns:
target = 'Purchase'
IDcol = ['User_ID','Product_ID']
from sklearn import model_selection, metrics


def modelfit(alg, dtrain, dtest, predictors, target, IDcol, filename, cv=20):
    """Fit ``alg``, print training and cross-validated RMSE, and write a submission CSV.

    Parameters
    ----------
    alg : estimator
        Object exposing ``fit`` and ``predict`` that can be passed to
        ``model_selection.cross_val_score``.
    dtrain : pandas.DataFrame
        Training data containing ``predictors`` and ``target``.
    dtest : pandas.DataFrame
        Data to predict on. Its ``target`` column is created or overwritten
        with the predictions.
    predictors : sequence of column labels
        Feature columns used for fitting and predicting.
    target : str
        Name of the column to predict.
    IDcol : list of str
        Identifier columns copied into the submission file. Not modified.
    filename : str
        Path of the CSV file to write.
    cv : int, default 20
        Passed to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    None
    """
    X_train = dtrain[predictors]
    y_train = dtrain[target]

    #Fit the algorithm on the data
    alg.fit(X_train, y_train)

    #Predict training set:
    dtrain_predictions = alg.predict(X_train)

    #Perform model_selection: the scorer returns negated MSE per fold, so take
    #the absolute value before the square root to obtain per-fold RMSE.
    neg_mse = model_selection.cross_val_score(alg, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    cv_score = np.sqrt(np.abs(neg_mse))

    #Print model report:
    print("\nModel Report")
    print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error(y_train.values, dtrain_predictions)))
    print("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))

    #Predict on testing data:
    dtest[target] = alg.predict(dtest[predictors])

    #Export submission file. The column list is built locally so the caller's
    #IDcol list is not mutated (it would otherwise grow on every call).
    submission_cols = list(IDcol)
    if target not in submission_cols:
        submission_cols.append(target)
    submission = pd.DataFrame({ x: dtest[x] for x in submission_cols})
    submission.to_csv(filename, index=False)

In [None]:

from sklearn.linear_model import LinearRegression

# The `normalize` argument was removed from LinearRegression in scikit-learn 1.2
# and now raises a TypeError. The default estimator is used instead: ordinary
# least squares predictions are not affected by feature scaling.
LR = LinearRegression()

# Every column except the target and the two identifier columns is a feature.
predictors = train_df.columns.drop(['Purchase','Product_ID','User_ID'])
modelfit(LR, train_df, test_df, predictors, target, IDcol, 'LR.csv')

# Fitted coefficient of each predictor, sorted ascending.
coef1 = pd.Series(LR.coef_, predictors).sort_values()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# random_state is fixed so that ties between equally good splits are broken
# the same way on every run, making the reported scores reproducible.
DT = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100, random_state=0)
modelfit(DT, train_df, test_df, predictors, target, IDcol, 'DT.csv')

# Importance of each predictor in the fitted tree, largest first.
coef3 = pd.Series(DT.feature_importances_, predictors).sort_values(ascending=False)

# 5. Conclusion
The ML algorithm that performed best was the Decision Tree model with RMSE = 2680, which placed me in the top 42%.