In [None]:
import numpy as np
import seaborn
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from google.colab import drive
#mount Google Drive in the Colab runtime so files stored there can be read by path
drive.mount('/content/drive')

#read the Black Friday CSV from the mounted Drive into a DataFrame
path = '/content/drive/My Drive/Black_Friday.csv'
black_df = pd.read_csv(path)

#print the column names, non-null counts and dtypes of the loaded frame
black_df.info()

In [None]:
#defining clean data function
def clean_data(df):
  """Clean a Black Friday purchases DataFrame in place and return it.

  Steps performed, in order:
    1. Stay_In_Current_City_Years: strip the literal "+" and cast to int.
    2. Age: replace each age-group label with the lower bound of the group.
    3. Fill every remaining missing value in the frame with 0.
    4. Cast User_ID, Occupation and Marital_Status to str, and
       Product_Category_2 / Product_Category_3 to int.

  Calling the function again on a frame it has already cleaned is a no-op
  for steps 1 and 2 (the columns are already numeric), so re-running a
  notebook cell does not raise.

  Args:
    df: DataFrame containing the columns Stay_In_Current_City_Years, Age,
      User_ID, Occupation, Marital_Status, Product_Category_2 and
      Product_Category_3.

  Returns:
    The same DataFrame object that was passed in; it is modified in place.

  Raises:
    KeyError: if Age holds a label that is not one of the known age groups
      (a missing Age value counts as unknown).
  """
  #removing "+" sign and using integers to represent stay in current city years
  stay = df['Stay_In_Current_City_Years']
  if not pd.api.types.is_numeric_dtype(stay):
    #regex=False: "+" is a regex quantifier, so it has to be matched as a literal character
    stay = stay.str.replace('+', '', regex=False)
  df['Stay_In_Current_City_Years'] = stay.astype(int)

  #removing category age groups to convert it to integers
  #decided to use lowerbound for each age group
  age = {'0-17':0, '18-25': 18, '26-35':26, '36-45':36, '46-50':46, '51-55':51, '55+': 55}
  if not pd.api.types.is_numeric_dtype(df['Age']):
    mapped = df['Age'].map(age)
    unmapped = mapped.isna()
    if unmapped.any():
      #fail on the first label that has no lower bound instead of silently turning it into 0 below
      raise KeyError(df.loc[unmapped, 'Age'].iloc[0])
    df['Age'] = mapped.astype(int)

  #assuming all missing values are 0 and making all necessary changes for product category 2 and 3
  df.fillna(0, inplace= True)

  #converting all columns to their appropriate data type
  for column in ('User_ID', 'Occupation', 'Marital_Status'):
    df[column] = df[column].astype(str)
  for column in ('Product_Category_2', 'Product_Category_3'):
    df[column] = df[column].astype(int)

  return df

In [None]:
#run the cleaning step on the Black Friday frame and display the cleaned result
black_df = clean_data(black_df)
black_df

In [None]:
#generating bar charts to compare total purchase amount by gender and number of distinct customers by gender
gender_purchase = black_df.groupby('Gender')[['Purchase']].sum()
#count() tallies rows, so a User_ID that appears on several rows would be counted several times;
#nunique() counts each customer once, which is what "number of customers" means
gender_count = black_df.groupby('Gender')[['User_ID']].nunique()
gender_purchase.plot.bar(title='Total purchase amount by gender')
gender_count.plot.bar(title='Number of unique customers by gender')

With respect to gender, the dataset is not balanced. Neither the total purchase amount nor the customer count is equal across genders: the male figures are almost triple the female ones. There are more males, and males have purchased more. Both charts look similar, so the difference in purchase amount is perhaps explained by there being more male customers.

In [None]:
#bar chart of the summed purchase amount for each city category
ax = seaborn.barplot(data=black_df, x='City_Category', y='Purchase', estimator=sum)

#show plain numbers on the y axis instead of scientific notation / an offset
ax.ticklabel_format(style='plain', axis='y', useOffset=False)

In [None]:
#bar chart of the summed purchase amount per age group, split by gender
ax = seaborn.barplot(
    data=black_df,
    x='Age',
    y='Purchase',
    hue='Gender',
    estimator=sum,
    palette='coolwarm',
)
#show plain numbers on the y axis instead of scientific notation / an offset
ax.ticklabel_format(style='plain', axis='y', useOffset=False)

Based on the bar chart above, it seems that males spend about three times as much as females, consistent with the previous charts. If we want to increase spending on Black Friday, we should focus our marketing efforts on both genders in the 26-35 age group, as this group spends the most money.

We should also push more products that are usually purchased by individuals in their late 20s and early 30s, for both males and females.

In [None]:
#box-and-whisker plot of the purchase distribution per age group, split by gender
seaborn.boxplot(
    data=black_df,
    x='Age',
    y='Purchase',
    hue='Gender',
    palette='coolwarm',
)

In [None]:
#box-and-whisker plot of the purchase distribution for each occupation code
seaborn.boxplot(
    data=black_df,
    x='Occupation',
    y='Purchase',
    palette='coolwarm',
)

In [None]:
#heatmap of the pairwise correlation matrix for age, stay in current city,
#product category 1, product category 2, product category 3 and purchase
cor_df = black_df[[
    'Age',
    'Stay_In_Current_City_Years',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
    'Purchase',
]]

plt.figure(figsize=(6,6))
#annot=True writes each correlation coefficient inside its cell
seaborn.heatmap(data=cor_df.corr(), cmap='coolwarm', annot=True)

Based on the correlation heat map, we can conclude that no factor had a strong correlation with purchase, whether positive or negative. Of all the variables, Product Category 3 had the strongest positive correlation with purchase (a correlation of 0.28).

In [None]:
#scatter-plot matrix for every pair of the selected features (age, stay in current city,
#product category 1, product category 2, product category 3 and purchase)
#the selected columns are first saved to Drive as an Excel workbook
#NOTE(review): the workbook is named "correlation" but receives the raw columns, not cor_df.corr() - confirm intent
cor_df.to_excel(excel_writer='/content/drive/My Drive/correlation.xlsx')
seaborn.pairplot(data=cor_df, height=2)

In [None]:
#import necessary packages and assign black friday to new df variable
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

#read the Black Friday CSV again into a separate DataFrame (my_df) for the modelling steps
path = '/content/drive/My Drive/Black_Friday.csv'
my_df = pd.read_csv(path)

**Feature engineering**

In [None]:
#clean the modelling frame with clean_data, then drop product category 3 (too many missing values in that column)
my_df = clean_data(my_df).drop(columns=['Product_Category_3'])

In [None]:
#remove user ID and product ID from database
ml_df = my_df.drop(['User_ID', 'Product_ID'], axis= 1)

#create dummy variables for gender, age, city category and stay in current city years
#prefix= gives every dummy column a unique string name (e.g. Age_26, Stay_In_Current_City_Years_0).
#Without it the integer-valued Age and Stay_In_Current_City_Years columns produce integer column names,
#which can collide (both can contain 0) and mix int/str column names - recent scikit-learn versions
#refuse to fit on a DataFrame whose column names are not all strings.
Gender = pd.get_dummies(ml_df['Gender'], prefix='Gender')
Age = pd.get_dummies(ml_df['Age'], prefix='Age')
City_Category = pd.get_dummies(ml_df['City_Category'], prefix='City_Category')
Stay_In_Current_City_Years = pd.get_dummies(ml_df['Stay_In_Current_City_Years'], prefix='Stay_In_Current_City_Years')

#add dummy variables in the database
ml_df_final = pd.concat([ml_df, Gender, Age, City_Category, Stay_In_Current_City_Years], axis=1)

**Creating the Linear Regression Model**

In [None]:
#assign X and y variables
#drop gender, age, city category, stay in current city years and purchase for X
X = ml_df_final.drop(['Gender', 'Age','City_Category', 'Stay_In_Current_City_Years', 'Purchase'], axis=1)
y = ml_df_final['Purchase']

#split test and train with test size of 20%
#random_state is fixed so the split - and every metric computed from it - is identical on each run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#create linear regression model and fit it on the training split
model= LinearRegression()
model.fit(X_train, y_train)

In [None]:
#predict purchase amounts for the held-out X_test rows
model_predictions = model.predict(X_test)
print("Predicted purchases (in dollars) for new costumers:", model_predictions)

#print Mean Absolute Error and Mean Squared Error for the held-out split
for metric_label, metric_fn in (('MAE:', metrics.mean_absolute_error),
                                ('MSE:', metrics.mean_squared_error)):
    print(metric_label, metric_fn(y_test, model_predictions))

**Final Model Evaluation on Black_Friday_Final_project dataset**

In [None]:
# loading the Black Friday final test dataset from the mounted Drive into a DataFrame
path = '/content/drive/My Drive/Black_Friday_Final_Test.csv'
black_df_final = pd.read_csv(path)

In [None]:
#feature engineering on the final test set: apply clean_data, then drop product category 3
black_df_final = clean_data(black_df_final).drop(columns=['Product_Category_3'])

In [None]:
#drop user ID and product ID from dataframe
predicted_df = black_df_final.drop(['User_ID', 'Product_ID'], axis= 1)

#create dummy variables for gender, age, city category and stay in current city years
#prefix= gives every dummy column a unique string name (e.g. Age_26, Stay_In_Current_City_Years_0).
#Without it the integer-valued Age and Stay_In_Current_City_Years columns produce integer column names,
#which can collide (both can contain 0) and mix int/str column names - recent scikit-learn versions
#refuse DataFrames whose column names are not all strings.
Gender = pd.get_dummies(predicted_df['Gender'], prefix='Gender')
Age = pd.get_dummies(predicted_df['Age'], prefix='Age')
City_Category = pd.get_dummies(predicted_df['City_Category'], prefix='City_Category')
Stay_In_Current_City_Years = pd.get_dummies(predicted_df['Stay_In_Current_City_Years'], prefix='Stay_In_Current_City_Years')

#merge dummy variable into dataframe
predicted_df_final = pd.concat([predicted_df, Gender, Age, City_Category, Stay_In_Current_City_Years], axis=1)

In [None]:
#predict purchase values for the Black Friday final test set with the previously trained model
#X keeps every column except the raw gender, age, city category and stay in current city years
#columns and the purchase target
X_test_final = predicted_df_final.drop(
    columns=['Gender', 'Age','City_Category', 'Stay_In_Current_City_Years', 'Purchase']
)
y_test_final = predicted_df_final['Purchase']
model_predictions_final = model.predict(X_test_final)
print("Predicted purchases (in dollars) for new costumers:", model_predictions_final)

#print Mean Absolute Error and Mean Squared Error for the final test set
for metric_label, metric_fn in (('MAE:', metrics.mean_absolute_error),
                                ('MSE:', metrics.mean_squared_error)):
    print(metric_label, metric_fn(y_test_final, model_predictions_final))