In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')# Ignoring warnings

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Black Friday training set into the notebook-wide DataFrame `bfs`.
bfs=pd.read_csv("/kaggle/input/black-friday-sales-prediction/train.csv")

In [None]:
# Preview the first rows of the raw data.
bfs.head()

# **Reading information about data** 

In [None]:
# Column dtypes and non-null counts; this is where missing values first show up.
bfs.info()

In [None]:
# Column names as they appear in the CSV (before renaming to lowercase).
bfs.columns

In [None]:
# duplicated() flags rows that repeat an earlier row; value_counts() tallies the True/False flags.
bfs.duplicated().value_counts()

The dataset has 550068 rows in total, some of which contain NaN values, and there are no duplicated rows. So the first task is to clean the given data into a usable form.

In [None]:
bfs.rename(columns=str.lower,inplace=True) # Convert all column names to lowercase for easier use.

In [None]:
# Confirm that the column names are now lowercase.
bfs.columns

# **Processing each column individually**

**user_id**

This column simply holds the customer's unique ID.



In [None]:
# Display the user_id column.
bfs.user_id

**product_id**

This column simply holds the unique ID of a product available in the store.

In [None]:
# Display the product_id column.
bfs.product_id

**gender**

Column contains gender information with ['M','F'] values.

In [None]:
# Distinct values present in the gender column.
bfs.gender.unique()

In [None]:
# Quick two-row reminder of the table layout before moving to the next column.
bfs.head(2)

**age**

Column contains Age category of customers.

In [None]:
# Distinct age brackets present in the age column.
bfs.age.unique()

We have total 7 age category customers data.

'0-17', '18-25', '26-35', '36-45', '46-50', '51-55' and '55+'.

**occupation**

Contains the customer's occupation, stored as a masked (encoded) value.

In [None]:
# Tuple of: the distinct occupation codes, how many distinct codes there are, and the column dtype.
bfs.occupation.unique(),len(bfs.occupation.unique()),bfs.occupation.dtype

We have 21 types of customer occupation.

**city_category**

Contains information of cities category wise

In [None]:
# Distinct city categories present in the data.
bfs.city_category.unique()

**stay_in_current_city_years**

Contains total number of years customer has stayed in current city.

In [None]:
# Distinct raw values of stay_in_current_city_years, inspected before any conversion.
bfs.stay_in_current_city_years.unique()

We have customers who have been staying in their current city for '0', '1', '2', '3' or '4+' years.

We can replace '4+' with '4' so that the column holds plain numbers; a value of 4 should then be read as "the customer has stayed in the current city for 4 or more years".

In [None]:
# Map the open-ended '4+' bucket to '4' so the whole column can be cast to integers.
stay_years = bfs.stay_in_current_city_years.replace({"4+": '4'})
bfs.stay_in_current_city_years = stay_years.astype(int)
bfs.stay_in_current_city_years.unique()

Here, we got 0, 1, 2, 3, 4 years which are integer values.

**marital_status**

Contains information of customer if they are married or not.

In [None]:
# Distinct codes present in the marital_status column.
bfs.marital_status.unique()

We assume that 0 means the customer is single and 1 means the customer is married.

**product_category**

Contains category wise product value.

In [None]:
# Two-row preview to look at the product category columns.
bfs.head(2)

We have three product category columns: product_category_1, product_category_2 and product_category_3. They simply represent different varieties of products.

Ex.:

We can assume that product_category_1 contains electronics items, product_category_2 contains clothes & garments, and product_category_3 contains groceries.

In [None]:
# Non-null counts of the three product category columns, to see how many values are missing.
bfs[['product_category_1','product_category_2','product_category_3']].info()

Notice that the product category columns contain NaN (empty) values. Let's try to fill them using the information in other columns.

In [None]:
# Two-row preview to pick candidate columns for guiding the NaN fill.
bfs.head(2)

Here we have two candidate columns, age and occupation. By looking at these two columns we can analyse which one gives us more useful values for filling the NaN values of the product categories.

Ex.:

By looking at the 'age' column we can analyze which product category is mostly bought by customers of a particular age group, and fill that category value in place of the NaN values.
By looking at the 'occupation' column we can analyze which product category is mostly bought by customers of a particular occupation, and fill that category value in place of the NaN values.
In the next steps let's see which column gives us the most useful values.

In [None]:
# Most frequent (mode) product category per age bracket and per occupation.
# a/b are keyed by age, c/d by occupation; *_2 and *_3 refer to the category column used.
by_age = bfs.groupby('age')
by_occupation = bfs.groupby('occupation')

a = by_age['product_category_2'].agg(pd.Series.mode)
b = by_age['product_category_3'].agg(pd.Series.mode)
c = by_occupation['product_category_2'].agg(pd.Series.mode)
d = by_occupation['product_category_3'].agg(pd.Series.mode)

In [None]:
# Most frequent product_category_2 value for each age bracket.
a

In [None]:
# 2x2 grid of bar charts: top row grouped by age, bottom row grouped by occupation.
plt.figure(figsize=(18, 10))
plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.4, hspace=0.4)

panels = [
    (a, "age_vs_product_category_2"),
    (b, "age_vs_product_category_3"),
    (c, "occupation_vs_product_category_2"),
    (d, "occupation_vs_product_category_3"),
]
for position, (modes, panel_title) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    modes.plot(kind='bar', title=panel_title)
plt.show()

# Here we only need to fill the NaN values of product_category_2 and product_category_3, because product_category_1 has no NaN values.

By comparing the 'age' and 'occupation' columns we can see that grouping by occupation gives us more, and more varied, most-frequent product categories.

So, we can fill the nan product_category values corresponding to customer occupation.

In the next step, to fill these NaN values, let's write a function which we can use for each product_category.

Instead of repeating the code for each product_category, we can apply the concept of code reusability here.

In [None]:
def fill_nan_category(category_colunm, df=None):
    """Fill NaN values of a category column with the per-occupation mode.

    For every ``occupation`` group, the most frequent value of
    ``category_colunm`` inside that group replaces the group's NaN entries.
    The frame is modified in place.

    Parameters
    ----------
    category_colunm : str
        Name of the column whose NaN values should be filled.
    df : pandas.DataFrame, optional
        Frame to operate on. Defaults to the notebook-level ``bfs`` frame,
        which keeps existing one-argument calls working.

    Notes
    -----
    * When a group has several equally frequent values, the smallest one is
      used (``Series.mode`` returns its result sorted). Feeding the whole
      array of tied modes to ``fillna`` would raise.
    * A group whose values are all NaN has no mode; its rows are left as NaN.
    """
    frame = bfs if df is None else df

    def _first_mode(values):
        # Reduce a group to a single scalar so it can be broadcast over the group.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    # transform() returns a Series aligned with `frame`, holding each row's group mode.
    fill_values = frame.groupby('occupation')[category_colunm].transform(_first_mode)
    frame[category_colunm] = frame[category_colunm].fillna(fill_values)

In [None]:
# Run the occupation-based fill on each product category column in turn.
for column_to_fill in ('product_category_1', 'product_category_2', 'product_category_3'):
    fill_nan_category(column_to_fill)

# Cast product_category_1 to float.
bfs.product_category_1 = bfs.product_category_1.astype(float)

In [None]:
# Re-check the non-null counts of the product category columns after filling.
bfs[['product_category_1','product_category_2','product_category_3']].info()

Here we can see that the NaN values have been successfully filled.

**purchase**

Contains the purchase amount the customer paid for the product.

In [None]:
# Distinct purchase amounts and the number of missing purchase values.
# isnull().sum() counts the True flags; .count() would return the total row count instead.
bfs.purchase.unique(),bfs.purchase.isnull().sum()

In [None]:
# Final dtype / non-null overview of the cleaned frame.
bfs.info()

In [None]:
# Preview the cleaned data before saving it.
bfs.head()

In [None]:
# Save the cleaned dataset in the output directory.
# index=False keeps the positional row index out of the file, so reloading it
# does not produce an extra unnamed column.
bfs.to_csv('./bfs_clean_data.csv', index=False)

# **Data Visualization**

In [None]:
# Reminder of the available columns before plotting.
bfs.head()

**Check Distribution and Outliers of 'purchase'**

In [None]:
# Left panel: density of purchase amounts. Right panel: box plot to expose outliers.
plt.figure(figsize=(15, 5))
plt.subplots_adjust(left=0.1, bottom=0.1, right=1, top=0.9, wspace=0.4, hspace=0.4)

density_ax = plt.subplot(1, 2, 1)
sns.kdeplot(x='purchase', data=bfs, ax=density_ax)

box_ax = plt.subplot(1, 2, 2)
sns.boxplot(y='purchase', data=bfs, ax=box_ax)

plt.show()

We observed that most of the purchase values are between 5000 and 10000.

**Get purchase details of Male and Female Customers? {'M','F'}**

Find which kind of customer visits more frequently ? Male or Female?{'M','F'}.

In [None]:
# Share of records per gender, drawn as a pie chart.
gender_counts = bfs.groupby('gender')['gender'].count()
gender_counts.plot(kind='pie', autopct='%1.1f%%', figsize=(5, 5), shadow=True)
plt.show()

We see that about 75% of the records belong to Male customers, so they visit us more frequently.

Purchase Distribution of 'Male' and 'Female' customers

In [None]:
# Compare how purchase amounts are distributed for male and female customers.
plt.figure(figsize=(10,4))
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=1.2,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.4)
# Use every transaction's purchase amount. Collapsing the data with
# value_counts() first would leave one row per distinct amount, so the KDE
# would ignore how often each amount actually occurs.
ma=bfs.loc[bfs['gender']=='M',['purchase']]
fe=bfs.loc[bfs['gender']=='F',['purchase']]
plt.subplot(1,2,1)
sns.kdeplot(x='purchase',data=ma).set_title('Male')
plt.subplot(1,2,2)
sns.kdeplot(x='purchase',data=fe).set_title('Female')
plt.show()

Find which kind of customer purchasing more ? Male or Female?{'M','F'}

In [None]:
# Total purchase amount per gender. The string 'sum' selects pandas' own
# aggregation; passing the Python builtin `sum` is deprecated in recent pandas.
p=bfs.groupby('gender').agg({'purchase':'sum'}).reset_index()
sns.barplot(x='gender',y='purchase',data=p)
p

We observed that 'Male' customers are purchasing more.

Find which Product customer purchase most by looking at product_id.

In [None]:
# Count purchases per product_id and keep the 50 most frequent products.
product_counts = bfs.value_counts('product_id')
p_id = product_counts.sort_values(ascending=False).head(50)

# Horizontal bars: product_id on the y-axis, purchase count on the x-axis.
plt.figure(figsize=(5, 10))
sns.barplot(x=p_id, y=p_id.index)
plt.show()

Here are the 'top 50' product_ids that customers buy most.

Find 'sub_product' which is selling most from each product_category and which customer category buying that product most?{'M','F'}.

In [None]:
# Per-gender counts of every value in each product category column.
category_cols = ['product_category_1', 'product_category_2', 'product_category_3']
category_data = [
    bfs.groupby('gender')[column].value_counts().rename('p_count').reset_index()
    for column in category_cols
]

# One bar chart per category column, side by side, split by gender.
plt.figure(figsize=(15, 5))
plt.subplots_adjust(left=0.1, bottom=0.1, right=1.2, top=0.9, wspace=0.4, hspace=0.4)

for position, (column, counts) in enumerate(zip(category_cols, category_data), start=1):
    plt.subplot(1, 3, position)
    sns.barplot(x=column, y='p_count', hue='gender', data=counts)

plt.show()

Here we can see, we have three product_categories and on x-axis we have some values that are nothing but sub_products of that product_category in masked format.

The graph shows the sub_product count and the two colours bar shows the customer category.

We observed that some sub_products have a high selling count, and that most of their buyers are 'Male' customers.

**Get purchase details of 'Married' and 'UnMarried' customers.{'UnMarried': 0, 'Married':1}**

Purchase Distribution of Married and UnMarried customers

In [None]:
# Compare how purchase amounts are distributed for unmarried (0) and married (1) customers.
# Use every transaction's purchase amount. Collapsing the data with
# value_counts() first would leave one row per distinct amount, so the KDE
# would ignore how often each amount actually occurs.
UM_P=bfs.loc[bfs['marital_status']==0,['purchase']]
M_P=bfs.loc[bfs['marital_status']==1,['purchase']]
plt.figure(figsize=(10,4))
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=1.2,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.4)
plt.subplot(1,2,1)
sns.kdeplot(x='purchase',data=UM_P).set_title('UnMarried')
plt.subplot(1,2,2)
sns.kdeplot(x='purchase',data=M_P).set_title('Married')
plt.show()

The purchase distributions of 'Married' and 'UnMarried' customers are quite similar.

Find which customer visits more frequently ? UnMarried or Married? {'UnMarried': 0, 'Married': 1}

In [None]:
# Share of records per marital status (0 = unmarried, 1 = married), drawn as a pie chart.
marital_counts = bfs.groupby('marital_status')['marital_status'].count()
marital_counts.plot(kind='pie', autopct='%1.1f%%', figsize=(5, 5), shadow=True)
plt.show()

We observed that 'UnMarried' customers visit us more frequently, although 'Married' customers also make up a large share.

Find which customers purchase more (Married or UnMarried) and the top 10 products purchased by them

In [None]:
# One-row reminder of the column names used in the next cell.
bfs.head(1)

In [None]:
# Total purchase amount per (gender, marital_status) pair. The string 'sum'
# selects pandas' own aggregation; passing the Python builtin is deprecated.
M_UM_P=bfs.groupby(['gender','marital_status']).agg({'purchase':'sum'}).reset_index()
# Ten most frequently bought product_ids for each marital status.
UM=bfs[bfs['marital_status']==0].value_counts('product_id').nlargest(10)
M=bfs[bfs['marital_status']==1].value_counts('product_id').nlargest(10)
# Create the figure explicitly at twice the default size and keep every axes
# inside the canvas, so the plot does not depend on the inline backend
# cropping to off-canvas axes.
plt.figure(figsize=(12.8,9.6))
plt.subplots_adjust(wspace=0.4,
                    hspace=0.4)
plt.subplot(2,2,1)
sns.barplot(x='marital_status',y='purchase',hue='gender',data=M_UM_P)
# Series.plot draws the index on the x-axis and the values as bar heights,
# so no x=/y= arguments are needed.
plt.subplot(2,2,3)
UM.plot(kind='bar',title='UnMarried_Customer')
plt.subplot(2,2,4)
M.plot(kind='bar',title='Married_Customer')
plt.show()
M_UM_P

In the above plot we can see that 'UnMarried' customers purchase more than 'Married' customers.

The bar plots show the top 10 product IDs purchased most frequently by each group.