# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
# Seaborn defaults for the notebook: 8x8 figures and fonts scaled by 1.3.
sns.set(font_scale=1.3, rc={'figure.figsize': [8, 8]})

# Read File and view data details

In [None]:
# Load the Black Friday CSV from the relative input directory.
csv_path = '../input/blackfriday/BlackFriday.csv'
black_friday = pd.read_csv(csv_path)

In [None]:
# Preview the first 10 rows as loaded from the CSV.
black_friday.head(10)

In [None]:
# Keep only the text that follows the last 'P' in each product id.
product_ids = black_friday['Product_ID']
black_friday['Product_ID'] = product_ids.map(lambda pid: pid.rsplit('P', 1)[-1])

In [None]:
# Preview the first 10 rows again to check the rewritten Product_ID column.
black_friday.head(10)

In [None]:
# Show the last rows of the frame (pandas default count).
black_friday.tail()

# 1.Data Cleansing
**Check for missing-value indicators, NaN values, and any anomalous data.**

In [None]:
# Column dtypes and non-null counts, used to spot missing values.
black_friday.info()

**Dtypes look correct; NaN values appear only in Product_Category_2 and Product_Category_3.**

In [None]:
# Summary statistics for the frame.
black_friday.describe()

In [None]:
# Pairwise correlations of the numeric columns only. The frame still contains
# string columns at this point (e.g. Product_ID, Stay_In_Current_City_Years),
# and pandas >= 2.0 raises on them instead of silently skipping them.
black_friday.select_dtypes(include='number').corr()

In [None]:
# Number of rows per User_ID.
black_friday['User_ID'].value_counts()

In [None]:
# Print the frequency table of every column, each followed by a row of asterisks.
separator = '*' * 50
for _, column_values in black_friday.items():
    print(column_values.value_counts())
    print(separator)

No anomalous values or missing-value indicators were found.

# 2.Fill Missing Data

In [None]:
# Count missing values per column before imputation.
black_friday.isna().sum()

Fill Product_Category_2 and Product_Category_3 using `SimpleImputer`.

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Imputer that fills missing entries with each column's most frequent value.
imputer = SimpleImputer(strategy='most_frequent')

In [None]:
# Fit the imputer on both product-category columns and write the filled values back.
# NOTE(review): the imputer is fitted on the full frame, i.e. before the
# train/test split performed later in the notebook.
category_cols = ['Product_Category_2', 'Product_Category_3']
black_friday[category_cols] = imputer.fit_transform(black_friday[category_cols])

In [None]:
# Fill value the imputer learned for each of the two columns.
imputer.statistics_

In [None]:
# Re-count missing values per column to confirm the imputation.
black_friday.isna().sum()

# 3.Data Analysis

In [None]:
# Preview the frame before plotting.
black_friday.head()

In [None]:
# Row count per Gender. The column is named via keywords because seaborn >= 0.12
# binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='Gender')

In [None]:
# Row count per Marital_Status. The column is named via keywords because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='Marital_Status')

In [None]:
# Row count per Age group. The column is named via keywords because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='Age')

In [None]:
# Row count per Age group, split by Gender. The columns are named via keywords
# because seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='Age', hue='Gender')

In [None]:
# Row count per City_Category. The column is named via keywords because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='City_Category')

In [None]:
# Row count per Occupation code. The column is named via keywords because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='Occupation')

In [None]:
# Row count per Stay_In_Current_City_Years value. The column is named via keywords
# because seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='Stay_In_Current_City_Years')

In [None]:
# Row count per Stay_In_Current_City_Years value, split by Gender.
sns.countplot(data=black_friday,x='Stay_In_Current_City_Years',hue='Gender',palette='viridis')

In [None]:
# Row count per Product_Category_1 value. The column is named via keywords because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='Product_Category_1')

In [None]:
# Row count per Product_Category_2 value. The column is named via keywords because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='Product_Category_2')

In [None]:
# Row count per Product_Category_3 value. The column is named via keywords because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(data=black_friday, x='Product_Category_3')

In [None]:
# Distribution of Purchase for each Gender.
sns.boxplot(data=black_friday, x='Gender', y='Purchase')

In [None]:
# Purchase distribution per Age group; each violin is split into its two Gender halves.
sns.violinplot(
    data=black_friday,
    x='Age',
    y='Purchase',
    hue='Gender',
    split=True,
)

In [None]:
# Aggregated Purchase (seaborn's default estimator) for each Gender.
sns.barplot(data=black_friday, x='Gender', y='Purchase')

In [None]:
# Aggregated Purchase (seaborn's default estimator) per Age group, split by Gender.
sns.barplot(data=black_friday, x='Age', y='Purchase', hue='Gender')

In [None]:
# Aggregated Purchase for each Product_Category_1 value.
sns.barplot(data=black_friday,x='Product_Category_1',y='Purchase')

In [None]:
# Aggregated Purchase for each Product_Category_2 value.
sns.barplot(data=black_friday,x='Product_Category_2',y='Purchase')

In [None]:
# Aggregated Purchase for each Product_Category_3 value.
sns.barplot(data=black_friday,x='Product_Category_3',y='Purchase')

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(20,10))
# Restrict to numeric columns first: the frame still contains string columns
# here, and pandas >= 2.0 raises on them in corr() instead of skipping them.
numeric_corr = black_friday.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

Positive correlation between (Product_Category_1, Product_Category_2)  
Slightly positive correlation between (Product_Category_1, Product_Category_3)  
Slightly negative correlation between (Product_Category_1, Purchase)  
Positive correlation between (Product_Category_2, Product_Category_3)  
Slightly negative correlation between (Product_Category_2, Purchase)

# 4.Categorical Data

In [None]:
# Preview the frame before encoding the categorical columns.
black_friday.head()

**Categorical Data**   
1. Ordinal (Need Mapping)
    1. Stay_In_Current_City_Years  
    
2. Ordinal (Already Mapped)
    1. Occupation
    2. Marital_Status
    3. Product_Category_1
    4. Product_Category_2
    5. Product_Category_3
    
3. Nominal
    1. Age
    2. Gender
    3. City_Category

In [None]:
# Distinct Stay_In_Current_City_Years labels and how often each occurs.
black_friday['Stay_In_Current_City_Years'].value_counts()

In [None]:
# String label -> integer code. Only '4+' really changes (it becomes 4); the
# other labels are just converted to ints so the column can be used for modelling.
Stay_In_Current_City_Years_dict = {'0':0, '1':1, '2':2, '3':3, '4+':4}

# Translate known labels and pass every other value through unchanged, so
# re-running this cell on the already-converted integer column is a no-op
# (a plain .map(dict) would turn every value into NaN on a second run).
# astype('int64') then fails loudly on anything that is not an integer code.
stay_years = black_friday['Stay_In_Current_City_Years']
black_friday['Stay_In_Current_City_Years'] = stay_years.map(
    lambda label: Stay_In_Current_City_Years_dict.get(label, label)
).astype('int64')

In [None]:
# One-hot encode the three nominal columns; drop_first=True omits the first
# level of each so the remaining indicators are not redundant.
nominal_columns = ['Age', 'Gender', 'City_Category']
black_friday = pd.get_dummies(black_friday, columns=nominal_columns, drop_first=True)

In [None]:
# Display the encoded frame (pandas truncates the rendering of large frames).
black_friday

# 5.Split Data

**drop unwanted columns**

In [None]:
# Drop User_ID and Product_ID. Rebinding the name (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run once the
# columns are already gone; the in-place form raised KeyError on a second run.
black_friday = black_friday.drop(columns=['User_ID', 'Product_ID'], errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# The target is the 'Purchase' column; every remaining column is a feature.
y = black_friday['Purchase']
x = black_friday.drop(columns='Purchase')

In [None]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [None]:
# Print the shapes of the four splits on a single line.
split_parts = (X_train, X_test, y_train, y_test)
print(*(part.shape for part in split_parts))

# 6.Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used to standardize the feature columns.
ss = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then reuse those fitted
# statistics to transform the test features.
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)