In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Collect the full path of every file under the Kaggle input directory
# (in os.walk order), then print one path per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_paths:
    print(input_path)


In [11]:
# Locations of the train / test CSV files inside the Kaggle input directory.
train_path = '/kaggle/input/black-friday/train.csv'
test_path = '/kaggle/input/black-friday/test.csv'

# Read both files with pandas' default CSV settings.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Show the training frame as the cell output.
df_train

## Joining test data with train data

In [42]:
# Stack the train and test rows into a single frame so the preprocessing
# below is applied to both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement.
# ignore_index=True renumbers the rows 0..n-1: both frames were read with a
# default index, so plain stacking would leave duplicate row labels.
# NOTE(review): columns present in only one frame are filled with NaN for the
# other frame's rows; the later train/test separation appears to rely on this
# for 'Purchase' -- confirm test.csv has no Purchase column.
df = pd.concat([df_train, df_test], ignore_index=True)
df

In [43]:
# Print a summary of the combined frame: column names, dtypes and non-null counts.
df.info()

In [44]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [45]:
# Remove the User_ID column from df in place, then preview the first rows.
df.drop(columns=['User_ID'], inplace=True)
df.head()

## Feature Engineering / Data Preprocessing

#### Converting categorical data column of Gender into binary form in the dataframe itself

In [46]:
# Encode Gender numerically: 'F' -> 0, 'M' -> 1.
# Series.map turns any value that is not a key of the mapping into NaN.
gender_codes = {'F': 0, 'M': 1}
df['Gender'] = df['Gender'].map(gender_codes)
df.head()

#### Checking categories of Age

In [47]:
# List the distinct Age values to see which brackets need an ordinal code.
df['Age'].unique()

#### Converting Age column into ordinal values in the dataframe itself

In [48]:
# Ordinal-encode the age brackets (1 for the youngest bracket up to 7 for the
# oldest) instead of one-hot encoding them, so their natural order is kept.
age_brackets = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
age_rank = {bracket: rank for rank, bracket in enumerate(age_brackets, start=1)}
df['Age'] = df['Age'].map(age_rank)
df.head()

#### Changing the City_Category column

In [49]:
# One-hot encode City_Category. drop_first=True omits the first category's
# indicator column, since it is implied when all remaining indicators are zero.
df_city = pd.get_dummies(data=df['City_Category'], drop_first=True)
df_city.head()

In [50]:
# Attach the city indicator columns to df side by side (rows align on the index).
df = pd.concat([df, df_city], axis='columns')
df.head()

In [51]:
# Drop the original City_Category text column from df in place, then preview.
df.drop(columns='City_Category', inplace=True)
df.head()

### Checking for missing values

In [52]:
# Number of missing values in each column.
df.isna().sum()

In [54]:
# Replace missing Product_Category_2 values with the column's most frequent value.
pc2_mode = df['Product_Category_2'].mode()[0]
df['Product_Category_2'] = df['Product_Category_2'].fillna(pc2_mode)

# Count the missing values left in the column after filling.
df['Product_Category_2'].isnull().sum()

In [60]:
# Replace missing Product_Category_3 values with the column's most frequent value.
pc3_mode = df['Product_Category_3'].mode()[0]
df['Product_Category_3'] = df['Product_Category_3'].fillna(pc3_mode)
df.head()

In [61]:
# (number of rows, number of columns) of the combined frame at this point.
df.shape

In [63]:
# Strip the literal '+' character from Stay_In_Current_City_Years so the
# column can be cast to int in a later cell.
# regex=False is passed explicitly: '+' is a regex quantifier, and pandas
# changed the default of `regex` from True to False in 2.0, so relying on the
# default makes the behaviour (and the warnings emitted) version-dependent.
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].str.replace('+', '', regex=False)
df

In [64]:
# Re-check the column dtypes before the integer conversion in the next cell.
df.info()

### Convert objects into integers

In [65]:
# Cast the Stay_In_Current_City_Years column to integers.
df = df.astype({'Stay_In_Current_City_Years': int})

In [66]:
# Check the dtypes again after casting Stay_In_Current_City_Years to int.
df.info()

In [67]:
# Cast the city indicator columns 'B' and 'C' to integers, then re-check dtypes.
for dummy_col in ('B', 'C'):
    df[dummy_col] = df[dummy_col].astype(int)
df.info()

## Data Visualization

In [68]:
# Bar plot of Purchase (seaborn's default estimator is the mean) per Age code,
# split by Gender.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and,
# from 0.12, accepts only `data` positionally, so the positional form raises a
# TypeError on current versions.
sns.barplot(x='Age', y='Purchase', hue='Gender', data=df)

##### Obs: The average purchase amount is higher for men than for women

In [69]:
# Bar plot of Purchase (seaborn's default estimator is the mean) per
# Occupation, split by Gender.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and,
# from 0.12, accepts only `data` positionally, so the positional form raises a
# TypeError on current versions.
sns.barplot(x='Occupation', y='Purchase', hue='Gender', data=df)

##### Obs: More or less uniform

In [70]:
# Bar plot of Purchase (seaborn's default estimator is the mean) per
# Product_Category_1 value, split by Gender.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and,
# from 0.12, accepts only `data` positionally, so the positional form raises a
# TypeError on current versions.
sns.barplot(x='Product_Category_1', y='Purchase', hue='Gender', data=df)

##### Obs: In Product_Category_1, category '10' has the highest average purchase amount for both men and women

In [71]:
# Bar plot of Purchase (seaborn's default estimator is the mean) per
# Product_Category_2 value, split by Gender.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and,
# from 0.12, accepts only `data` positionally, so the positional form raises a
# TypeError on current versions.
sns.barplot(x='Product_Category_2', y='Purchase', hue='Gender', data=df)

##### Obs: In Product_Category_2, category '10' has the highest average purchase amount for both men and women

In [72]:
# Bar plot of Purchase (seaborn's default estimator is the mean) per
# Product_Category_3 value, split by Gender.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and,
# from 0.12, accepts only `data` positionally, so the positional form raises a
# TypeError on current versions.
sns.barplot(x='Product_Category_3', y='Purchase', hue='Gender', data=df)

##### Obs: In Product_Category_3, category '3' has the highest average purchase amount for both men and women

## Feature Scaling

### Now separating the test & train data from `df`

In [84]:
# Rows without a Purchase value form the test set.
missing_purchase = df['Purchase'].isna()
df_test = df[missing_purchase]

In [80]:
# Rows that do have a Purchase value form the training set.
has_purchase = df['Purchase'].notna()
df_train = df[has_purchase]

In [86]:
# Target: the Purchase column. Features: every other column of df_train.
y = df_train['Purchase']
X = df_train.drop(columns='Purchase')

In [88]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the labelled rows for evaluation; random_state=42 makes the
# shuffle reproducible.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [89]:
# Remove the Product_ID column from both feature splits in place.
for features in (X_train, X_test):
    features.drop(columns='Product_ID', inplace=True)

In [90]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply the same fitted
# transformation to both splits so the held-out split does not influence it.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)