# Data Acquisition 

1. Data collected by conducting a survey between 10th June 2021 - 18th June 2021 
2. Survey Link:- https://forms.gle/4LBRAxRbxx7DQFnTA
3. Total 254 respondents took part in the survey 

# Import libraries

In [None]:
# standard imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# sklearn imports 
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,f1_score,plot_roc_curve,precision_score,recall_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# some more imports
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier
from collections import Counter

In [None]:
# Location of the exported survey responses and the column names assigned to them.
PATH_CSV='Customer_Behaviour_Survey_responses.csv'
COL_NAMES=['Timestamp','Age','Gender','State','City','Tier','Martial_Status','Financial_Status','Cost_over_brand','Product_Category',
            'Time_to_buy','Shopping_monthly_Freq','Avg_Money_spend','Electronics_subproducts','Fashion_subproducts','Clothing_subproducts','OTT_platforms','Books_subcategory']
# header=0 makes pandas discard the file's own header row so COL_NAMES replaces it.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df=pd.read_csv(PATH_CSV,sep=',',
            header=0,
            names=COL_NAMES,
            na_values=np.nan,
            )

In [None]:
df.head()

# Data Cleaning

In [None]:
df.info()

#### Age

In [None]:
df.Age.unique()

In [None]:
# Median of the ages that are plain numbers. Free-text entries (such as
# "55 years" or a name) are coerced to NaN and left out of the median, rather
# than being removed by hard-coded value, which fails as soon as one of those
# exact strings is missing from the data.
numeric_age=pd.to_numeric(df['Age'],errors='coerce').dropna()
age=[int(a) for a in numeric_age]
median_age=int(np.median(age))
print(median_age)

In [None]:
# Someone wrote their name in the Age column; replace it with the median age
# (median_age) instead of a hard-coded literal. The replacement is kept as a
# string because the column still holds raw text at this point.
df['Age']=df['Age'].replace('Anurag Dubey',str(median_age))

In [None]:
# One respondent entered "55 years"; keep only the text before the first
# space of every entry and convert it to an integer.
nums_age=[int(raw_age.partition(' ')[0]) for raw_age in df['Age']]

In [None]:
df['Age']=nums_age

In [None]:
df.Age.unique()

#### Gender

In [None]:
df.Gender.value_counts()

#### State & City

In [None]:
df.State.unique()

In [None]:
df.City.unique()

In [None]:
df.isnull().sum()

1. The City column has 31 null values and a large number of unique values, so it is unlikely to contribute much to our ML model; we may drop it.

2. Null columns in subproducts cols are there because of the way survey was conducted. We need to merge all the subproducts columns into one. 

#### Cleaning other columns

In [None]:
# checking the type whether it is int or not.
df.Shopping_monthly_Freq.dtype

In [None]:
df.Avg_Money_spend.value_counts()

In [None]:
df.Time_to_buy.value_counts()
# looks like there are 2 respones which are different from the rest. We will replace them with the most popular answer category wise. 

In [None]:
def replace_unusual_values_in_time_to_buy(df,unusual_val):
  """Replace a free-text ``Time_to_buy`` answer with the usual answer of its category.

  Every row whose ``Time_to_buy`` equals ``unusual_val`` receives the most
  frequent ``Time_to_buy`` answer among the other rows that share the same
  ``Product_Category``.

  Args:
    df: DataFrame with ``Product_Category`` and ``Time_to_buy`` columns.
      It is modified in place.
    unusual_val: The exact answer text to replace.

  Returns:
    The same DataFrame object, so ``df = f(df, value)`` keeps working.
  """
  is_unusual=df['Time_to_buy']==unusual_val
  # Nothing to replace: return untouched instead of failing on an empty lookup.
  if not is_unusual.any():
    return df

  # Modes are computed without the unusual rows so the odd answer can never be
  # selected as its own replacement.
  regular_answers=df.loc[~is_unusual]
  mode_by_category={}
  for category,answers in regular_answers.groupby('Product_Category')['Time_to_buy']:
    modes=answers.mode()
    if not modes.empty:
      # mode() is sorted, so ties are resolved deterministically.
      mode_by_category[category]=modes.iloc[0]

  # Used when a category has no regular answer at all.
  overall_modes=regular_answers['Time_to_buy'].mode()
  fallback=overall_modes.iloc[0] if not overall_modes.empty else unusual_val

  # Each unusual row is resolved against its own category, not the first row's.
  replacements=[mode_by_category.get(category,fallback)
                for category in df.loc[is_unusual,'Product_Category']]
  df.loc[is_unusual,'Time_to_buy']=replacements
  return df

In [None]:
# Free-text answers found in Time_to_buy; each is swapped for a regular answer.
for odd_answer in ('Sometimes I keep.in a wishlist nf',
                   'Depends on type of product',
                   'One week '):
  df=replace_unusual_values_in_time_to_buy(df,odd_answer)

In [None]:
df.Time_to_buy.value_counts()
# Done!

#### Cleaning Product Category 

In [None]:
df.Product_Category.value_counts()

1. Since the product category question also offered an "other" option where respondents could type in what they buy most frequently, we need to clean up this column.

2. One way of doing is to map Food items , daily use items , Home decor , household goods , kitchen utensils etc -> "Household Goods"

3. Netflux just appears to be a spelling mistake , we just map it to OTT subscriptions. 

In [None]:
# Step 1: fix the "Netflux" spelling mistake by mapping it to the OTT option.
cats=[]
for cat in df.Product_Category.values:
  if cat=="Netflux":
    cats.append('OTT Subscriptions ( Netflix , Amazon , prime video etc )')
  else:
    cats.append(cat)

# Step 2: fold the many small free-text categories into one "Household Goods"
# category. Entries are matched exactly, including any trailing space.
household_goods = ['Food','Food items ','Daily use items','Home decor ','Kitchen utensils ','Household goods ','Kitchen utensils',
                   'Household','Food Products ','Food products','Liquor','Grocery','Daily needs','Household products','Home products','House products','Daily use items ' ]
new_cats=['Household Goods' if cat in household_goods else cat for cat in cats]

In [None]:
df.Product_Category=new_cats

In [None]:
df.Product_Category.value_counts()

#### Removing null values from dataset 

In [None]:
SUB_PRODCUTS_COLS=['Electronics_subproducts', 'Fashion_subproducts',
       'Clothing_subproducts', 'OTT_platforms', 'Books_subcategory']
df.fillna('',inplace=True)

In [None]:
df.head()

In [None]:
elec_subproducts=list((df.Electronics_subproducts))
fash_subproducts=list(df.Fashion_subproducts)
books_subproducts=list(df.Books_subcategory)
cloth_subproducts=list(df.Clothing_subproducts)
OTT_subproducts=list(df.OTT_platforms)

In [None]:
# Collapse the five sub-product columns into one value per respondent: the
# first non-empty answer in the order electronics, fashion, books, clothing,
# OTT. Rows with no answer at all are labelled 'household goods', since most
# of the "other" category is household goods.
sub_products=[
    next((answer for answer in answers if answer !=''),'household goods')
    for answers in zip(elec_subproducts,fash_subproducts,books_subproducts,cloth_subproducts,OTT_subproducts)
]

In [None]:
df.drop(SUB_PRODCUTS_COLS,axis=1,inplace=True)
df['Subproducts']=sub_products

1. We also drop the Timestamp and City columns, as they do not seem useful for our ML model.

2. Timestamp refers to the time and date when the person filled the survey 

3. City is the city of the person who filled the survey , we also have city tier column in our dataset which appears to be a better column for a ML model thus we drop city 

4. plus city column also has null values which will cause problems later on 

In [None]:
clean_df=df.drop(['Timestamp','City'],axis=1)

## We have our clean dataset ready! 

In [None]:
clean_df.tail(5)

# Exploratory data analysis (EDA)

In [None]:
clean_df.head(5)

In [None]:
clean_df.describe()

## Age 

In [None]:
def pdf(x):
    """Evaluate a normal density at every value of ``x``.

    The density's mean and standard deviation are taken from ``x`` itself
    (``np.mean`` and ``np.std``), so the result has the same shape as ``x``.
    """
    mu = np.mean(x)
    sigma = np.std(x)
    # Normalising constant 1 / (sigma * sqrt(2*pi)).
    scale = 1/(sigma * np.sqrt(2 * np.pi))
    exponent = - (x - mu)**2 / (2 * sigma**2)
    return scale * np.exp(exponent)

# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name was later removed, so pick whichever name this installation offers.
# The style is applied before the figure is created so it affects this figure.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(15,5))
fig.suptitle('Bell curve for Age vs Log(Age)')

# Left panel: fitted normal density evaluated at each respondent's age.
age_values = clean_df['Age']
ax1.scatter(age_values, pdf(age_values), marker = 'o',
            s = 25, color = 'red')
ax1.set(xlabel='Age',ylabel='PDF(Age)')
ax1.set_title('Bell curve of Age')

# Right panel: the same for log(Age); the y axis is the density of log(Age).
log_age_values = np.log(clean_df['Age'])
ax2.scatter(log_age_values, pdf(log_age_values), marker = 'o',
            s = 25, color = 'green')
ax2.set(xlabel='Log(Age)',ylabel='PDF(Log(Age))')
ax2.set_title('Bell curve of Log(Age)')

plt.show()

*   We will use log(age) column in our classifier as it appears to be more normally distributed than Age (age is positively skewed)



## Countplots

In [None]:
# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name was later removed, so pick whichever name this installation offers.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

fig, (ax1, ax2, ax3, ax4, ax5, ax6) = plt.subplots(1,6,figsize=(28,5))
fig.suptitle('Countplots')

# One countplot per categorical column: (axes, column name, x-axis label).
countplot_specs=[
    (ax1,'Gender','Gender'),
    (ax2,'Tier','Tier'),
    (ax3,'Martial_Status','Martial_Status'),
    (ax4,'Financial_Status','Financial_Status'),
    (ax5,'Cost_over_brand','Cost over brand'),
    (ax6,'Avg_Money_spend','Avg_Money_spend'),
]
for axis,column,xlabel in countplot_specs:
  sns.countplot(data=clean_df,x=column,ax=axis)
  axis.set(xlabel=xlabel)

# plt.xticks acts on the current axes only, i.e. the last subplot created.
plt.xticks(rotation=30)
plt.show()

Inferences:
  1. Responses from Tier 2 cities are the most in our dataset 
  2. Student and Regular Job people have filled the survey the most 
  3. Cost over brand is almost equally distributed between Yes and No , that means brand may or may not be a big deal for most survey participants 
  4. People on an average spend between 1000-5000 Rupees while buying a product (we will analyse this more)

## Avg Money spend (subcategory-wise)

In [None]:
# Most common spend bracket per product category. It is printed explicitly
# because a bare expression in the middle of a cell is never displayed.
print(clean_df.groupby('Product_Category')['Avg_Money_spend'].agg(pd.Series.mode).to_dict())
# In most categories people spend between 1000-5000 rupees in one shopping trip, but for books and OTT subscriptions they spend less than 1000
fig, ax = plt.subplots(figsize = (11,7))
sns.countplot(data=clean_df,x='Avg_Money_spend',hue='Product_Category',ax=ax)
ax.set_xlabel('Avg_Money_spend')
ax.set_ylabel('Frequency')
ax.set_title('Avg Money spend in different Product Categories',fontdict={'fontsize':22})
plt.legend(loc='upper right')
plt.show()

Inferences:
 1. People spend the most money on Clothing and Electronics.
 2. Books are in most of the cases being bought for less than 1000 rupees.
 3. Clothing is bought by the customer across all the three price ranges. 
 4. No records of OTT subs and Books in greater than 5000 category. 

## Gender vs Product Category

In [None]:
# Respondents per product category, split by gender.
# NOTE(review): the short tick labels are hard-coded; this assumes the plot has
# exactly these six categories in this order - confirm against the data.
labels=['Electronics','Fashion','Clothing','OTT','Books','Household goods']
fig, ax = plt.subplots(figsize=(11, 7))
sns.countplot(x='Product_Category', hue='Gender', data=clean_df, ax=ax)
ax.set(xlabel='Product Categories', ylabel='Count')
ax.set_title('Gender VS Product Categories', fontdict={'fontsize': 22})
ax.set_xticklabels(labels, rotation=30)
ax.legend(loc='upper right')
plt.show()

Inferences:
 1. The Electronics category is dominated by male customers.
 2. Similarly, Fashion is dominated by female customers.
 3. All other categories appear to be balanced, or we don't have enough data to make any assumptions.

## Tier of city VS Product Category

In [None]:
# Respondents per product category, split by city tier.
# NOTE(review): the short tick labels are hard-coded; this assumes the plot has
# exactly these six categories in this order - confirm against the data.
labels=['Electronics','Fashion','Clothing','OTT','Books','Household goods']
fig, ax = plt.subplots(figsize=(11, 7))
sns.countplot(x='Product_Category', hue='Tier', data=clean_df, ax=ax)
ax.set(xlabel='Product Categories', ylabel='Count')
ax.set_title('City Tier VS Product Categories', fontdict={'fontsize': 22})
ax.set_xticklabels(labels, rotation=30)
ax.legend(loc='upper right')
plt.show()

Inferences:
 1. Very few people from Tier-3 cities are interested in buying Fashion products.
 2. Tier 1 & 2 cities dominate all categories, especially clothing and electronics.
 3. All other categories appear to be balanced, or we don't have enough data to make any assumptions.

## Most popular product among product category 

Most popular products -> 
1. Books -> Fiction books 
2. Clothing -> T-shirts/Shoes
3. Electronics -> Headphones/Earphones 
4. OTT -> Amazon Prime & Disney Hotstar Subscription 

In [None]:
pd.DataFrame(clean_df.groupby(['Product_Category']).agg(pd.Series.mode))

Inference:
1.   Married People with regular job buy household goods with average cost below  ₹5000.
2.  Student tend to buy OTT subscriptions and Fashion products.
3. Most of the people take couple of weeks to decide which electronic product to buy. 
4. Books and Fashion products are bought mostly by females.
5. People buying electronics products are more brand conscious.
6. Average money spent on books and OTT subscriptions is less than  ₹1000 monthly.



## Shopping monthly Frequency

In [None]:
# Distribution of monthly shopping frequency for each product category.
# NOTE(review): the short tick labels are hard-coded; this assumes the plot has
# exactly these six categories in this order - confirm against the data.
labels=['Electronics','Fashion','Clothing','OTT','Books','Household goods']
fig, ax = plt.subplots(figsize=(11, 7))
sns.set_style("dark")
sns.boxplot(x='Product_Category', y='Shopping_monthly_Freq', data=clean_df, ax=ax)
ax.set(xlabel='Product Categories', ylabel='Shopping Monthly Frequency')
ax.set_title('Product Category VS Monthly shopping frequency', fontdict={'fontsize': 22})
ax.set_xticklabels(labels, rotation=30)
plt.show()

Inferences:
 1. Mean shopping frequency is around 3
 2. Household goods appear to be the most frequently bought item in a month
 3. Shopping frequency of electronics and books appear to be less than the others
 4. OTT subscriptions and Household items are the most frequently bought categories in a month.

# Data Transformation 

In [None]:
TIER_DICT={'Tier 1':1,'Tier 2':2,'Tier 3':3}
clean_df['Tier']=clean_df.Tier.map(TIER_DICT)

In [None]:
DUMMIES_COLS=['Gender','Martial_Status','Financial_Status','Cost_over_brand','Time_to_buy','Avg_Money_spend']
clean_df=pd.get_dummies(clean_df,columns=DUMMIES_COLS,drop_first=True)

In [None]:
clean_df['log_Age']=np.log(clean_df['Age'])

In [None]:
clean_df['State'].value_counts()

Dividing the states into 4 regions as per the info below: 
1. Region 1 NORTH - (Chandigarh, Delhi, Haryana, Himachal Pradesh, Jammu and Kashmir, Punjab, Rajasthan, Uttar Pradesh and Uttarakhand)

2. Region2 WEST - (Chhattisgarh, Dadar and Nagar, Diu and Daman, Goa, Gujarat, Madhya Pradesh, Maharashtra)

3. Region 3 SOUTH - (Andaman and Nicobar, Andhra Pradesh, Karnataka, Kerala, Pondicherry, Tamil Nadu, Telangana, Lakshadweep)

4. Region 4 EAST - (Arunachal Pradesh, Assam, Bihar, Jharkhand, Manipur, Meghalaya, Mizoram, Nagaland, Odisha, Sikkim, Tripura, West Bengal)

In [None]:
# States belonging to each of the four regions.
north_states = ['Chandigarh', 'Delhi', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Punjab', 'Rajasthan', 'Uttar Pradesh' , 'Uttarakhand']
west_states = ['Chhattisgarh', 'Dadra and Nagar Haveli and Daman and Diu', 'Goa', 'Gujarat', 'Madhya Pradesh', 'Maharashtra']
south_states = ['Andaman and Nicobar', 'Andhra Pradesh', 'Karnataka', 'Kerala', 'Pondicherry', 'Tamil Nadu', 'Telangana', 'Lakshadweep']
east_states =  ['Arunachal Pradesh', 'Assam', 'Bihar', 'Jharkhand', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Sikkim', 'Tripura', 'West Bengal']

# State -> region lookup. setdefault keeps the North/West/South/East priority
# if a state were ever listed twice.
region_by_state={}
for region_name,members in (('North',north_states),('West',west_states),
                            ('South',south_states),('East',east_states)):
  for member in members:
    region_by_state.setdefault(member,region_name)

# Any state missing from the four lists is labelled 'Error'.
regions=[region_by_state.get(state,'Error') for state in df['State'].values]

clean_df['Regions']=regions

In [None]:
# Respondents per product category, split by state region.
# NOTE(review): the short tick labels are hard-coded; this assumes the plot has
# exactly these six categories in this order - confirm against the data.
labels=['Electronics','Fashion','Clothing','OTT','Books','Household goods']
fig, ax = plt.subplots(figsize=(11, 7))
sns.countplot(x='Product_Category', hue='Regions', data=clean_df, ax=ax)
ax.set(xlabel='Product Categories', ylabel='Count')
ax.set_title('State Regions VS Product Categories', fontdict={'fontsize': 22})
ax.set_xticklabels(labels, rotation=30)
ax.legend(loc='upper right')
plt.show()

In [None]:
clean_df=pd.get_dummies(clean_df,columns=['Regions'],drop_first=True)
clean_df.drop('State',axis=1,inplace=True)

In [None]:
clean_df.head()

In [None]:
# Subproducts can be used to deduce which product is being demanded by the customers the most but it is not a useful column to predict the product category thus we drop it 
subproducts=df['Subproducts']
clean_df.drop('Subproducts',axis=1,inplace=True)

In [None]:
# Feature table: everything except the target and the raw Age column
# (log_Age is kept as the age feature).
df_train = clean_df.drop(['Product_Category','Age'],axis=1)
enc=LabelEncoder()
# LabelEncoder expects a 1-D array of labels, so pass the column as a Series
# rather than as a one-column DataFrame.
y_train = enc.fit_transform(clean_df['Product_Category'])

In [None]:
df_train.head()

In [None]:
print(enc.classes_)
print(len(y_train))

In [None]:
# making a preprocessing function for test dataset 
def preprocess_data(train,df):
  """Apply the training-time feature transformations to new rows.

  The new rows are stacked under ``train`` before encoding so that the dummy
  columns are derived from the combined data; only the transformed new rows
  (the last ``len(df)`` rows) are returned.

  Args:
    train: Raw rows providing context for the encoding. Must contain Age,
      State, City, Tier, Subproducts and the categorical columns encoded below.
    df: Raw rows to transform, with the same columns as ``train``.

  Returns:
    DataFrame holding the transformed rows of ``df``.
  """
  n_new = len(df)
  combined=pd.concat([train,df],axis=0)

  # Tier labels become their numeric rank.
  combined['Tier']=combined['Tier'].map({'Tier 1':1,'Tier 2':2,'Tier 3':3})

  categorical_cols=['Gender','Martial_Status','Financial_Status','Cost_over_brand','Time_to_buy','Avg_Money_spend']
  combined=pd.get_dummies(data=combined,columns=categorical_cols,drop_first=True)

  # Age is replaced by its natural logarithm.
  combined['log_Age']=np.log(combined['Age'])
  combined=combined.drop('Age',axis=1)

  region_members=(
      ('North',['Chandigarh', 'Delhi', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Punjab', 'Rajasthan', 'Uttar Pradesh' , 'Uttarakhand']),
      ('West',['Chhattisgarh', 'Dadra and Nagar Haveli and Daman and Diu', 'Goa', 'Gujarat', 'Madhya Pradesh', 'Maharashtra']),
      ('South',['Andaman and Nicobar', 'Andhra Pradesh', 'Karnataka', 'Kerala', 'Pondicherry', 'Tamil Nadu', 'Telangana', 'Lakshadweep']),
      ('East',['Arunachal Pradesh', 'Assam', 'Bihar', 'Jharkhand', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Sikkim', 'Tripura', 'West Bengal']),
  )
  # State -> region lookup; the first listing wins, unknown states map to 'Error'.
  region_of={}
  for region,states in region_members:
    for state in states:
      region_of.setdefault(state,region)

  combined['Regions']=[region_of.get(state,'Error') for state in combined['State'].values]
  print(combined['Regions'].value_counts())
  combined=pd.get_dummies(data=combined,columns=['Regions'],drop_first=True)

  # Columns that are not model features.
  combined=combined.drop(['City','State','Subproducts'],axis=1)
  return combined.tail(n_new)

# Model Selection 

In [None]:
model_dict={}

### Sampling Data ( train-test split or K-fold cross validation ) 

In [None]:
X_train=df_train.values
# Duplicate rows of every class except the largest until all classes are the
# same size. random_state is fixed so the resampled data set, and every score
# computed from it, is reproducible between runs.
# NOTE(review): resampling happens before any train/validation split, so copies
# of one row can land on both sides of a split and inflate the scores -
# consider resampling inside each training fold instead.
over = RandomOverSampler(sampling_strategy='not majority',random_state=7)
# fit and apply the transform
X, y = over.fit_resample(X_train, y_train)

In [None]:
print(Counter(y))
print(X.shape,y.shape)

In [None]:
scaler=MinMaxScaler()
X_scaled=scaler.fit_transform(X)

In [None]:
SEED = 7
N_SPLITS = 10
kfold = KFold(n_splits= N_SPLITS, random_state=SEED,shuffle=True)

## Naive Bayes

In [None]:
# Multinomial Naive Bayes, cross-validated on the unscaled features.
model_nb = MultinomialNB(alpha=1)
results_nb = cross_val_score(model_nb, X, y, scoring='f1_weighted', cv=kfold)
nb_mean, nb_std = results_nb.mean(), results_nb.std()
print("10 splits K fold F1 Score: %.3f ± %.3f" % (nb_mean, nb_std))
model_dict['Naive Bayes'] = nb_mean

## Logistic Regression 

In [None]:
# One-vs-rest logistic regression, cross-validated on the min-max scaled features.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases - confirm against the installed version.
model_lr = LogisticRegression(multi_class='ovr')
results_lr = cross_val_score(model_lr, X_scaled, y, scoring='f1_weighted', cv=kfold)
lr_mean, lr_std = results_lr.mean(), results_lr.std()
print("10 splits K fold F1 Score: %.3f ± %.3f" % (lr_mean, lr_std))
model_dict['Logistic Regression'] = lr_mean

## Support Vector Machines

In [None]:
# Support vector classifier with default settings, on the scaled features.
model_svc = SVC()
results_svc = cross_val_score(model_svc, X_scaled, y, scoring='f1_weighted', cv=kfold)
svc_mean, svc_std = results_svc.mean(), results_svc.std()
print("10 splits K fold F1 Score: %.3f ± %.3f" % (svc_mean, svc_std))
model_dict['Support Vector Machines'] = svc_mean

## K-Nearest Neighbour 

In [None]:
# 7-nearest-neighbour classifier, on the scaled features.
model_knn = KNeighborsClassifier(n_neighbors=7)
results_knn = cross_val_score(model_knn, X_scaled, y, scoring='f1_weighted', cv=kfold)
knn_mean, knn_std = results_knn.mean(), results_knn.std()
print("10 splits K fold F1 Score: %.3f ± %.3f" % (knn_mean, knn_std))
model_dict['Nearest_neighbour'] = knn_mean

## Decision Trees

In [None]:
# Single decision tree (Gini criterion), on the unscaled features.
model_dt = DecisionTreeClassifier(criterion='gini')
results_dt = cross_val_score(model_dt, X, y, scoring='f1_weighted', cv=kfold)
dt_mean, dt_std = results_dt.mean(), results_dt.std()
print("10 splits K fold F1 Score: %.3f ± %.3f" % (dt_mean, dt_std))
model_dict['Decision Trees'] = dt_mean

## Random Forest - A bagging technique on Trees

In [None]:
# Random forest of 100 Gini trees, on the unscaled features.
model_rf = RandomForestClassifier(n_estimators=100, criterion='gini')
results_rf = cross_val_score(model_rf, X, y, scoring='f1_weighted', cv=kfold)
rf_mean, rf_std = results_rf.mean(), results_rf.std()
print("10 splits K fold F1 Score: %.3f ± %.3f" % (rf_mean, rf_std))
model_dict['Random Forest'] = rf_mean

## LightGBM ( gradient Boosting )

In [None]:
# LightGBM gradient boosting with 200 estimators, on the unscaled features.
model_gbm = LGBMClassifier(n_estimators=200)
results_gbm = cross_val_score(model_gbm, X, y, scoring='f1_weighted', cv=kfold)
gbm_mean, gbm_std = results_gbm.mean(), results_gbm.std()
print("10 splits K fold F1 Score: %.3f ± %.3f" % (gbm_mean, gbm_std))
model_dict['Gradient Boosting'] = gbm_mean

## Comparing Models

In [None]:
sns.set_theme(style="whitegrid")
sns.set_color_codes("pastel")
f, ax = plt.subplots(figsize=(15,6))

sns.barplot(x=list(model_dict.values()),y=list(model_dict.keys()))
ax.set(xlabel='F1_weighted Score',ylabel='Model Name',title='Comparing F1 score for different ML models')
plt.show()
# Random forest or gradient boosting seems to be the best model for our data

# Final Model building 

### Building Final Model - Random Forest 

In [None]:
SEED = 7
N_SPLITS = 10
kfold = KFold(n_splits= N_SPLITS, random_state=SEED,shuffle=True)
# Mean weighted F1 for each candidate forest size.
scores={}
num_trees=[50,100,150,200,250,300]
for nt in num_trees:
  # Seeded so that score differences come from the tree count, not from chance.
  model = RandomForestClassifier(n_estimators=nt,random_state=SEED)
  # cv=kfold: without it cross_val_score silently uses its default splitter
  # and the kfold object built above is never used.
  f1_scores=cross_val_score(model,X,y,cv=kfold,scoring='f1_weighted')
  scores[nt]=f1_scores.mean()

In [None]:
fig, ax = plt.subplots(figsize = (11,7))
sns.lineplot(y=list(scores.values()),x=list(scores.keys()),ax=ax)
ax.set_xlabel('No.of Trees')
ax.set_ylabel('F1 Weighted Score')
ax.set_title('No of Tress Vs F1 Weighted Score',fontdict={'fontsize':22})
plt.show();

In [None]:
# Final model with No.of Trees = 100
# stratify=y keeps every class present in the 10% hold-out split, which the
# per-class report (one target name per class) relies on.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=21,stratify=y)
BEST_NUM_TREES = 100
# Seeded so the fitted forest and the reported scores are reproducible.
final_model = RandomForestClassifier(n_estimators= BEST_NUM_TREES,random_state=21)
final_model.fit(X_train,y_train)

In [None]:
y_pred_train=final_model.predict(X_train)
y_pred_test=final_model.predict(X_test)

In [None]:
# Weighted precision / recall / F1 for the training split and the hold-out split.
metric_fns = (("Precision:", precision_score),
              ("Recall:", recall_score),
              ("F1 Score:", f1_score))
report_parts = (("Train Score", y_train, y_pred_train),
                ("Test Score", y_test, y_pred_test))

print("======== Final Report ================")
for position, (heading, truth, predicted) in enumerate(report_parts):
  if position:
    # Separator between the two sections.
    print("######################################")
  print(heading)
  for metric_label, metric_fn in metric_fns:
    print(metric_label, metric_fn(truth, predicted, average='weighted'))
print("================ END ==================")

### Classification report on test data

In [None]:
print(classification_report(y_test,y_pred_test,target_names=list(enc.classes_)))

### Predicting on new data 

In [None]:
df.head()

In [None]:
test_vector={ 
    'Age':[23],
    'Gender':['Female'],  
    'Tier': ['Tier 2'],
    'State': ['Uttar Pradesh'], 
    'City': ['Lucknow'], 
    'Martial_Status': ['Not Married'], 
    'Financial_Status': ['Student'], 
    'Cost_over_brand': ['No'],  
    'Time_to_buy': ['Couple of Weeks'], 
    'Shopping_monthly_Freq': [3], 
    'Avg_Money_spend': ['Between ₹1000-₹5000'], 
    'Subproducts':['Shoes'],
}
test_df=pd.DataFrame.from_dict(test_vector)

In [None]:
train_preprocess = df.drop(['Timestamp','Product_Category'],axis=1)
test=preprocess_data(train_preprocess,test_df)

In [None]:
print(test.shape)
print(test.columns)
print(df_train.shape)
print(df_train.columns)

In [None]:
X_test = test.values
product_num=final_model.predict(X_test)
display(test_df)
print()
print("Predicted Product Category : {0}".format(enc.inverse_transform(product_num)))

# The most impactful factor 

### Using statistical methods 

In [None]:
# Only the per-feature f_classif scores are read from this selector, so keep
# every feature (k='all') rather than a hard-coded k=18 that is only valid
# when the feature table has at least 18 columns.
k_best=SelectKBest(score_func=f_classif,k='all')
fit = k_best.fit(X,y)
# summarize scores
print(fit.scores_)

In [None]:
sns.set_theme(style="dark")
sns.set_color_codes("dark")
f, ax = plt.subplots(figsize=(15,8))

sns.barplot(x=list(fit.scores_),y=list(df_train.columns))
ax.set(xlabel='Statistical Feature Imp score',ylabel='Feature Name',title='Which Feature has the most impact?')
plt.show()

### Using Our final trained Random Forest

In [None]:
sns.set_theme(style="whitegrid")
sns.set_color_codes("pastel")
f, ax = plt.subplots(figsize=(15,8))

sns.barplot(x=list(final_model.feature_importances_),y=list(df_train.columns))
ax.set(xlabel='Random Forest Feature Imp score',ylabel='Feature Name',title='Which Feature has the most impact?')
plt.show()

As per our **Random Forest Model** , *Top 3 most impactful columns* are:-


1.   Age 
2.   Shopping monthly frequency 
3.   City Tier 



As per our **Statistical methods** , *Top 4 most impactful columns* are:-


1.   Shopping monthly frequency 
2.   Martial Status 
3.   Avg Money spend 
4.   Age & Gender

################################# END #################################