In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter
import plotly.express as px 


In [None]:
# Load the training CSV from the relative Kaggle-style input directory.
data = pd.read_csv("../input/customer-analytics/Train.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Tally the raw values of the target column.
Counter(data['Reached.on.Time_Y.N'])

In [None]:
# Relabel the target for the plots below: 0 -> 'Yes', 1 -> 'No'.
# NOTE: not idempotent - Series.map turns any value missing from the dict into
# NaN, so running this cell a second time wipes the column.
data['Reached.on.Time_Y.N'] = data['Reached.on.Time_Y.N'].map({0:'Yes',1:'No'})

In [None]:
# Preview again to confirm the relabelled target.
data.head()

In [None]:
# Tally the target again, now keyed by the 'Yes'/'No' labels.
Counter(data['Reached.on.Time_Y.N'])

In [None]:
# Column dtypes, non-null counts and memory footprint.
data.info()

In [None]:
# Number of missing values per column.
data.isnull().sum()

#### Checking whether the dataset is balanced

In [None]:
# Bar chart of the number of rows per target label (class-balance check).
data.groupby(['Reached.on.Time_Y.N'])['Reached.on.Time_Y.N'].count().plot.bar()

In [None]:
# Preview of the frame (unchanged since the last preview).
data.head()

In [None]:
# Product cost per shipment mode, split by gender, one panel per target label.
sns.catplot(x='Mode_of_Shipment',y='Cost_of_the_Product',hue='Gender',col='Reached.on.Time_Y.N',data=data,kind='bar')

In [None]:
# Row counts per shipment mode, split by gender, one panel per target label.
sns.catplot(x='Mode_of_Shipment',hue='Gender',col='Reached.on.Time_Y.N',data=data,kind='count')

In [None]:
# Preview of the frame (unchanged since the last preview).
data.head()

In [None]:
# Row counts per warehouse block, split by gender, one panel per target label.
sns.catplot(x='Warehouse_block',hue='Gender',col='Reached.on.Time_Y.N',data=data,kind='count')

In [None]:
# Row counts per shipment mode, one panel per target label (no gender split).
sns.catplot(x='Mode_of_Shipment',col='Reached.on.Time_Y.N',data=data,kind='count')

In [None]:
# Count the rows per target label and expose the result as a two-column
# frame ('Time' = label, 'Values' = row count) for the pie chart.
reached = (
    data['Reached.on.Time_Y.N']
    .value_counts()
    .reset_index()
    .set_axis(['Time', 'Values'], axis=1)
)
# Pie chart of the label shares; left as the last expression so it renders.
px.pie(
    reached,
    names='Time',
    values='Values',
    color_discrete_sequence=px.colors.sequential.Darkmint_r,
)

In [None]:
# Distribution of product cost: normalised histogram with a KDE overlay.
# sns.distplot is deprecated (it warns on every call and is slated for removal);
# histplot with stat='density' draws the same kind of density-scaled histogram.
sns.histplot(data['Cost_of_the_Product'], kde=True, stat='density')

In [None]:
# Distinct shipment modes, in order of first appearance, as a plain list.
xs = data['Mode_of_Shipment'].unique().tolist()
print(xs)

In [None]:
# Bar chart of the number of non-null Weight_in_gms entries per shipment mode.
data.groupby(['Mode_of_Shipment'])['Weight_in_gms'].count().plot.bar()

In [None]:
# Preview of the frame (unchanged since the last preview).
data.head()

In [None]:
# Pairwise scatter/density grid of the numeric columns, coloured by target label.
# pairplot is a figure-level function: it always builds its own figure, so a
# preceding plt.figure(figsize=...) only produced a stray empty figure and did
# not resize the grid. Use pairplot's `height`/`aspect` arguments to size facets.
sns.pairplot(data=data, hue='Reached.on.Time_Y.N')
plt.show()

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(20, 15))
# Restrict to numeric columns explicitly: the frame still holds string columns
# at this point and DataFrame.corr() raises on them under pandas >= 2.0.
numeric_corr = data.select_dtypes(include='number').corr()
# Correlations live in [-1, 1]; vmin=0 would paint every negative coefficient
# with the same colour as zero, so use the full range and a diverging palette.
sns.heatmap(numeric_corr, annot=True, vmin=-1, vmax=1, cmap='coolwarm',
            linewidths=3, linecolor='blue')

In [None]:
# Collect every numeric column whose correlation with at least one column
# that precedes it (lower triangle of the matrix) reaches the threshold.
feat = set()
# Select numeric columns explicitly: DataFrame.corr() raises on the string
# columns still present in `data` under pandas >= 2.0.
corr_data = data.select_dtypes(include='number').corr()
for i in range(len(corr_data.columns)):
    for j in range(i):
        # NOTE(review): the test is signed, so negative correlations never
        # qualify; confirm whether abs(...) was intended.
        if corr_data.iloc[i, j] >= 0.05:
            feat.add(corr_data.columns[i])
print(feat)

In [None]:
# Preview of the frame before preparing model inputs.
data.head()

In [None]:
# Convert the target back to integers: 'Yes' -> 0, 'No' -> 1 (inverse of the
# earlier relabelling). NOTE: not idempotent - a second run maps to NaN.
data['Reached.on.Time_Y.N'] = data['Reached.on.Time_Y.N'].map({'Yes':0,'No':1})

In [None]:
# Features = every column except the last; target = the last column.
# assumes 'Reached.on.Time_Y.N' is the last column of the CSV - TODO confirm.
# NOTE: X still contains the 'ID' column here; it is only dropped further down.
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [None]:
# One-hot encode the non-numeric feature columns, dropping the first level of each.
X = pd.get_dummies(X,drop_first=True)

In [None]:
# Preview the encoded feature frame.
X.head()

In [None]:
# NOTE: despite the name `LE`, this is a StandardScaler, not a LabelEncoder.
LE = StandardScaler()


In [None]:
# Standardise every feature column and rebuild a DataFrame with the same column
# names (the result gets a fresh default integer index).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows contribute to the scaling statistics.
X = pd.DataFrame(LE.fit_transform(X),columns=X.columns)

In [None]:
# Preview the standardised feature frame.
X.head()

In [None]:
from sklearn.feature_selection import mutual_info_classif
# Mutual information between each feature column and the target.
# The estimator is stochastic, so pin the seed to make the ranking reproducible
# across kernel restarts.
mutual_info = mutual_info_classif(X, y, random_state=42)
mutal_data = pd.Series(mutual_info, index=X.columns)
# Display the scores from least to most informative.
mutal_data.sort_values(ascending=True)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Rank features by impurity-based importance from an extra-trees ensemble.
# Seeded so the importances (and therefore the plotted top 10) are reproducible;
# matplotlib is already imported at the top of the notebook.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)
ranked_feat = pd.Series(model.feature_importances_, index=X.columns)
# Horizontal bar chart of the ten most important features.
ranked_feat.nlargest(10).plot(kind='barh')
plt.show()

In [None]:
# Remove the row identifier before modelling.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps the cell
# idempotent: the in-place version raised KeyError when the cell was re-run.
X = X.drop(columns=['ID'], errors='ignore')

In [None]:
# First hold out 20% of the rows as the final test set ...
XX_train, X_test, yy_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
# ... then carve 20% of the remainder off as the validation ("cv") split.
X_train, X_cv, y_train, y_cv = train_test_split(
    XX_train, yy_train, random_state=42, test_size=0.2
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Randomised hyper-parameter search for the random forest, run on the
# validation split with 6-fold cross-validation.
# Both the forest and the parameter sampler are seeded so the search is
# reproducible on Restart & Run All.
RF = RandomForestClassifier(random_state=42)
RF_param = {'n_estimators': range(1, 500, 10), 'max_depth': range(1, 50)}
# 'accuracy' replaces 'neg_mean_squared_error': MSE is a regression scorer; on
# 0/1 labels it ranks candidates identically, but best_score_ is now reported as
# an accuracy, the same metric used to compare the final models.
RF_search = RandomizedSearchCV(RF, RF_param, scoring='accuracy', cv=6,
                               n_jobs=-1, random_state=42)
RF_search.fit(X_cv, y_cv)

In [None]:
# Report the winning parameter combination, its mean cross-validated score
# (in units of the `scoring` passed to the search) and the refitted estimator.
print(RF_search.best_params_)
print(RF_search.best_score_)
print(RF_search.best_estimator_)


In [None]:
# Train the final random forest with the hyper-parameters the search selected.
# The previous hard-coded values (n_estimators=11, max_depth=15) were copied
# from one particular run and went stale whenever the search was re-run.
RF_result = RandomForestClassifier(**RF_search.best_params_, random_state=42)
RF_result.fit(X_train, y_train)
# Predictions on the held-out test split.
pred1 = RF_result.predict(X_test)

In [None]:
# Test-set accuracy of the random forest.
Acc1 = accuracy_score(y_test,pred1)

In [None]:
# Show the random-forest accuracy.
print(Acc1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Exhaustive search over k = 1..19 for k-nearest-neighbours, 10-fold CV.
KN = KNeighborsClassifier()
param_grid = {'n_neighbors': range(1, 20)}
# 'accuracy' replaces 'neg_mean_squared_error': MSE is a regression scorer; on
# 0/1 labels the candidate ranking is unchanged, but best_score_ is now reported
# as an accuracy, the same metric used to compare the final models.
grid = GridSearchCV(KN, param_grid, cv=10, scoring='accuracy')

In [None]:
# Run the KNN grid search on the validation split.
grid.fit(X_cv,y_cv)

In [None]:
# Report the winning n_neighbors, its mean cross-validated score (in units of
# the `scoring` passed to the search) and the refitted estimator.
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

In [None]:
# Train the final KNN model with the neighbour count the grid search selected,
# instead of a hard-coded value copied from an earlier run.
KNN = KNeighborsClassifier(**grid.best_params_)
KNN.fit(X_train, y_train)

In [None]:
# Score the fitted KNN model on the held-out test split.
pred2 = KNN.predict(X_test)
Acc2 = accuracy_score(y_test, pred2)
# Print the stored value rather than computing the same score a second time.
print(Acc2)

In [None]:
# Search space for the decision tree: split criterion and depths 1 through 9.
param_deci = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(1, 10),
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Exhaustive search over criterion x depth for a decision tree, 10-fold CV on
# the validation split (default classifier scoring, i.e. accuracy).
# The tree is seeded: scikit-learn permutes features at every split, so an
# unseeded tree can make the selected parameters change between runs.
DT = DecisionTreeClassifier(random_state=42)
grid_s = GridSearchCV(DT, param_deci, cv=10, n_jobs=-1)
grid_s.fit(X_cv, y_cv)

In [None]:
# Report the winning criterion/depth, its mean cross-validated score and the
# refitted estimator.
print(grid_s.best_params_)
print(grid_s.best_score_)
print(grid_s.best_estimator_)

In [None]:
# Final decision tree built from the parameters the grid search selected,
# instead of hard-coded values copied from an earlier run; seeded so the fitted
# tree is reproducible.
DTT = DecisionTreeClassifier(**grid_s.best_params_, random_state=42)

In [None]:
# Fit the final decision tree on the training split.
DTT.fit(X_train,y_train)

In [None]:
# Predictions on the held-out test split.
pred3=DTT.predict(X_test)

In [None]:
# Test-set accuracy of the decision tree; print the stored value rather than
# computing the same score a second time.
Acc3 = accuracy_score(y_test, pred3)
print(Acc3)

In [None]:
# Side-by-side summary of the three models' test accuracies.
Result = pd.DataFrame(
    dict(
        Algo=['RandomForest', 'KNN', 'DecisionTree'],
        Accuracy=[Acc1, Acc2, Acc3],
    )
)

In [None]:
# Render the summary table.
Result

In [None]:
# Bar chart comparing test accuracy across the three algorithms.
sns.catplot(x='Algo',y='Accuracy',data=Result,kind='bar')