# Exploratory Data Analysis Project : Hotel Data set

In [1]:
import pandas as pd 
import numpy as np 
import plotly.express as px
import plotly.graph_objs as go

%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
#from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

#!pip install sort-dataframeby-monthorweek
#!pip install sorted-months-weekdays

# Model Building (Cancellation prediction)

## Data Cleaning

In [242]:
# reading data
df = pd.read_csv('./input/hotel_bookings.csv')
pd.set_option('display.max_columns',None)
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [243]:
df.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [244]:
df  = df.dropna(axis=1)

In [245]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0,BB,Direct,Direct,0,0,0,C,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0,BB,Direct,Direct,0,0,0,C,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0,BB,Direct,Direct,0,0,0,A,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0,BB,Corporate,Corporate,0,0,0,A,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0,BB,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [246]:
df.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
babies                            0
meal                              0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
reservation_status                0
reservation_status_date     

In [247]:
df.shape

(119390, 28)

In [248]:
df.corr()['is_canceled']

is_canceled                       1.000000
lead_time                         0.293123
arrival_date_year                 0.016660
arrival_date_week_number          0.008148
arrival_date_day_of_month        -0.006130
stays_in_weekend_nights          -0.001791
stays_in_week_nights              0.024765
adults                            0.060017
babies                           -0.032491
is_repeated_guest                -0.084793
previous_cancellations            0.110133
previous_bookings_not_canceled   -0.057358
booking_changes                  -0.144381
days_in_waiting_list              0.054186
adr                               0.047557
required_car_parking_spaces      -0.195498
total_of_special_requests        -0.234658
Name: is_canceled, dtype: float64

In [249]:
df.drop(columns=['arrival_date_day_of_month','stays_in_weekend_nights','babies','is_repeated_guest','previous_bookings_not_canceled','booking_changes','required_car_parking_spaces','total_of_special_requests'],inplace=True)

df['reservation_status_date'] = pd.DatetimeIndex(df['reservation_status_date'])

In [250]:
df.dtypes

hotel                               object
is_canceled                          int64
lead_time                            int64
arrival_date_year                    int64
arrival_date_month                  object
arrival_date_week_number             int64
stays_in_week_nights                 int64
adults                               int64
meal                                object
market_segment                      object
distribution_channel                object
previous_cancellations               int64
reserved_room_type                  object
assigned_room_type                  object
deposit_type                        object
days_in_waiting_list                 int64
customer_type                       object
adr                                float64
reservation_status                  object
reservation_status_date     datetime64[ns]
dtype: object

In [251]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,stays_in_week_nights,adults,meal,market_segment,distribution_channel,previous_cancellations,reserved_room_type,assigned_room_type,deposit_type,days_in_waiting_list,customer_type,adr,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,0,2,BB,Direct,Direct,0,C,C,No Deposit,0,Transient,0.0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,0,2,BB,Direct,Direct,0,C,C,No Deposit,0,Transient,0.0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,1,BB,Direct,Direct,0,A,C,No Deposit,0,Transient,75.0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,1,BB,Corporate,Corporate,0,A,A,No Deposit,0,Transient,75.0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,2,2,BB,Online TA,TA/TO,0,A,A,No Deposit,0,Transient,98.0,Check-Out,2015-07-03


In [252]:
for col in df.select_dtypes(include='object').columns:
    print('\n''---'+col+'---')
    a=df[col].unique()
    print(a)


---hotel---
['Resort Hotel' 'City Hotel']

---arrival_date_month---
['July' 'August' 'September' 'October' 'November' 'December' 'January'
 'February' 'March' 'April' 'May' 'June']

---meal---
['BB' 'FB' 'HB' 'SC' 'Undefined']

---market_segment---
['Direct' 'Corporate' 'Online TA' 'Offline TA/TO' 'Complementary' 'Groups'
 'Undefined' 'Aviation']

---distribution_channel---
['Direct' 'Corporate' 'TA/TO' 'Undefined' 'GDS']

---reserved_room_type---
['C' 'A' 'D' 'E' 'G' 'F' 'H' 'L' 'P' 'B']

---assigned_room_type---
['C' 'A' 'D' 'E' 'G' 'F' 'I' 'B' 'H' 'P' 'L' 'K']

---deposit_type---
['No Deposit' 'Refundable' 'Non Refund']

---customer_type---
['Transient' 'Contract' 'Transient-Party' 'Group']

---reservation_status---
['Check-Out' 'Canceled' 'No-Show']


In [253]:
hotel =  pd.get_dummies(df.hotel)
df = pd.concat([df,hotel],axis='columns')

hotel2 =  pd.get_dummies(df.arrival_date_month)
df = pd.concat([df,hotel2],axis='columns')

hotel3 =  pd.get_dummies(df.meal)
df = pd.concat([df,hotel3],axis='columns')

hotel4 =  pd.get_dummies(df.market_segment)
df = pd.concat([df,hotel4],axis='columns')

hotel5 =  pd.get_dummies(df.distribution_channel)
df = pd.concat([df,hotel5],axis='columns')

hotel6 =  pd.get_dummies(df.reserved_room_type)
df = pd.concat([df,hotel6],axis='columns')

hotel7 =  pd.get_dummies(df.assigned_room_type)
df = pd.concat([df,hotel7],axis='columns')

hotel8 =  pd.get_dummies(df.deposit_type)
df = pd.concat([df,hotel8],axis='columns')

hotel9 =  pd.get_dummies(df.customer_type)
df = pd.concat([df,hotel9],axis='columns')

hotel10 =  pd.get_dummies(df.reservation_status)
df = pd.concat([df,hotel10],axis='columns')




In [254]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,stays_in_week_nights,adults,meal,market_segment,distribution_channel,previous_cancellations,reserved_room_type,assigned_room_type,deposit_type,days_in_waiting_list,customer_type,adr,reservation_status,reservation_status_date,City Hotel,Resort Hotel,April,August,December,February,January,July,June,March,May,November,October,September,BB,FB,HB,SC,Undefined,Aviation,Complementary,Corporate,Direct,Groups,Offline TA/TO,Online TA,Undefined.1,Corporate.1,Direct.1,GDS,TA/TO,Undefined.2,A,B,C,D,E,F,G,H,L,P,A.1,B.1,C.1,D.1,E.1,F.1,G.1,H.1,I,K,L.1,P.1,No Deposit,Non Refund,Refundable,Contract,Group,Transient,Transient-Party,Canceled,Check-Out,No-Show
0,Resort Hotel,0,342,2015,July,27,0,2,BB,Direct,Direct,0,C,C,No Deposit,0,Transient,0.0,Check-Out,2015-07-01,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
1,Resort Hotel,0,737,2015,July,27,0,2,BB,Direct,Direct,0,C,C,No Deposit,0,Transient,0.0,Check-Out,2015-07-01,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
2,Resort Hotel,0,7,2015,July,27,1,1,BB,Direct,Direct,0,A,C,No Deposit,0,Transient,75.0,Check-Out,2015-07-02,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
3,Resort Hotel,0,13,2015,July,27,1,1,BB,Corporate,Corporate,0,A,A,No Deposit,0,Transient,75.0,Check-Out,2015-07-02,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
4,Resort Hotel,0,14,2015,July,27,2,2,BB,Online TA,TA/TO,0,A,A,No Deposit,0,Transient,98.0,Check-Out,2015-07-03,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0


In [255]:
df.select_dtypes(include='object').columns

Index(['hotel', 'arrival_date_month', 'meal', 'market_segment',
       'distribution_channel', 'reserved_room_type', 'assigned_room_type',
       'deposit_type', 'customer_type', 'reservation_status'],
      dtype='object')

In [256]:
df.drop(columns=['hotel', 'arrival_date_month', 'meal', 'market_segment',
       'distribution_channel', 'reserved_room_type', 'assigned_room_type',
       'deposit_type', 'customer_type', 'reservation_status',],inplace=True)

In [257]:
df

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,stays_in_week_nights,adults,previous_cancellations,days_in_waiting_list,adr,reservation_status_date,City Hotel,Resort Hotel,April,August,December,February,January,July,June,March,May,November,October,September,BB,FB,HB,SC,Undefined,Aviation,Complementary,Corporate,Direct,Groups,Offline TA/TO,Online TA,Undefined.1,Corporate.1,Direct.1,GDS,TA/TO,Undefined.2,A,B,C,D,E,F,G,H,L,P,A.1,B.1,C.1,D.1,E.1,F.1,G.1,H.1,I,K,L.1,P.1,No Deposit,Non Refund,Refundable,Contract,Group,Transient,Transient-Party,Canceled,Check-Out,No-Show
0,0,342,2015,27,0,2,0,0,0.00,2015-07-01,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
1,0,737,2015,27,0,2,0,0,0.00,2015-07-01,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
2,0,7,2015,27,1,1,0,0,75.00,2015-07-02,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
3,0,13,2015,27,1,1,0,0,75.00,2015-07-02,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
4,0,14,2015,27,2,2,0,0,98.00,2015-07-03,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2017,35,5,2,0,0,96.14,2017-09-06,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
119386,0,102,2017,35,5,3,0,0,225.43,2017-09-07,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
119387,0,34,2017,35,5,2,0,0,157.71,2017-09-07,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
119388,0,109,2017,35,5,2,0,0,104.40,2017-09-07,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0


In [258]:
df_x = df.drop(['is_canceled','reservation_status_date'],axis=1)
df_y = df['is_canceled']

In [259]:
df_x['lead_time'] = np.log(df_x['lead_time'] + 1)
df_x['arrival_date_year'] = np.log(df_x['arrival_date_year'] + 1)
df_x['arrival_date_week_number'] = np.log(df_x['arrival_date_week_number'] + 1)
df_x['days_in_waiting_list'] = np.log(df_x['days_in_waiting_list'] + 1)
df_x['adr'] = np.log(df_x['adr'] + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x['lead_time'] = np.log(df_x['lead_time'] + 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x['arrival_date_year'] = np.log(df_x['arrival_date_year'] + 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x['arrival_date_week_number'] = np.log(df_x['arrival_date_week_number'] + 1)
A value i

In [260]:
df_x['adr']=df_x['adr'].fillna(value=df['adr'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x['adr']=df_x['adr'].fillna(value=df['adr'].mean())


## MOdel Fitting


In [273]:

X_train,X_test,y_train,y_test = train_test_split(df_x,df_y,test_size=0.30)

In [274]:
#Logestic REgression

lr = LogisticRegression()
lr.fit(X_train.values, y_train.values)

y_pred_lr = lr.predict(X_test.values)

acc_lr = accuracy_score(y_test, y_pred_lr)
conf = confusion_matrix(y_test, y_pred_lr)
clf_report = classification_report(y_test, y_pred_lr)

print(f"Accuracy Score of Logistic Regression is : {acc_lr}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")


Accuracy Score of Logistic Regression is : 1.0
Confusion Matrix : 
[[22562     0]
 [    0 13255]]
Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22562
           1       1.00      1.00      1.00     13255

    accuracy                           1.00     35817
   macro avg       1.00      1.00      1.00     35817
weighted avg       1.00      1.00      1.00     35817

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [276]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

acc_knn = accuracy_score(y_test, y_pred_knn)
conf = confusion_matrix(y_test, y_pred_knn)
clf_report = classification_report(y_test, y_pred_knn)

print(f"Accuracy Score of KNN is : {acc_knn}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

Accuracy Score of KNN is : 0.9852583968506575
Confusion Matrix : 
[[22526    36]
 [  492 12763]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     22562
           1       1.00      0.96      0.98     13255

    accuracy                           0.99     35817
   macro avg       0.99      0.98      0.98     35817
weighted avg       0.99      0.99      0.99     35817



In [268]:
%%time 
rd_clf = RandomForestClassifier()
rd_clf.fit(X_train, y_train)

y_pred_rd_clf = rd_clf.predict(X_test)

acc_rd_clf = accuracy_score(y_test, y_pred_rd_clf)
conf = confusion_matrix(y_test, y_pred_rd_clf)
clf_report = classification_report(y_test, y_pred_rd_clf)

print(f"Accuracy Score of Random Forest is : {acc_rd_clf}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

Accuracy Score of Random Forest is : 1.0
Confusion Matrix : 
[[14918     0]
 [    0  8960]]
Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14918
           1       1.00      1.00      1.00      8960

    accuracy                           1.00     23878
   macro avg       1.00      1.00      1.00     23878
weighted avg       1.00      1.00      1.00     23878

Wall time: 5.46 s


In [269]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

y_pred_dtc = dtc.predict(X_test)

acc_dtc = accuracy_score(y_test, y_pred_dtc)
conf = confusion_matrix(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)

print(f"Accuracy Score of Decision Tree is : {acc_dtc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

Accuracy Score of Decision Tree is : 1.0
Confusion Matrix : 
[[14918     0]
 [    0  8960]]
Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14918
           1       1.00      1.00      1.00      8960

    accuracy                           1.00     23878
   macro avg       1.00      1.00      1.00     23878
weighted avg       1.00      1.00      1.00     23878



In [279]:
models = pd.DataFrame({
    'Model' : ['Logistic Regression', 'KNN', 'Decision Tree Classifier', 'Random Forest Classifier'],
    'Score' : [acc_lr, acc_knn, acc_dtc, acc_rd_clf]
})


models.sort_values(by = 'Score', ascending = False)

Unnamed: 0,Model,Score
0,Logistic Regression,1.0
2,Decision Tree Classifier,1.0
3,Random Forest Classifier,1.0
1,KNN,0.985258
