In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# in the order os.walk visits them, and print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Dataset and Preparation

In [None]:
# Train and test CSVs live in the same Kaggle dataset folder; the first column
# of each file is used as the row index.
DATA_DIR = "/kaggle/input/airline-passenger-satisfaction"
airline, test = (
    pd.read_csv(f"{DATA_DIR}/{csv_name}", index_col=0)
    for csv_name in ("train.csv", "test.csv")
)
airline.head()

In [None]:
# Preview the first rows of the raw training frame.
airline.head()

In [None]:
def conv_cat(data):
    """Return a numerically encoded copy of an airline-satisfaction frame.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Map the known categorical columns ('Customer Type', 'Type of Travel',
         'Class', 'satisfaction') to integer codes. Values that are not in a
         mapping become NaN at this stage.
      2. One-hot encode whatever string columns remain via ``pd.get_dummies``.
      3. Drop the 'id' column.
      4. Replace every remaining NaN with 0 (this includes the unmapped
         category values from step 1).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the four mapped columns and an 'id' column.

    Returns
    -------
    pandas.DataFrame
        New encoded frame.

    Raises
    ------
    KeyError
        If one of the mapped columns or 'id' is missing.
    """
    encodings = {
        'Customer Type': {'Loyal Customer': 1, 'disloyal Customer': 0},
        'Type of Travel': {'Personal Travel': 0, 'Business travel': 1},
        'Class': {'Business': 2, 'Eco Plus': 1, 'Eco': 0},
        'satisfaction': {'neutral or dissatisfied': 0, 'satisfied': 1},
    }
    # Work on a copy so the caller's frame keeps its raw string columns.
    encoded = data.copy()
    for column, mapping in encodings.items():
        encoded[column] = encoded[column].map(mapping)
    encoded = pd.get_dummies(encoded)
    encoded = encoded.drop('id', axis=1)
    return encoded.fillna(0)
# Encode both frames with the same helper. Note that `airline` is rebound to its
# encoded form here, so re-running this cell without first re-reading the CSV
# calls conv_cat on a frame that no longer has the raw columns (e.g. 'id').
airline = conv_cat(airline)
airline_test = conv_cat(test)

In [None]:
# Preview the first rows of the encoded training frame.
airline.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the binary target column.
X = airline.drop("satisfaction",axis=1)
y = airline["satisfaction"]

np.random.seed(42)
# Pin the shuffle explicitly instead of relying on whatever state the global
# NumPy RNG happens to be in when this cell runs (e.g. after out-of-order
# execution); 20% of the rows are held out for evaluation.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

# Fitting Model and Scoring Parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the global NumPy RNG, which the forest draws from because no
# random_state is given.
np.random.seed(42)
# Baseline model: default hyperparameters, fitted on the training split
# (fit returns the estimator itself).
clf = RandomForestClassifier().fit(X_train, y_train)
# Mean accuracy on the held-out split.
clf.score(X_test, y_test)

In [None]:
# Hard class predictions for the held-out split; show the first 20.
y_pred = clf.predict(X=X_test)
y_pred[0:20]

In [None]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class probability estimates for the first 20 held-out rows.
clf.predict_proba(X_test[:20])

# Cross Validation Score and ROC Curve

In [None]:
from sklearn.model_selection import cross_val_score

np.random.seed(42)
# One score per fold, using 6 folds over the full encoded training data.
cross_val_score(estimator=clf, X=X, y=y, cv=6)

In [None]:
np.random.seed(42)
# Single hold-out score next to the mean of a 6-fold cross-validation.
clf_single_score = clf.score(X_test, y_test)
fold_scores = cross_val_score(clf, X, y, cv=6)
clf_crossval_score = fold_scores.mean()
pd.DataFrame({
    "Classification Single Score": [clf_single_score],
    "Cross Validation Score": [clf_crossval_score],
})

In [None]:
from sklearn.metrics import roc_curve

# Probability estimates for the held-out split; column 1 is taken as the
# score of the positive class.
y_prob = clf.predict_proba(X_test)
y_positive = y_prob[:, 1]

# False positive rate, true positive rate and the thresholds they occur at.
roc_points = roc_curve(y_test, y_positive)
fpr, tpr, thresholds = roc_points
fpr

In [None]:
import matplotlib.pyplot as plt
def plot_roc(fpr,tpr):
    """Draw a ROC curve from the given rates next to a chance-level diagonal.

    `fpr` and `tpr` are handed to matplotlib unchanged as the x and y values.
    The figure is shown immediately; nothing is returned.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    # Reference line from (0, 0) to (1, 1).
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label="Guessing")
    axes.set_xlabel("False Positive Rate")
    axes.set_ylabel("True Positive Rate")
    axes.set_title("Receiver Operating Characteristics Curve (ROC)")
    axes.legend()
    plt.show()

plot_roc(fpr,tpr)

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the positive-class scores.
roc_auc_score(y_true=y_test, y_score=y_positive)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Recompute the hold-out predictions and tabulate them against the truth.
y_pred = clf.predict(X=X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Same counts as the confusion matrix, shown as a labelled table.
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=["Actual Label"],
    colnames=["Predicted Label"],
)

In [None]:
import seaborn as sns

# Enlarge seaborn's fonts for the plots drawn from here on.
sns.set(font_scale=1.5)
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(data=conf_mat);

In [None]:

def plot_conf_mat(conf_mat):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    conf_mat : array-like
        Two-dimensional matrix of values to draw, one annotated cell per entry.

    Returns
    -------
    None
        The heatmap is drawn on a new 3x3 inch figure.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    # seaborn annotates with '.2g' by default, which turns large counts into
    # scientific notation (e.g. 1.2e+04). Use plain integers when the matrix
    # holds integer counts and keep the default format otherwise.
    is_integer_matrix = np.issubdtype(np.asarray(conf_mat).dtype, np.integer)
    cell_fmt = "d" if is_integer_matrix else ".2g"
    sns.heatmap(conf_mat,
                annot=True, # Annotate the boxes
                fmt=cell_fmt,
                cbar=False,
                ax=ax) # Draw on the axes created above, not the implicit current one
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

plot_conf_mat(conf_mat)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay is the supported replacement.
# The matrix is computed on the held-out split only: the full X/y contains the
# rows the forest was fitted on, which would make the plot look better than
# the model's out-of-sample behaviour.
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, clf.predict(X_test)),
    display_labels=clf.classes_,
).plot();

# Classification Report

In [None]:
from sklearn.metrics import classification_report

# Text report for the hold-out predictions; print() keeps its line breaks
# readable in the cell output.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

In [None]:
np.random.seed(42)
# No `scoring` argument: the estimator's own default scorer is used.
cv_acc = cross_val_score(estimator=clf, X=X, y=y, cv=5)
# Average over the 5 folds, expressed as a percentage.
mean_accuracy_pct = cv_acc.mean() * 100
print(f"The Cross Validated Accuracy : {mean_accuracy_pct:.2f}%")

In [None]:
np.random.seed(42)
# Accuracy requested explicitly through the `scoring` argument.
cv_acc = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring="accuracy")
# Average over the 5 folds, expressed as a percentage.
mean_accuracy_pct = cv_acc.mean() * 100
print(f"The Cross Validated Accuracy : {mean_accuracy_pct:.2f}%")

In [None]:
np.random.seed(42)
# 5-fold cross-validated precision (the array keeps the `cv_acc` name used by
# the neighbouring cells even though it holds precision values here).
cv_acc = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring="precision")
mean_precision_pct = cv_acc.mean() * 100
print(f"The Cross Validated Precision : {mean_precision_pct:.2f}%")

In [None]:
np.random.seed(42)
# 5-fold cross-validated recall (stored under the shared `cv_acc` name).
cv_acc = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring="recall")
mean_recall_pct = cv_acc.mean() * 100
print(f"The Cross Validated Recall : {mean_recall_pct:.2f}%")

In [None]:
np.random.seed(42)
# 5-fold cross-validated F1 (stored under the shared `cv_acc` name).
cv_acc = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring="f1")
mean_f1_pct = cv_acc.mean() * 100
print(f"The Cross Validated F1 score : {mean_f1_pct:.2f}%")

# Classification Metrics

In [None]:
def classification_metrics(y_test,y_pred):
    """Print and return accuracy, precision, recall and F1 for a set of predictions.

    Each metric is computed exactly once; the printed percentages and the
    returned values come from the same numbers.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as `y_test`.

    Returns
    -------
    dict
        Keys "accuracy", "precision", "recall" and "f1", each rounded to two
        decimal places.
    """
    from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
    scores = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
    }
    print("Classification Metrics: ")
    print(f"Accuracy: {scores['accuracy']*100 :.2f}%")
    print(f"Precision: {scores['precision']*100 :.2f}%")
    print(f"Recall: {scores['recall']*100 :.2f}%")
    print(f"F1: {scores['f1']*100 :.2f}%")
    metric_dict = {name: round(value, 2) for name, value in scores.items()}
    return metric_dict
# Metrics of the baseline forest on the held-out split (y_test / y_pred).
base_metrics = classification_metrics(y_test,y_pred)

In [None]:
# List the hyperparameters currently set on the baseline classifier.
clf.get_params()

# RandomizedSearchCV for Finding the Best Parameters

In [None]:
# Search space sampled by the randomized hyperparameter search.
# "auto" is not listed under max_features: for classifiers it was only an
# alias of "sqrt" and newer scikit-learn releases reject it, so "log2" is
# offered as the second, genuinely different option.
grid = {"n_estimators":[10,100,200,500,1000,1200],
       "max_depth":[None,5,10,20,30],
       "max_features":["sqrt","log2"],
       "min_samples_split":[2,4,6],
       "min_samples_leaf":[1,2,4]}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

np.random.seed(42)
# Use a separate name for the estimator being tuned so the already fitted
# baseline `clf` is not replaced by an unfitted model.
rs_base_clf = RandomForestClassifier(n_jobs=1)
# 20 sampled parameter combinations, each scored with 5-fold cross-validation.
# random_state pins which combinations are drawn from `grid`.
rs_clf = RandomizedSearchCV(estimator=rs_base_clf,
                   param_distributions=grid,
                   n_iter=20,
                   cv=5,
                   verbose=2,
                   random_state=42)
rs_clf.fit(X_train,y_train);

In [None]:
# Parameter combination that scored best during the randomized search.
rs_clf.best_params_

In [None]:
# The train and test frames were one-hot encoded independently, so their
# columns are not guaranteed to match. Align the test predictors with the
# training predictors: same columns, same order, 0 for any column the test
# file did not produce.
X_testdata = (
    airline_test
    .drop("satisfaction", axis=1)
    .reindex(columns=X.columns, fill_value=0)
)
y_testdata = airline_test["satisfaction"]

# Prediction on given test data

In [None]:
# Predict the separate test file with the refit best estimator and report
# the resulting metrics.
rs_y_preds = rs_clf.predict(X=X_testdata)
rs_metrics = classification_metrics(y_test=y_testdata, y_pred=rs_y_preds)

In [None]:
# One column per model, one row per metric.
# NOTE(review): `base_metrics` was computed on the held-out split of train.csv
# while `rs_metrics` was computed on test.csv, so the bars are not measured on
# the same rows — confirm this comparison is intended.
metric_sets = {"baseline": base_metrics, "random search": rs_metrics}
compare_metrics = pd.DataFrame(metric_sets)
compare_metrics.plot(kind="bar", figsize=(10, 8));