## Hotel Cancellation Prediction

## Import packages

In [None]:
#Import python libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import svm #Import svm model
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import confusion_matrix #Calculate confusion matrix
from xgboost import XGBClassifier

#Visualization packages
import matplotlib.pyplot as plt 
import seaborn as sns 

#Suppress warnings
import warnings
warnings.filterwarnings('ignore')

## Functions

**Function to Train and Test Machine Learning Model**

In [None]:
#Function to Train and Test Machine Learning Model
def train_test_ml_model(X_train, y_train, X_test, Model, estimator=None, y_true=None):
    """Fit a classifier, predict on the test features and report its accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to ``fit``.
    X_test
        Features passed to ``predict``.
    Model : str
        Display name used in the confusion-matrix title and the printed summary.
    estimator : optional
        Object exposing ``fit`` and ``predict``. When omitted, the notebook-level
        variable ``model`` is used, so existing four-argument calls keep working.
    y_true : optional
        True labels for ``X_test``. When omitted, the notebook-level variable
        ``y_test`` is used.

    Returns
    -------
    None
        The confusion matrix is drawn with ``cm_plot`` and the accuracy
        (percentage, rounded to one decimal) is printed.

    Raises
    ------
    NameError
        If a fallback global (``model`` / ``y_test``) is needed but not defined.
    """
    # The function used to depend silently on two globals. They are now explicit,
    # optional inputs; the global lookup is kept only as a backward-compatible default.
    if estimator is None:
        estimator = model
    if y_true is None:
        y_true = y_test

    estimator.fit(X_train, y_train)      # Train the model
    y_pred = estimator.predict(X_test)   # Predict labels for the held-out features

    # Accuracy = correctly classified samples (diagonal of the matrix) / all samples
    cm = confusion_matrix(y_true, y_pred)
    accuracy = round(100 * np.trace(cm) / np.sum(cm), 1)

    # Plot/Display the results
    cm_plot(cm, Model)
    print('Accuracy of the Model', Model, str(accuracy) + '%')

**Function to plot Confusion Matrix**

In [None]:
#Function to plot Confusion Matrix
def cm_plot(cm,Model):
    plt.clf()
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
    classNames = ['Negative','Positive']
    plt.title('Comparison of Prediction Result for '+ Model)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classNames))
    plt.xticks(tick_marks, classNames, rotation=45)
    plt.yticks(tick_marks, classNames)
    s = [['TN','FP'], ['FN', 'TP']]
    for i in range(2):
        for j in range(2):
            plt.text(j,i, str(s[i][j])+" = "+str(cm[i][j]))
    plt.show()
    print(cm[1][0])

## Import data

In [None]:
# Read the CSV into a DataFrame; the path is relative to the notebook's working directory.
# NOTE(review): the notebook title says "Hotel Cancellation Prediction" but the file is
# named Scorpio_data.csv — confirm this is the intended dataset.
data = pd.read_csv("../input/scorpio-price-data/Scorpio_data.csv")
# Preview the first rows as the cell output.
data.head()

## Display basic statistics

**Display Information (info) about the 'data'**

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts) to check for missing values.
data.info()

<font color='blue'>**COMMENTS :** There is no missing data (all the columns have 19,905 rows)</font>

**Display basic statistics (description) about the 'data' - numerical**

In [None]:
# Summary statistics with pandas' default column selection.
data.describe()

**Display basic statistics (description) about the 'data' - categorical**

In [None]:
# Restrict to object-dtype columns, then describe them.
data.select_dtypes(include=['object']).describe(include='all')

<font color='blue'>**OBSERVATION :** We need to convert above categorical columns to numerical columns</font>

## Convert categorical columns to numerical columns

**Convert categorical columns of 'data' to numerical columns**

In [None]:
# Replace `data` with the output of pd.get_dummies (indicator columns for the categorical columns).
# The original frame is overwritten, so the un-encoded data is only recoverable by re-running the load cell.
data=pd.get_dummies(data)

In [None]:
# Preview the first rows after encoding.
data.head()

<font color='blue'>**COMMENTS :** It can be observed that additional columns are created when the categorical columns are converted to numerical columns</font>

In [None]:
# (rows, columns) of the encoded frame.
data.shape

## Remove 'User ID' column

In [None]:
# Discard the 'User ID' column and keep every other column in `data`.
data = data.drop(columns=['User ID'])

<font color='blue'>**COMMENTS :** It can be observed that original 29 columns have now become 1068 columns after converting from categorical to numerical columns</font>

## Create feature and target set

**Create the Feature set (X) from all columns except 'Purchased', and the Target set (y) from the 'Purchased' column**

In [None]:
# Target vector (output): the 'Purchased' column
y = data['Purchased']

# Feature matrix: every column except the target
X = data.drop(columns=['Purchased'])

In [None]:
# Preview the first rows of the feature matrix.
X.head()

## Scaling the data values to standardize the range of independent variables

**Normalize the feature set X**

In [None]:
#Feature scaling is a method used to standardize the range of independent variables or features of data.
#Since the range of values of raw data varies widely, in some machine learning algorithms, objective functions will not work properly without normalization. 
# NOTE(review): the scaler is fitted on the full feature set here, before the train/test
# split performed later in the notebook, so test rows contribute to the fitted statistics.
# Consider fitting on the training rows only — TODO confirm whether this is acceptable.
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X = scale.fit_transform(X)  # X is overwritten with the values returned by the scaler

In [None]:
# Show the scaled feature values produced by the previous cell.
print(X)

<font color='blue'>**COMMENTS :** It can be observed that range of values is normalized</font>

## Split the data into "train" and "test" set

**Split the Feature set (X) and Target set (y) into training set (X_train, y_train) and testing set (X_test,y_test)**

In [None]:
# Split dataset into training set and test set
# random_state pins the shuffle so the same rows land in train/test on every run;
# without it each Restart & Run All produced different accuracies for every model below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 70% training and 30% test
print("X_train size ==>", X_train.shape)
print("X_test size ==>", X_test.shape)

<font color='blue'>**COMMENTS :** Total data (19,905 rows) is divided into Training set(13,933 rows) and Testing set (5,972 rows)</font>

## <font color='blue'>Support Vector Machine (SVM) ML model</font>

In [None]:
from sklearn.svm import SVC,NuSVC  #Import packages related to Model (NuSVC is not used in this cell)
# Display name used in the plot title and the printed accuracy line
Model = "SVC"
model=SVC() #Create the Model with default constructor arguments; the helper below picks up this `model` variable

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>XGBoost Classifier ML model</font>

In [None]:
from xgboost import XGBClassifier  #Import packages related to Model (also imported in the first code cell)
# Display name used in the plot title and the printed accuracy line
Model = "XGBClassifier()"
model=XGBClassifier() #Create the Model with default constructor arguments; the helper below picks up this `model` variable

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>Gaussian Naive Bayes ML model</font>

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB  #Import packages related to Model (MultinomialNB is not used in this cell)
# Display name used in the plot title and the printed accuracy line
Model = "GaussianNB"
# Create the Model with default constructor arguments; the helper below picks up this `model` variable
model=GaussianNB()

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>SGD Classifier ML model</font>

In [None]:
from sklearn.linear_model import SGDClassifier #Import packages related to Model
# Display name used in the plot title and the printed accuracy line
Model = "SGDClassifier"
# Fixed seed: SGD shuffles the training data between epochs, so an unseeded
# estimator yields a different fit (and accuracy) on every run.
model=SGDClassifier(random_state=42)

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>Logistic Regression ML model</font>

In [None]:
from sklearn.linear_model import LogisticRegression #Import packages related to Model
# Display name used in the plot title and the printed accuracy line
Model = "LogisticRegression"
# Create the Model with default constructor arguments; the helper below picks up this `model` variable
model=LogisticRegression()

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>Decision Tree Classifier ML model</font>

In [None]:
from sklearn.tree import DecisionTreeClassifier #Import packages related to Model
# Display name used in the plot title and the printed accuracy line
Model = "DecisionTreeClassifier"
# Fixed seed: the tree builder permutes features at each split, so ties between
# equally good splits can resolve differently from run to run when unseeded.
model=DecisionTreeClassifier(random_state=42)

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>Extra Tree Classifier ML model</font>

In [None]:
from sklearn.tree import ExtraTreeClassifier #Import packages related to Model
# Display name used in the plot title and the printed accuracy line
Model = "ExtraTreeClassifier"
# Fixed seed: an extra-tree draws its candidate splits at random, so the fitted
# tree changes on every run unless random_state is set.
model=ExtraTreeClassifier(random_state=42)

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>Quadratic Discriminant Analysis ML model</font>

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis #Import packages related to Model
# Display name used in the plot title and the printed accuracy line
Model = "QuadraticDiscriminantAnalysis"
# Create the Model with default constructor arguments; the helper below picks up this `model` variable
model = QuadraticDiscriminantAnalysis()

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>Linear Discriminant Analysis ML model</font>

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis #Import packages related to Model
# Display name used in the plot title and the printed accuracy line
Model = "LinearDiscriminantAnalysis"
# Create the Model with default constructor arguments; the helper below picks up this `model` variable
model=LinearDiscriminantAnalysis()

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>Random Forest Classifier ML model</font>

In [None]:
from sklearn.ensemble import RandomForestClassifier #Import packages related to Model
# Display name used in the plot title and the printed accuracy line
Model = "RandomForestClassifier"
# Fixed seed: bootstrap sampling and per-split feature sub-sampling are random,
# so an unseeded forest gives a different accuracy on every run.
model=RandomForestClassifier(random_state=42)

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>Ada Boost Classifier ML model</font>

In [None]:
from sklearn.ensemble import AdaBoostClassifier #Import packages related to Model
# Display name used in the plot title and the printed accuracy line
Model = "AdaBoostClassifier"
# Fixed seed: random_state is forwarded to the base estimators at each boosting
# iteration, making repeated runs produce the same ensemble.
model=AdaBoostClassifier(random_state=42)

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>Gradient Boosting Classifier ML model</font>

In [None]:
from sklearn.ensemble import GradientBoostingClassifier #Import packages related to Model
# Display name used in the plot title and the printed accuracy line
Model = "GradientBoostingClassifier"
# Fixed seed: random_state seeds every tree built during boosting, making
# repeated runs produce the same ensemble.
model=GradientBoostingClassifier(random_state=42)

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)

## <font color='blue'>K-Neighbors Classifier ML model</font>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# NOTE(review): PCA is imported here but not used in this cell.
from sklearn.decomposition import PCA
# Display name used in the plot title and the printed accuracy line
Model = "KNeighborsClassifier"
# 7 is passed as the first positional argument (n_neighbors in scikit-learn's signature);
# the helper below picks up this `model` variable.
model=KNeighborsClassifier(7)

# Fit on the training set, predict on X_test, plot the confusion matrix and print the accuracy
train_test_ml_model(X_train,y_train,X_test,Model)