## Importing Libraries

In [None]:
import numpy as np
import pandas as pd

## Importing dataset

In [None]:
# Read the modelling table from disk (path is relative to the notebook).
dataset = pd.read_csv('../../data/data-6-model.csv')

# Features: every column except the last three.
# Target: the third-from-last column. The final two columns are not used
# by the cells shown here.
X, y = dataset.iloc[:, :-3].values, dataset.iloc[:, -3].values

# Quick sanity check: first feature row, then its label.
print(X[0], y[0], sep='\n')

# Rich preview of the raw frame (this is the cell's displayed output).
dataset.head()

## Encode independent variables

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the columns at positions 1, 2, 3 and 5 (dense output) and
# pass every other column through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse_output=False), [1, 2, 3, 5]),
    ],
    remainder='passthrough',
)

# NOTE: X is overwritten with its encoded form, so this cell is meant to be
# run once per kernel session, after the dataset-loading cell.
X = ct.fit_transform(X)
print(X[0])

## Encode the dependent variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of target labels, then replace each label by its integer code.
# The fitted encoder stays available as `le`.
le = LabelEncoder().fit(y)
y = le.transform(y)

# Show the encoded label of the first row.
print(y[0])

## Splitting dataset into Test set and Train set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise only the passed-through (non one-hot) columns and leave the
# dummy columns untouched. The column range is read from the fitted
# ColumnTransformer `ct` rather than a hard-coded offset (previously 20), so
# it stays correct if the categorical columns gain or lose categories.
passthrough_cols = ct.output_indices_['remainder']

sc = StandardScaler()
# Fit the scaler on the training split only, then reuse the same statistics
# for the test split so no test information leaks into the scaling.
X_train[:, passthrough_cols] = sc.fit_transform(X_train[:, passthrough_cols])
X_test[:, passthrough_cols] = sc.transform(X_test[:, passthrough_cols])

# Show the first training row after scaling.
print(X_train[0])

## Train XGBoost model on the Train set

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier constructed with no arguments, i.e. the library's default hyperparameters.
classifier = XGBClassifier()
# Fit on the training split; as the cell's last expression, the return value of fit() is what the cell displays.
classifier.fit(X_train, y_train)

## Predict the Test set results

In [None]:
# Predictions for the held-out test rows; the classifier was trained on the label-encoded y_train.
y_pred = classifier.predict(X_test)

## Create Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true vs. predicted test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Print the matrix, then the overall test-set accuracy on the next line.
print(cm, accuracy_score(y_true=y_test, y_pred=y_pred), sep='\n')

## Applying K-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training split; yields one
# score per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Report the mean and the spread of the fold scores as percentages.
print("Accuracy : {:.2f}".format(100 * accuracies.mean()))
print("Standard Deviation : {:.2f}".format(100 * accuracies.std()))

## Applying GridSearch to find the best hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 6 depths x 6 subsample ratios x 10 ensemble sizes x 6 learning
# rates = 2160 candidates. With 10 folds each that is 21,600 fits, so expect
# this cell to be slow.
parameters = [{
    'max_depth': list(range(1, 7)),
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'n_estimators': list(range(10, 101, 10)),
    'learning_rate': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}]

# Exhaustive search with 10-fold CV per candidate, using all available cores.
grid_search = GridSearchCV(classifier, parameters, cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best mean cross-validated score and the parameter combination that produced it.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy : {:.2f}".format(best_accuracy*100))
print("Best parameters : ", best_parameters)