In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import auc

In [None]:
# Load the labelled training set and the unlabelled test set.
# The first CSV column is used as the row index in both frames.
TRAIN_CSV = "/content/train_dataset.csv"
TEST_CSV = "/content/test_dataset.csv"

train_data = pd.read_csv(TRAIN_CSV, index_col=0)
test_data = pd.read_csv(TEST_CSV, index_col=0)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train_data.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test_data.info()

In [None]:
# Percentage of training rows that contain at least one missing value.
# Dividing the total number of missing *cells* by the number of *rows* (the
# previous formula) overstates the figure, and can exceed 100, as soon as
# more than one column has gaps; counting affected rows avoids that.
train_data.isnull().any(axis=1).mean() * 100

In [None]:
# Percentage of test rows that contain at least one missing value.
# Dividing the total number of missing *cells* by the number of *rows* (the
# previous formula) overstates the figure, and can exceed 100, as soon as
# more than one column has gaps; counting affected rows avoids that.
test_data.isnull().any(axis=1).mean() * 100

In [None]:
# Fill gaps in the arrival-delay column with the next valid value below them.
# Series.bfill() replaces fillna(method="bfill"), whose `method` argument is
# deprecated in recent pandas releases; the result is the same.
# NOTE(review): a gap in the very last row has nothing below it and stays NaN.
train_data['Arrival Delay in Minutes'] = train_data['Arrival Delay in Minutes'].bfill()
test_data['Arrival Delay in Minutes'] = test_data['Arrival Delay in Minutes'].bfill()

In [None]:
# Summary statistics for the training frame.
train_data.describe()

In [None]:
# Absolute correlation of every numeric column with the target, strongest first.
# Only numeric columns are kept: the frame also holds string-valued categorical
# columns (they are one-hot encoded later), and corrwith raises on those in
# pandas >= 2.0 instead of silently skipping them.
numeric_features = train_data.select_dtypes(include="number")
(
    numeric_features
    .corrwith(train_data['satisfaction'])
    .abs()
    .sort_values(ascending=False)
)

In [None]:
# Class balance of the target as a pie chart.
# NOTE(review): value_counts() orders slices by frequency, so the hard-coded
# labels are only correct while the first label names the most frequent class.
satisfaction_counts = train_data.satisfaction.value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
ax.pie(satisfaction_counts, labels=["Norozi", "Mamnun"], autopct='%1.1f%%')
plt.show()

In [None]:
# How often each "Online boarding" value occurs in the training data.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="Online boarding", data=train_data, ax=ax)
plt.show()

In [None]:
# "Online boarding" counts split by the target class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    x='Online boarding',
    hue='satisfaction',
    data=train_data,
    palette='Set2',
    ax=ax,
)
plt.show()

In [None]:
# Kernel density estimate of the "Online boarding" values.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot;
# the filled-area rendering is the same.
plt.figure(figsize=(6, 4))
sns.kdeplot(data=train_data, x='Online boarding', fill=True)
plt.show()

In [None]:
# Hold out 20% of the labelled rows for evaluation; the seed keeps the split stable.
# (train_test_split is already imported in the notebook's import cell.)
train_set, test_set = train_test_split(train_data, test_size=0.2, random_state=42)

# Separate the feature columns from the target for each split.
X_train, y_train = train_set.drop(columns="satisfaction"), train_set["satisfaction"].copy()
X_test, y_test = test_set.drop(columns="satisfaction"), test_set["satisfaction"].copy()

In [None]:
# Columns routed to the categorical branch of the preprocessing pipeline.
category_cols = [
    'Gender',
    'Customer Type',
    'Type of Travel',
    'Class',
]

# Columns routed to the numeric branch of the preprocessing pipeline.
number_cols = [
    'Age',
    'Flight Distance',
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]

In [None]:
# Numeric branch: replace missing values with the column mean, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encode.
# handle_unknown='ignore' encodes a category that was not present at fit time
# as an all-zero row instead of raising during transform(), so new data with
# an unseen level does not crash the pipeline.
cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Apply the numeric and categorical branches to their respective column lists
# and concatenate the results.
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, number_cols),
        ('cat', cat_pipeline, category_cols),
    ]
)

In [None]:
# Fit the preprocessing on the training split only, then apply the *same*
# fitted transformation to the held-out split. Calling fit_transform on the
# held-out data would re-learn the imputer/scaler/encoder statistics from it
# (leakage) and leave full_pipeline fitted on the wrong data for later use.
X_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

# Name kept for the evaluation cells below; it holds the prepared held-out
# features (X), not labels.
Y_test_prepared = X_test_prepared

In [None]:
# Show the transformed training features.
print(X_prepared)

In [None]:
# Train a logistic regression classifier on the prepared training features
LR_model = LogisticRegression().fit(X_prepared, y_train)
LR_model

In [None]:
# Evaluate the model on the held-out split
y_pred = LR_model.predict(Y_test_prepared)
# Positive-class probabilities: the ROC curve needs a continuous score;
# hard 0/1 predictions collapse it to a single operating point.
y_score = LR_model.predict_proba(Y_test_prepared)[:, 1]

print(classification_report(y_test, y_pred))
print("Ko'rsatkichi:", accuracy_score(y_test,y_pred))

## ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Named roc_display so IPython's built-in display() is not shadowed.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='ROC curve')
roc_display.plot()
plt.show()

# Confusion matrix
plt.figure(figsize=(7,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="g")
plt.show()

In [None]:
# Train a depth-limited decision tree.
# random_state is fixed (same seed as the data split) so that tie-breaking
# between equally good splits, and therefore the scores, are reproducible.
Tree_model = DecisionTreeClassifier(max_depth=10, random_state=42)
Tree_model.fit(X_prepared, y_train)

In [None]:
# Evaluate the model on the held-out split
y_pred = Tree_model.predict(Y_test_prepared)
# Positive-class probabilities: the ROC curve needs a continuous score;
# hard 0/1 predictions collapse it to a single operating point.
y_score = Tree_model.predict_proba(Y_test_prepared)[:, 1]

print(classification_report(y_test, y_pred))
print("Ko'rsatkichi:", accuracy_score(y_test,y_pred))

## ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Named roc_display so IPython's built-in display() is not shadowed.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='ROC curve')
roc_display.plot()
plt.show()

# Confusion matrix
plt.figure(figsize=(7,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="g")
plt.show()

In [None]:
# Train a support vector classifier with default settings
SVC_model = SVC().fit(X_prepared, y_train)
SVC_model

In [None]:
# Evaluate the model on the held-out split
y_pred = SVC_model.predict(Y_test_prepared)
# Signed distance to the decision boundary: SVC() is built without
# probability=True, so decision_function supplies the continuous score the
# ROC curve needs (hard 0/1 predictions collapse it to a single point).
y_score = SVC_model.decision_function(Y_test_prepared)

print(classification_report(y_test, y_pred))
print("Ko'rsatkichi:", accuracy_score(y_test,y_pred))

## ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Named roc_display so IPython's built-in display() is not shadowed.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='ROC curve')
roc_display.plot()
plt.show()

# Confusion matrix
plt.figure(figsize=(7,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="g")
plt.show()

In [None]:
# Train a small random forest (9 trees).
# random_state is fixed (same seed as the data split) so that bootstrap
# sampling and feature selection, and therefore the scores, are reproducible.
RF_model = RandomForestClassifier(n_estimators=9, random_state=42)
RF_model.fit(X_prepared, y_train)


In [None]:
# Evaluate the random forest on the held-out split.
# The cell previously called `final_model`, a name that is never defined in
# this notebook (NameError on a fresh kernel); the model trained in the
# preceding cell is RF_model.
y_pred = RF_model.predict(Y_test_prepared)
# Positive-class probabilities: the ROC curve needs a continuous score;
# hard 0/1 predictions collapse it to a single operating point.
y_score = RF_model.predict_proba(Y_test_prepared)[:, 1]

print(classification_report(y_test, y_pred))
print("Ko'rsatkichi:", accuracy_score(y_test,y_pred))

## ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Named roc_display so IPython's built-in display() is not shadowed.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='ROC curve')
roc_display.plot()
plt.show()

# Confusion matrix
plt.figure(figsize=(7,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="g")
plt.show()

In [None]:
# Train an XGBoost classifier with a fixed seed
XGB_model = XGBClassifier(random_state=42).fit(X_prepared, y_train)
XGB_model

In [None]:
# Evaluate the model on the held-out split
y_pred = XGB_model.predict(Y_test_prepared)
# Positive-class probabilities: the ROC curve needs a continuous score;
# hard 0/1 predictions collapse it to a single operating point.
y_score = XGB_model.predict_proba(Y_test_prepared)[:, 1]

print(classification_report(y_test, y_pred))
print("Ko'rsatkichi:", accuracy_score(y_test,y_pred))

## ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Named roc_display so IPython's built-in display() is not shadowed.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='ROC curve')
roc_display.plot()
plt.show()

# Confusion matrix
plt.figure(figsize=(7,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="g")
plt.show()

Eng yaxshi natijaga **XGBClassifier** modelida erishdik: aniqlik 95 foizni tashkil qildi.

In [None]:
# Train a k-nearest-neighbours classifier that votes over 100 neighbours
knn_model = KNeighborsClassifier(n_neighbors=100).fit(X_prepared, y_train)
knn_model

In [None]:
# Evaluate the model on the held-out split
y_pred = knn_model.predict(Y_test_prepared)
# Positive-class probabilities: the ROC curve needs a continuous score;
# hard 0/1 predictions collapse it to a single operating point.
y_score = knn_model.predict_proba(Y_test_prepared)[:, 1]

print(classification_report(y_test, y_pred))
print("Ko'rsatkichi:", accuracy_score(y_test,y_pred))

## ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Named roc_display so IPython's built-in display() is not shadowed.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='ROC curve')
roc_display.plot()
plt.show()

# Confusion matrix
plt.figure(figsize=(7,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="g")
plt.show()

In [None]:
# Train an AdaBoost classifier.
# random_state is fixed (same seed as the data split) so repeated runs of the
# notebook give the same model and scores.
abc_model = AdaBoostClassifier(random_state=42)
abc_model.fit(X_prepared, y_train)

In [None]:
# Evaluate the model on the held-out split
y_pred = abc_model.predict(Y_test_prepared)
# Positive-class probabilities: the ROC curve needs a continuous score;
# hard 0/1 predictions collapse it to a single operating point.
y_score = abc_model.predict_proba(Y_test_prepared)[:, 1]

print(classification_report(y_test, y_pred))
print("Ko'rsatkichi:", accuracy_score(y_test,y_pred))

## ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Named roc_display so IPython's built-in display() is not shadowed.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='ROC curve')
roc_display.plot()
plt.show()

# Confusion matrix
plt.figure(figsize=(7,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="g")
plt.show()

In [None]:
# Train a gradient boosting classifier.
# random_state is fixed (same seed as the data split) so repeated runs of the
# notebook give the same model and scores.
gbc_model = GradientBoostingClassifier(random_state=42)
gbc_model.fit(X_prepared, y_train)

In [None]:
# Evaluate the model on the held-out split
y_pred = gbc_model.predict(Y_test_prepared)
# Positive-class probabilities: the ROC curve needs a continuous score;
# hard 0/1 predictions collapse it to a single operating point.
y_score = gbc_model.predict_proba(Y_test_prepared)[:, 1]

print(classification_report(y_test, y_pred))
print("Ko'rsatkichi:", accuracy_score(y_test,y_pred))

## ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Named roc_display so IPython's built-in display() is not shadowed.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='ROC curve')
roc_display.plot()
plt.show()

# Confusion matrix
plt.figure(figsize=(7,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="g")
plt.show()

In [None]:
# Preview the test frame again before generating the final predictions.
test_data.head()

In [None]:
# Apply the already-fitted preprocessing to the test frame (no refitting here).
# NOTE(review): this uses whatever state the most recent fit/fit_transform call
# left in full_pipeline - confirm that call was made on the training split.
test_data_prepared = full_pipeline.transform(test_data)

In [None]:
# Show the transformed test features.
print(test_data_prepared)

In [None]:
# Predict the target for the prepared test features with the XGBoost model.
finish_predicted_data = XGB_model.predict(test_data_prepared)

In [None]:
# Display the predictions as a single-column frame.
pd.DataFrame({'satisfaction': finish_predicted_data})

In [None]:
# Load the sample submission file; its 'id' column is reused for the output below.
df_s = pd.read_csv("/content/sample_submission.csv")
df_s.head()

In [None]:
# Pair each id from the sample submission with the corresponding prediction.
# NOTE(review): pairing is positional - this assumes df_s rows are in the same
# order as the rows of test_data; verify before submitting.
df = pd.DataFrame({'id': df_s['id'], 'satisfaction': finish_predicted_data})

In [None]:
# Write the submission frame to CSV without the DataFrame index.
# NOTE(review): the path is relative to the current working directory, while the
# surrounding cells read "/content/sample_submission.csv"; the two only refer
# to the same file when the working directory is /content.
df.to_csv('sample_submission.csv', index=False)

In [None]:
# Read the submission CSV back and preview it as a sanity check.
# NOTE(review): this reads the absolute /content path, whereas the preceding
# cell writes to a relative path - confirm both point at the same file.
dft = pd.read_csv("/content/sample_submission.csv")
dft.head()