In [178]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn import metrics

In [179]:
# Explore the data: load the training set and preview its first five rows.
df = pd.read_csv('../input/aviakompania-dataset/train_dataset.csv')
df.head()


In [None]:
# Overview of the columns (prints the DataFrame's info summary).
df.info()

In [None]:
# Descriptive statistics for the column values.
df.describe()

In [180]:
# Drop the 'id' column as well.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='id', errors='ignore')

In [181]:
# Correlation heatmap of the numeric columns.
# The text columns (Gender, Customer Type, Type of Travel, Class) are only
# mapped to numbers in a later cell, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so restrict the frame to numeric dtypes
# explicitly (this matches what older pandas versions did silently).
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(16,10))
sns.heatmap(numeric_corr, annot=True)
plt.show()

In [None]:
# Number of rows per value of the 'satisfaction' column.
df['satisfaction'].value_counts()

In [None]:
# Absolute correlation of every numeric column with 'satisfaction', strongest first.
# Non-numeric columns are excluded explicitly because corrwith() no longer
# drops them silently in pandas >= 2.0 and would raise instead.
# NOTE(review): assumes 'satisfaction' itself is numeric -- confirm against the data.
df.select_dtypes(include='number').corrwith(df['satisfaction']).abs().sort_values(ascending=False)

In [None]:
# Data visualisation: four columns plotted on a 2x2 grid, coloured by 'Class'.
fig, axes = plt.subplots(2,2, figsize=(15,10))

# One entry per panel: (seaborn plot function, column on the x axis, target axis).
panels = [
    (sns.countplot, "Online boarding", axes[0,0]),
    (sns.histplot, "Inflight entertainment", axes[0,1]),
    (sns.histplot, "Seat comfort", axes[1,0]),
    (sns.histplot, "On-board service", axes[1,1]),
]
for draw, x_column, panel_ax in panels:
    draw(x=x_column, data=df, ax=panel_ax, hue=df['Class'])

plt.show()

In [None]:
# Count plots of three categorical columns, split by 'satisfaction'.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Each panel's title is taken from the plotted column, so title and data cannot
# drift apart (the 'Type of Travel' panel used to be titled 'Gender').
for ax, column in zip(axes, ['Class', 'Type of Travel', 'Customer Type']):
    sns.countplot(x=column, hue='satisfaction', palette='viridis', data=df, ax=ax)
    ax.set_title(column)

plt.show()

In [182]:
# Satisfaction with respect to age: one KDE curve of 'Age' per satisfaction value.

facet = sns.FacetGrid(df, hue='satisfaction', aspect=5)
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
facet.map(sns.kdeplot,'Age',fill=True)
# Limit the x axis to the range from 0 to the largest age in the data.
facet.set(xlim=(0,df['Age'].max()))
facet.add_legend()

plt.show()

In [183]:
# Replace the text columns with numbers.
# The mapping dictionaries stay module-level names because the test-set
# encoding cell reuses them.
male_convertion = {'Male':1, 'Female':0}
customer_conv = {'Loyal Customer':1, 'disloyal Customer':0}
travel_conv = {'Business travel':1, 'Personal Travel':0}
class_conv = {'Business':2, 'Eco Plus':1, 'Eco':0}

# NOTE(review): Series.map yields NaN for values missing from the mapping, so
# running this cell a second time turns the already-encoded columns into NaN.
for column, codes in (
    ('Gender', male_convertion),
    ('Customer Type', customer_conv),
    ('Type of Travel', travel_conv),
    ('Class', class_conv),
):
    df[column] = df[column].map(codes)

In [184]:
# Machine Learning

# Stratified 80/20 train/test split on the 'satisfaction' column.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
tr_idx, te_idx = next(strat_split.split(df, df['satisfaction']))
strat_train_set, strat_test_set = df.iloc[tr_idx], df.iloc[te_idx]

# Separate the features from the target for both subsets.
X_train = strat_train_set.drop(columns='satisfaction')
y_train = strat_train_set['satisfaction'].copy()

X_test = strat_test_set.drop(columns='satisfaction')
y_test = strat_test_set['satisfaction'].copy()


In [185]:
# Preprocessing pipeline: fill missing values with the column median, then
# standardise every column.

full_pipeline = Pipeline([
            ('median_imputer', SimpleImputer(strategy='median')),
            ('std_scaler', StandardScaler())
])

# Fit the pipeline on the training set only, then apply the SAME fitted
# medians / means / variances to the test set. Calling fit_transform on the
# test set would re-estimate those statistics from test data, which leaks test
# information and scales the two sets inconsistently.
X_train_prep = full_pipeline.fit_transform(X_train)
X_test_prep = full_pipeline.transform(X_test)

In [186]:
# Logistic regression

## Build the model
# NOTE(review): the variable is called RF_model although it holds a
# LogisticRegression; the name is kept in case other cells refer to it.
RF_model = LogisticRegression()
RF_model.fit(X_train_prep, y_train)

## Evaluate the model
y_pred = RF_model.predict(X_test_prep)
print(metrics.classification_report(y_test, y_pred))
print(f"Modelni aniqligi: {metrics.accuracy_score(y_test,y_pred):0.3f}")

## Confusion matrix
conf_mat = metrics.confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat,annot=True, fmt='g')
plt.show()

## ROC curve
# roc_curve expects continuous scores; feeding it hard class predictions
# collapses the curve to a single operating point and distorts the AUC.
# Column 1 of predict_proba is the probability of classes_[1]
# (assumes that is the positive class, as with 0/1 labels -- confirm).
y_score = RF_model.predict_proba(X_test_prep)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr = tpr, roc_auc=roc_auc, estimator_name='ROC curve')
display.plot()
plt.show()

In [187]:
# Support Vector Machine

## Build the model
svm_model = SVC()
svm_model.fit(X_train_prep, y_train)

## Evaluate the model
y_pred = svm_model.predict(X_test_prep)
print(metrics.classification_report(y_test, y_pred))
print(f"Modelni aniqligi: {metrics.accuracy_score(y_test,y_pred):0.3f}")

## Confusion matrix
conf_mat = metrics.confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat,annot=True, fmt='g')
plt.show()

## ROC curve
# roc_curve expects continuous scores; feeding it hard class predictions
# collapses the curve to a single operating point and distorts the AUC.
# SVC() is created without probability=True, so use the signed distance to the
# decision boundary as the score (larger values favour classes_[1]).
y_score = svm_model.decision_function(X_test_prep)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr = tpr, roc_auc=roc_auc, estimator_name='ROC curve')
display.plot()
plt.show()

In [188]:
# Decision Tree

## Build the model
# A fixed random_state (same seed as the train/test split) makes the fitted
# tree, and therefore the reported metrics, reproducible across runs.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_prep, y_train)

## Evaluate the model
y_pred = tree_model.predict(X_test_prep)
print(metrics.classification_report(y_test, y_pred))
print(f"Modelni aniqligi: {metrics.accuracy_score(y_test,y_pred):0.3f}")

## Confusion matrix
conf_mat = metrics.confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat,annot=True, fmt='g')
plt.show()

## ROC curve
# roc_curve expects continuous scores; feeding it hard class predictions
# collapses the curve to a single operating point and distorts the AUC.
# Column 1 of predict_proba is the probability of classes_[1]
# (assumes that is the positive class, as with 0/1 labels -- confirm).
y_score = tree_model.predict_proba(X_test_prep)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr = tpr, roc_auc=roc_auc, estimator_name='ROC curve')
display.plot()
plt.show()

In [192]:
# Find k: F1 score of a KNN classifier for every k from 1 to 29.
# NOTE(review): the scores are computed on the test split, so a k chosen from
# this plot is tuned on the same data later used for the final evaluation.
k_values = range(1,30)

f1 = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)  # candidate value of k
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_predict = knn.fit(X_train_prep, y_train).predict(X_test_prep)
    f1.append(f1_score(y_test, y_predict))

# F1 score as a function of k, with one tick per candidate.
plt.figure(figsize=(10,6))
plt.plot(k_values, f1)
plt.xticks(k_values)
plt.grid()
plt.show()

In [194]:
# KNN model

## Build the model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_prep, y_train)

## Evaluate the model
y_pred = knn_model.predict(X_test_prep)
print(metrics.classification_report(y_test, y_pred))
print(f"Modelni aniqligi: {metrics.accuracy_score(y_test,y_pred):0.3f}")

## Confusion matrix
conf_mat = metrics.confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat,annot=True, fmt='g')
plt.show()

## ROC curve
# roc_curve expects continuous scores; feeding it hard class predictions
# collapses the curve to a single operating point and distorts the AUC.
# Column 1 of predict_proba is the probability of classes_[1]
# (assumes that is the positive class, as with 0/1 labels -- confirm).
y_score = knn_model.predict_proba(X_test_prep)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr = tpr, roc_auc=roc_auc, estimator_name='ROC curve')
display.plot()
plt.show()



In [None]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {'n_neighbors': np.arange(1, 25)}
# knn_gscv = GridSearchCV(knn, param_grid, cv=5)
# knn_gscv.fit(X_train_prep, y_train)
#knn_gscv.best_params_

In [201]:
# Random Forest model

## Build the model
# A fixed random_state (same seed as the train/test split) makes the fitted
# forest, the reported metrics and the submission reproducible across runs.
RFF_model = RandomForestClassifier(random_state=42)
RFF_model.fit(X_train_prep, y_train)

## Evaluate the model
y_pred = RFF_model.predict(X_test_prep)
print(metrics.classification_report(y_test, y_pred))
# Same three-decimal format as the other model cells.
print(f"Modelni aniqligi: {metrics.accuracy_score(y_test,y_pred):0.3f}")

## Confusion matrix
conf_mat = metrics.confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat,annot=True, fmt='g')
plt.show()

## ROC curve
# roc_curve expects continuous scores; feeding it hard class predictions
# collapses the curve to a single operating point and distorts the AUC.
# Column 1 of predict_proba is the probability of classes_[1]
# (assumes that is the positive class, as with 0/1 labels -- confirm).
y_score = RFF_model.predict_proba(X_test_prep)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr = tpr, roc_auc=roc_auc, estimator_name='ROC curve')
display.plot()
plt.show()


In [None]:
# Eng yaxshi natija Random Forest modeli orqali olindi.

In [202]:
# Submission: load the test set and preview its first five rows.
test_df = pd.read_csv('../input/aviakompania-dataset/test_dataset.csv')
test_df.head()

In [203]:
# Drop the 'id' column from the test_df dataframe.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
test_df = test_df.drop(columns=['id'], errors='ignore')

In [204]:
# Check for NaN values: number of missing entries per column.
test_df.isna().sum()

In [206]:
# Fill the NaN values with the column median.
# Series.median() skips NaN, whereas np.median() over a column that contains
# NaN returns NaN -- which made the previous fillna() call a no-op.
delay_col = 'Arrival Delay in Minutes'
test_df[delay_col] = test_df[delay_col].fillna(test_df[delay_col].median())

In [207]:
# Replace the text columns with numbers, reusing the mappings applied to the
# training data.
for column, codes in (
    ('Gender', male_convertion),
    ('Customer Type', customer_conv),
    ('Type of Travel', travel_conv),
    ('Class', class_conv),
):
    test_df[column] = test_df[column].map(codes)

In [208]:
# Run the test data through the preprocessing pipeline.
# `num_pipeline` is not defined anywhere in the visible notebook (NameError on
# a fresh kernel); the pipeline built above is `full_pipeline`. Use transform()
# rather than fit_transform() so the features are imputed and scaled with the
# statistics the pipeline has already learned, instead of re-estimating them
# from the submission data.
test_df_prapared = full_pipeline.transform(test_df)

In [209]:
# Predict labels for the prepared test features with the random forest model.
test_predicted = RFF_model.predict(test_df_prapared)

In [210]:
# Load the sample submission file and preview its first five rows.
sample_submission = pd.read_csv('../input/aviakompania-dataset/sample_submission.csv')
sample_submission.head()

In [211]:
# Write the predictions into the 'satisfaction' column of the submission frame.
sample_submission['satisfaction'] = test_predicted

In [212]:
# Preview the first rows of the submission frame.
sample_submission.head()

In [214]:
# Save the final table as submission.csv (index=False: the row index is not written).
sample_submission.to_csv('submission.csv',index=False)