In [38]:
import pandas as pd
import numpy as np

In [39]:
df = pd.read_csv('data/AirPass.csv')
df = df.drop(df.columns[[0]], axis=1)
df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [40]:
df.isnull().sum().sum()

310

In [41]:
df['Arrival Delay in Minutes'] = df['Arrival Delay in Minutes'].fillna(df['Arrival Delay in Minutes'].median())
df['Arrival Delay in Minutes'].mean()

15.133392362180475

In [42]:
df.groupby('Gender')['satisfaction'].value_counts()*100/df.shape[0]

Gender  satisfaction           
Female  neutral or dissatisfied    29.058554
        satisfied                  21.687327
Male    neutral or dissatisfied    27.608177
        satisfied                  21.645942
Name: satisfaction, dtype: float64

In [43]:
df.groupby('Type of Travel')['satisfaction'].value_counts()*100/df.shape[0]

Type of Travel   satisfaction           
Business travel  satisfied                  40.177472
                 neutral or dissatisfied    28.785225
Personal Travel  neutral or dissatisfied    27.881506
                 satisfied                   3.155798
Name: satisfaction, dtype: float64

In [44]:
df.groupby('Class')['satisfaction'].value_counts()*100/df.shape[0]

Class     satisfaction           
Business  satisfied                  33.184478
          neutral or dissatisfied    14.614452
Eco       neutral or dissatisfied    36.614567
          satisfied                   8.374076
Eco Plus  neutral or dissatisfied     5.437712
          satisfied                   1.774715
Name: satisfaction, dtype: float64

In [45]:
df['satisfaction'] = df['satisfaction'].map({'neutral or dissatisfied':0 , 'satisfied':1})
df['Customer Type'] = df['Customer Type'].map({'Loyal Customer':1, 'disloyal Customer':0})
df['Type of Travel'] = df['Type of Travel'].map({'Personal Travel':0, 'Business travel':1})

In [46]:
df=pd.get_dummies(df)
df.shape

(103904, 27)

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
X = df.drop('satisfaction', axis=1)
y = df['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=26)
y_test.shape

(20781,)

In [49]:
from sklearn.preprocessing import StandardScaler

In [50]:
scaler = StandardScaler()
scaler.fit(X_train) 
X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test) 
X_test[0][0]

0.9408251379303

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [55]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
preds_test = model_lr.predict(X_test)
f1_score(preds_test, y_test)

0.8546883773161146

In [57]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [58]:
model_ada = AdaBoostClassifier(DecisionTreeClassifier(random_state=26),random_state=26,learning_rate=0.01)

model_ada.fit(X_train, y_train)
preds_test = model_ada.predict(X_test)
f1_score(preds_test, y_test)

0.9398901098901099

In [61]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [62]:
model_for_gs = GradientBoostingClassifier()
params = {"n_estimators":2**np.arange(8), "learning_rate":0.1**np.arange(3)}
gs = GridSearchCV(model_for_gs, 
                  params, 
                  cv=3, 
                  scoring=make_scorer(f1_score),
                  verbose=5)
 
gs.fit(X_train, y_train)
 
print("Лучшие гиперпараметры:", gs.best_params_)
print("Лучшее значение метрики:", gs.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END .learning_rate=1.0, n_estimators=1;, score=0.873 total time=   0.1s
[CV 2/3] END .learning_rate=1.0, n_estimators=1;, score=0.870 total time=   0.1s
[CV 3/3] END .learning_rate=1.0, n_estimators=1;, score=0.871 total time=   0.1s
[CV 1/3] END .learning_rate=1.0, n_estimators=2;, score=0.880 total time=   0.2s
[CV 2/3] END .learning_rate=1.0, n_estimators=2;, score=0.878 total time=   0.2s
[CV 3/3] END .learning_rate=1.0, n_estimators=2;, score=0.875 total time=   0.2s
[CV 1/3] END .learning_rate=1.0, n_estimators=4;, score=0.901 total time=   0.6s
[CV 2/3] END .learning_rate=1.0, n_estimators=4;, score=0.896 total time=   0.5s
[CV 3/3] END .learning_rate=1.0, n_estimators=4;, score=0.897 total time=   0.5s
[CV 1/3] END .learning_rate=1.0, n_estimators=8;, score=0.920 total time=   1.1s
[CV 2/3] END .learning_rate=1.0, n_estimators=8;, score=0.920 total time=   1.1s
[CV 3/3] END .learning_rate=1.0, n_estimators=8;

In [63]:
import xgboost as xgb

In [64]:
model_xgb = xgb.XGBClassifier(random_state =26)
model_xgb.fit(X_train,y_train)
preds_test = model_xgb.predict(X_test)
f1_score(preds_test, y_test)

0.9579785161685312

In [65]:
import catboost as cb

In [66]:
model = cb.CatBoostClassifier()
model.fit(X_train, y_train)
preds_class = model.predict(X_test)
f1_score(preds_class, y_test)

Learning rate set to 0.068023
0:	learn: 0.6018089	total: 278ms	remaining: 4m 37s
1:	learn: 0.5020769	total: 317ms	remaining: 2m 38s
2:	learn: 0.4472534	total: 358ms	remaining: 1m 58s
3:	learn: 0.4096076	total: 395ms	remaining: 1m 38s
4:	learn: 0.3625895	total: 431ms	remaining: 1m 25s
5:	learn: 0.3353514	total: 472ms	remaining: 1m 18s
6:	learn: 0.3077176	total: 511ms	remaining: 1m 12s
7:	learn: 0.2921075	total: 550ms	remaining: 1m 8s
8:	learn: 0.2790148	total: 592ms	remaining: 1m 5s
9:	learn: 0.2644624	total: 633ms	remaining: 1m 2s
10:	learn: 0.2486163	total: 679ms	remaining: 1m 1s
11:	learn: 0.2333942	total: 721ms	remaining: 59.4s
12:	learn: 0.2255868	total: 759ms	remaining: 57.6s
13:	learn: 0.2155886	total: 795ms	remaining: 56s
14:	learn: 0.2061542	total: 833ms	remaining: 54.7s
15:	learn: 0.2008751	total: 862ms	remaining: 53s
16:	learn: 0.1953162	total: 900ms	remaining: 52.1s
17:	learn: 0.1889280	total: 930ms	remaining: 50.7s
18:	learn: 0.1829713	total: 954ms	remaining: 49.2s
19:	lear

0.9609162172680269

In [73]:
from catboost import Pool
from catboost.utils import get_confusion_matrix

In [76]:
get_confusion_matrix(model, Pool(X_train, y_train))

array([[46668.,   538.],
       [ 1257., 34660.]])

In [79]:
model.get_feature_importance()

array([ 2.02572682,  6.89533637,  3.7457528 , 18.99124878,  1.79135545,
       25.40607543,  1.24239086,  1.71104091,  3.45329441,  0.35406226,
        7.04387768,  2.9637775 ,  2.65578593,  1.63338212,  1.10325229,
        3.47425545,  3.8556973 ,  2.80296381,  1.70388855,  0.49689498,
        0.87505592,  0.07032819,  0.        ,  5.30915408,  0.2191181 ,
        0.17628403])

In [81]:
df.columns

Index(['id', 'Customer Type', 'Age', 'Type of Travel', 'Flight Distance',
       'Inflight wifi service', 'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location', 'Food and drink',
       'Online boarding', 'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Inflight service', 'Cleanliness',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction', 'Gender_Female', 'Gender_Male', 'Class_Business',
       'Class_Eco', 'Class_Eco Plus'],
      dtype='object')

In [80]:
pd.DataFrame({'feature_importance': model.get_feature_importance(), 
              'feature_names':df.columns}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

ValueError: All arrays must be of the same length