In [2]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder

In [3]:
# Read the cleaned airline-review dataset from the working directory,
# then preview its first five rows.
reviews_path = 'airline_reviews_cleaned.csv'
df = pd.read_csv(reviews_path)
df.head(n=5)

Unnamed: 0,airline,traveller_type,cabin,type_of_flight,frequency,seat_comfort,cabin_service,food_bev,entertainment,ground_service,value_for_money,overall,recommended
0,Turkish Airlines,Business,Economy Class,Indirect,Rarely,4.0,5.0,4.0,4.0,2.0,4.0,7.0,yes
1,Turkish Airlines,Family Leisure,Economy Class,Direct,Rarely,4.0,1.0,1.0,1.0,1.0,1.0,2.0,no
2,Turkish Airlines,Business,Economy Class,Indirect,Rarely,1.0,4.0,1.0,3.0,1.0,2.0,3.0,no
3,Turkish Airlines,Solo Leisure,Economy Class,Direct,Rarely,4.0,5.0,5.0,5.0,5.0,5.0,10.0,yes
4,Turkish Airlines,Solo Leisure,Economy Class,Indirect,Rarely,1.0,1.0,1.0,1.0,1.0,1.0,1.0,no


In [4]:
# Separate the label column from the predictors; `df` itself is left untouched.
target_column = "recommended"
y = df[target_column]
X = df.drop(columns=[target_column])

In [5]:
# List the feature columns that remain once the "recommended" target is dropped.
# Their positions matter: the column transformer below selects columns by index.
X.columns

Index(['airline', 'traveller_type', 'cabin', 'type_of_flight', 'frequency',
       'seat_comfort', 'cabin_service', 'food_bev', 'entertainment',
       'ground_service', 'value_for_money', 'overall'],
      dtype='object')

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# train_test_split is already imported in the imports cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# Sanity check: every row ends up in exactly one of the two splits.
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)

# Printing shapes of train and test data
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}\n')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_test: {y_test.shape}')

Shape of X_train: (18244, 12)
Shape of y_train: (18244,)

Shape of X_test: (4562, 12)
Shape of y_test: (4562,)


In [7]:
# Preview the training labels (pandas truncates the display of the full Series).
y_train

521      yes
4579     yes
14092    yes
4018      no
11874    yes
        ... 
21633    yes
19857     no
14528     no
899      yes
12706     no
Name: recommended, Length: 18244, dtype: object

# Column Transformer

### Handling Categorical data

In [8]:
# Ordered levels for the ordinal features. The position of a level in its list
# is the integer it is encoded as.
cabin_levels = ["Economy Class", "Premium Economy", "Business Class", "First Class"]
flight_type_levels = ["Direct", "Indirect"]
frequency_levels = ["Rarely", "Occasionally", "Often"]

# Nominal features are one-hot encoded; categories unseen during fit are
# ignored at transform time instead of raising.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# NOTE(review): encoded_missing_value=0 is the same integer that the first level
# of each list receives -- confirm that this is the intended handling of missing values.
ordinal = OrdinalEncoder(
    categories=[cabin_levels, flight_type_levels, frequency_levels],
    encoded_missing_value=0,
)

# Columns are selected by position in X:
#   0, 1    -> airline, traveller_type           (one-hot)
#   2, 3, 4 -> cabin, type_of_flight, frequency  (ordinal)
# The remaining (rating) columns are passed through unchanged.
ct_encoding = ColumnTransformer(
    transformers=[
        ('ohe_enc', one_hot, [0, 1]),
        ("ord_enc", ordinal, [2, 3, 4]),
    ],
    remainder='passthrough',
)

In [9]:
# Fit the column transformer on the training features and display the encoded
# array as a quick sanity check. The result is not stored.
ct_encoding.fit_transform(X_train)

array([[ 0.,  0.,  0., ...,  5.,  5., 10.],
       [ 0.,  0.,  0., ...,  5.,  5.,  9.],
       [ 0.,  0.,  0., ...,  3.,  5.,  9.],
       ...,
       [ 0.,  0.,  0., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  5.,  4.,  8.],
       [ 0.,  0.,  0., ...,  1.,  1.,  2.]])

In [10]:
# Check the dimensions of the encoded training matrix.
# Note: this call fits the transformer again; only the shape of the result is shown.
ct_encoding.fit_transform(X_train).shape

(18244, 80)

### Defining Model

In [11]:
# Random-forest classifier configuration. random_state is fixed so that
# repeated fits on the same data build the same forest.
forest_params = dict(
    criterion="gini",
    max_depth=14,
    max_features='log2',
    random_state=12,
)
model = RandomForestClassifier(**forest_params)

# Creating Pipeline

In [12]:
# Chain preprocessing and the classifier into a single estimator, so that raw
# feature rows can be passed straight to fit/predict.
pipeline_steps = [
    ('encoding', ct_encoding),
    ('model_deploy', model),
]
pipe = Pipeline(steps=pipeline_steps)

In [13]:
# Fit the whole pipeline (encoding step followed by the random forest) on the training split.
pipe.fit(X_train, y_train)

# Exporting pipeline model as joblib file

In [14]:
# Persist the fitted pipeline (encoder + classifier) to a joblib file so it can
# be reloaded later without retraining.
# joblib is imported in the imports cell at the top of the notebook.
joblib.dump(pipe, 'ml_model.joblib')

['ml_model.joblib']

In [15]:
# Reload the pipeline from the joblib file written above to verify the round trip.
# Caution: joblib files are pickle-based, so only load files from a trusted source.
new_model = joblib.load('ml_model.joblib')

In [16]:
# Predict on the held-out test features with the reloaded pipeline.
y_pred = new_model.predict(X_test)

In [17]:
# Per-class precision, recall and F1 for the reloaded pipeline on the test split.
# classification_report is imported in the imports cell at the top of the notebook.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          no       0.95      0.96      0.96      2264
         yes       0.96      0.95      0.96      2298

    accuracy                           0.96      4562
   macro avg       0.96      0.96      0.96      4562
weighted avg       0.96      0.96      0.96      4562

