In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, KBinsDiscretizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from hyperopt import fmin, tpe, hp, Trials


In [2]:
class CustomOrdinalEncoder:
    def __init__(self, categories):
        self.categories = categories
        self.cat_to_int = {}
        self.int_to_cat = {}
        for i, cat in enumerate(self.categories):
            self.cat_to_int[cat] = i
            self.int_to_cat[i] = cat

    def transform(self, data):
        return np.array([self.cat_to_int[cat] if cat in self.cat_to_int else np.nan for cat in data])

    def inverse_transform(self, data):
        return np.array([self.int_to_cat[int(cat)] for cat in data])

def encode_ordinal_columns(df, ordinal_columns, n_classes):
    encoders = {}
    encoded_df = df.copy()
    for col in ordinal_columns:
        unique_values = sorted(df[col].dropna().unique())
        categories = unique_values + [f"extra_class_{i}" for i in range(n_classes - len(unique_values))]
        encoder = CustomOrdinalEncoder(categories)
        encoded_df[col] = encoder.transform(df[col])
        encoders[col] = encoder
    return encoded_df, encoders

def impute_missing_ordinal_records(df, ordinal_columns, n_classes=5, max_iter=10, random_state=42):
    encoded_df, encoders = encode_ordinal_columns(df, ordinal_columns, n_classes)
    
    imputer = IterativeImputer(max_iter=max_iter, estimator=RandomForestRegressor(random_state=random_state), random_state=random_state)
    imputed_array = imputer.fit_transform(encoded_df)

    imputed_df = pd.DataFrame(imputed_array, columns=df.columns)
    imputed_df[ordinal_columns] = np.round(imputed_df[ordinal_columns])

    for col in ordinal_columns:
        imputed_df[col] = encoders[col].inverse_transform(imputed_df[col])

    return imputed_df

def encode_non_ordinal_columns(df, non_ordinal_columns):
    encoded_df = pd.get_dummies(df, columns=non_ordinal_columns, drop_first=True)
    return encoded_df

def impute_missing_non_ordinal_records(df, max_iter=10, random_state=42):
    imputer = IterativeImputer(max_iter=max_iter, estimator=RandomForestRegressor(random_state=random_state), random_state=random_state)
    imputed_array = imputer.fit_transform(df)

    imputed_df = pd.DataFrame(imputed_array, columns=df.columns)
    return imputed_df

def impute_most_common(df):
    for column in df.columns:
        most_common_value = df[column].mode()[0]
        df[column].fillna(most_common_value, inplace=True)
    return df

# Importing

In [3]:
survey_df = pd.read_csv('Surveydata_train.csv', )
survey_df_test = pd.read_csv('Surveydata_test.csv')

travel_df = pd.read_csv('Traveldata_train.csv')
travel_df_test = pd.read_csv('Traveldata_test.csv')

# Preprocessing

In [4]:
merged_df = pd.merge(survey_df, travel_df, on= 'ID')
merged_df_test = pd.merge(survey_df_test, travel_df_test, on= 'ID')

In [5]:
transformed_df = (
    merged_df
    # 'Seat_comfort', 'Arrival_time_convenient', 'Catering', 'Onboardwifi_service', 'Onboard_entertainment', 'Online_support',
    # 'Onlinebooking_Ease', 'Onboard_service', 'Leg_room', 'Checkin_service', 'Cleanliness', 'Online_boarding'
    .replace(['Excellent', 'Good', 'Acceptable', 'Needs Improvement', 'Poor', 'Extremely Poor'], [5, 4, 3, 2, 1, 0])
    # Platform_location
    .replace(['Very Convenient', 'Convenient', 'Manageable', 'Needs Improvement', 'Inconvenient', 'Very Inconvenient'], [5, 4, 3, 2, 1, 0])
    # Seat_Class
    .replace(['Ordinary', 'Green Car'], [0, 1])
    # Gender
    .replace(['Male', 'Female'], [0, 1])
    # CustomerType
    .replace(['Disloyal Customer', 'Loyal Customer'], [0, 1])
    # TypeTravel
    .replace(['Personal Travel', 'Business Travel'], [0, 1])
    # Travel_Class
    .replace(['Eco', 'Business'], [0, 1])
)

In [6]:
transformed_test_df = (
    merged_df_test
    .replace(['Excellent', 'Good', 'Acceptable', 'Needs Improvement', 'Poor', 'Extremely Poor'], [5, 4, 3, 2, 1, 0])
    .replace(['Very Convenient', 'Convenient', 'Manageable', 'Needs Improvement', 'Inconvenient', 'Very Inconvenient'], [5, 4, 3, 2, 1, 0])
    .replace(['Ordinary', 'Green Car'], [0, 1])
    .replace(['Male', 'Female'], [0, 1])
    .replace(['Disloyal Customer', 'Loyal Customer'], [0, 1])
    .replace(['Personal Travel', 'Business Travel'], [0, 1])
    .replace(['Eco', 'Business'], [0, 1])
)

In [7]:
transformed_df['Age'] = pd.cut(transformed_df['Age'], 5, labels = ['25', '35', '45', '60', '80'])
transformed_test_df['Age'] = pd.cut(transformed_test_df['Age'], 5, labels = ['25', '35', '45', '60', '80'])

### Imputing data

In [8]:
ordinal_columns = [
    'Seat_Comfort', 'Onboard_Wifi_Service', 
    'Onboard_Entertainment', 'Online_Support', 'Ease_of_Online_Booking', 'Onboard_Service', 
    'Legroom', 'Baggage_Handling', 'CheckIn_Service', 'Cleanliness', 'Online_Boarding'
  ]

In [9]:
categorical_columns = [
    'Customer_Type', 'Travel_Class'
  ]

##### Train data imputation

In [10]:
ordinal_imputed = pd.read_csv('Ordinal_Imputed.csv')

In [11]:
# encoded_non_ordinal_df = encode_non_ordinal_columns(transformed_df[categorical_columns], categorical_columns)
# ordinal_imputed['ID'] = transformed_df['ID']
# encoded_non_ordinal_df['ID'] = transformed_df['ID']

# encoded_df = pd.merge(encoded_non_ordinal_df, ordinal_imputed, on= 'ID')

categorical_imputed = pd.read_csv('Categorical_Imputed.csv')

In [12]:
final_df = encode_non_ordinal_columns(categorical_imputed.copy().drop(['ID'], axis=1), ordinal_columns)

In [13]:
display(final_df.info())
display(len(final_df.columns))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 56 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Customer_Type_1.0           94379 non-null  float64
 1   Travel_Class_1              94379 non-null  float64
 2   Seat_Comfort_1.0            94379 non-null  uint8  
 3   Seat_Comfort_2.0            94379 non-null  uint8  
 4   Seat_Comfort_3.0            94379 non-null  uint8  
 5   Seat_Comfort_4.0            94379 non-null  uint8  
 6   Seat_Comfort_5.0            94379 non-null  uint8  
 7   Onboard_Wifi_Service_1.0    94379 non-null  uint8  
 8   Onboard_Wifi_Service_2.0    94379 non-null  uint8  
 9   Onboard_Wifi_Service_3.0    94379 non-null  uint8  
 10  Onboard_Wifi_Service_4.0    94379 non-null  uint8  
 11  Onboard_Wifi_Service_5.0    94379 non-null  uint8  
 12  Onboard_Entertainment_1.0   94379 non-null  uint8  
 13  Onboard_Entertainment_2.0   943

None

56

##### Test data imputation

In [14]:
ordinal_test_imputed = pd.read_csv('Ordinal_Test_Imputed.csv')

In [15]:
# encoded_test_non_ordinal_df = encode_non_ordinal_columns(transformed_test_df[categorical_columns], categorical_columns)
# ordinal_test_imputed['ID'] = transformed_test_df['ID']
# encoded_test_non_ordinal_df['ID'] = transformed_test_df['ID']

# encoded_test_df = pd.merge(encoded_test_non_ordinal_df, ordinal_test_imputed, on= 'ID')

categorical_test_imputed = pd.read_csv('Categorical_Test_Imputed.csv')

In [16]:
final_test_df = (
    encode_non_ordinal_columns(categorical_test_imputed.copy().drop(['ID'], axis=1), ordinal_columns)
)

final_test_df['CheckIn_Service_1.0'] = 0
final_test_df['Cleanliness_1.0'] = 0
final_test_df['Onboard_Service_1.0'] = 0
final_test_df['Online_Support_1.0'] = 0
final_test_df['Platform_Location_1.0'] = 0

final_test_df = final_test_df[final_df.columns]

In [17]:
display(final_test_df.info())
display(len(final_test_df.columns))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35602 entries, 0 to 35601
Data columns (total 56 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Customer_Type_1.0           35602 non-null  float64
 1   Travel_Class_1              35602 non-null  float64
 2   Seat_Comfort_1.0            35602 non-null  uint8  
 3   Seat_Comfort_2.0            35602 non-null  uint8  
 4   Seat_Comfort_3.0            35602 non-null  uint8  
 5   Seat_Comfort_4.0            35602 non-null  uint8  
 6   Seat_Comfort_5.0            35602 non-null  uint8  
 7   Onboard_Wifi_Service_1.0    35602 non-null  uint8  
 8   Onboard_Wifi_Service_2.0    35602 non-null  uint8  
 9   Onboard_Wifi_Service_3.0    35602 non-null  uint8  
 10  Onboard_Wifi_Service_4.0    35602 non-null  uint8  
 11  Onboard_Wifi_Service_5.0    35602 non-null  uint8  
 12  Onboard_Entertainment_1.0   35602 non-null  uint8  
 13  Onboard_Entertainment_2.0   356

None

56

# Modelling

##### Quick Classifier Selection

In [None]:
X = final_df.copy()
y = transformed_df['Overall_Experience'].copy()

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifiers = [
    ('Logistic Regression', LogisticRegression(solver='liblinear')),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier(n_estimators=100)),
    ('Support Vector Machine', SVC(kernel='linear', C=1)),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Bernoulli Naive Bayes', BernoulliNB()),
    ('MLP Classifier', MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)),
    ('Stochastic Gradient Descent', SGDClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(eval_metric='mlogloss', random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42)),
    ('CatBoost', CatBoostClassifier(verbose=0, random_state=42)),
    ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Ridge', RidgeClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

# Iterate through the classifiers, fit, and print accuracy
for name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

**Conclusion**: Selecting best performing algoritms: CatBoost, XGBoost, MLP Classifier and Random Forest for hyperparameter tuning.

##### Hyperparameter Tuning for best performing classifiers with GridSearchCV

In [None]:
# Random Forest
rf_params = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, scoring='accuracy', cv=5, n_jobs=-1)
rf_grid.fit(X_train, y_train)
print(f"Random Forest best parameters: {rf_grid.best_params_}")

In [None]:
# MLP Classifier
mlp_params = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['lbfgs', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [200, 300, 400]
}

mlp_grid = GridSearchCV(MLPClassifier(random_state=42), mlp_params, scoring='accuracy', cv=5, n_jobs=-1)
mlp_grid.fit(X_train, y_train)
print(f"MLP Classifier best parameters: {mlp_grid.best_params_}")

In [None]:
# XGBoost
xgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

xgb_grid = GridSearchCV(XGBClassifier(eval_metric='mlogloss', random_state=42), xgb_params, scoring='accuracy', cv=5, n_jobs=-1)
xgb_grid.fit(X_train, y_train)
print(f"XGBoost best parameters: {xgb_grid.best_params_}")

In [None]:
# CatBoost
cat_params = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [3, 6, 10],
    'l2_leaf_reg': [1, 3, 5]
}

cat_grid = GridSearchCV(CatBoostClassifier(verbose=0, random_state=42), cat_params, scoring='accuracy', cv=5, n_jobs=-1)
cat_grid.fit(X_train, y_train)
print(f"CatBoost best parameters: {cat_grid.best_params_}")

In [None]:
# Evaluate best models on the test set
best_rf = rf_grid.best_estimator_
best_mlp = mlp_grid.best_estimator_
best_xgb = xgb_grid.best_estimator_
best_cat = cat_grid.best_estimator_

for name, clf in [('Random Forest', best_rf), ('MLP Classifier', best_mlp), ('XGBoost', best_xgb), ('CatBoost', best_cat)]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

##### Hyperparameter Tuning for best performing classifiers with Hyperopt (Google Colab)

In [18]:
# Define the objective function for optimization
def objective(args, classifier_name):
    if classifier_name == 'RandomForest':
        n_estimators, max_depth, max_features, min_samples_split, min_samples_leaf = args
        clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )
    elif classifier_name == 'MLPClassifier':
        hidden_layer_sizes, alpha, activation, solver, learning_rate = args
        clf = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            alpha=alpha,
            activation=activation,
            solver=solver,
            learning_rate=learning_rate,
            random_state=42
        )
    elif classifier_name == 'XGBoost':
        n_estimators, learning_rate, max_depth, gamma, subsample, colsample_bytree = args
        clf = XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            gamma=gamma,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            random_state=42
        )    
    elif classifier_name == 'CatBoost':
        iterations, learning_rate, depth, l2_leaf_reg = args
        clf = CatBoostClassifier(
            iterations=iterations,
            learning_rate=learning_rate,
            depth=depth,
            l2_leaf_reg=l2_leaf_reg,
            random_state=42,
            verbose=0
        )

    score = -np.mean(cross_val_score(clf, X_train, y_train, cv=5, n_jobs=-1))
    return score

# Define extensive hyperparameter search spaces for each classifier
space_rf = [
    hp.choice('n_estimators', range(10, 201, 10)),
    hp.choice('max_depth', list(range(1, 33)) + [None]),
    hp.choice('max_features', ['auto', 'sqrt', 'log2', None] + list(np.arange(0.1, 1.1, 0.1))),
    hp.choice('min_samples_split', range(2, 21)),
    hp.choice('min_samples_leaf', range(1, 21))
]

space_mlp = [
    hp.choice('hidden_layer_sizes', [(i,) for i in range(10, 101, 10)] + [(i, i) for i in range(10, 101, 10)]),
    hp.loguniform('alpha', -5, -1),
    hp.choice('activation', ['identity', 'logistic', 'tanh', 'relu']),
    hp.choice('solver', ['lbfgs', 'sgd', 'adam']),
    hp.choice('learning_rate', ['constant', 'invscaling', 'adaptive'])
]

space_xgb = [
    hp.choice('n_estimators', range(10, 201, 10)),
    hp.loguniform('learning_rate', -5, 0),
    hp.choice('max_depth', list(range(1, 33))),
    hp.loguniform('gamma', -5, 0),
    hp.uniform('subsample', 0.1, 1),
    hp.uniform('colsample_bytree', 0.1, 1)
]

space_cat = [
    hp.choice('iterations', range(10, 201, 10)),
    hp.loguniform('learning_rate', -5, 0),
    hp.choice('depth', list(range(1, 17))),
    hp.loguniform('l2_leaf_reg', 0, 5)
]

# Optimize hyperparameters for each classifier
for classifier_name, space in [('RandomForest', space_rf), ('MLPClassifier', space_mlp), ('XGBoost', space_xgb), ('CatBoost', space_cat)]:
    trials = Trials()
    best = fmin(lambda args: objective(args, classifier_name), space, algo=tpe.suggest, max_evals=200, trials=trials)
    print(f"{classifier_name} best parameters: {best}")

  0%|          | 0/200 [00:00<?, ?trial/s, best loss=?]

job exception: name 'X_train' is not defined



  0%|          | 0/200 [00:00<?, ?trial/s, best loss=?]


NameError: name 'X_train' is not defined

In [23]:
# Found out in Google Colab through Hyperopt
best_rf = {
    'max_depth': 27, 
    'max_features': 0.30000000000000004, 
    'min_samples_leaf': 1, 
    'min_samples_split': 13, 
    'n_estimators': 90
    }

best_mlp = {
    'activation': 'relu',
    'alpha': 0.059008349443448974,
    'hidden_layer_sizes': (30, 30),
    'learning_rate': 'invscaling',
    'solver': 'adam'
}

best_xgb = {
    'colsample_bytree': 0.9334254355105551,
    'gamma': 0.008801002728149786,
    'learning_rate': 0.1257056414560802,
    'max_depth': 9,
    'n_estimators': 170,
    'subsample': 0.9801605605745425
}

best_cat = {
    'l2_leaf_reg': 3.9355845098832787,
    'learning_rate': 0.12846062021329857,
    'depth': 9,
    'iterations': 17
}

In [24]:
def build_classifier(classifier_name, best_params):
    if classifier_name == 'RandomForest':
        clf = RandomForestClassifier(
            n_estimators=best_params['n_estimators'],
            max_depth=best_params['max_depth'] if best_params['max_depth'] != 32 else None,
            max_features=best_params['max_features'],
            min_samples_split=best_params['min_samples_split'],
            min_samples_leaf=best_params['min_samples_leaf'],
            random_state=42
        )
    elif classifier_name == 'MLPClassifier':
        clf = MLPClassifier(
            hidden_layer_sizes=best_params['hidden_layer_sizes'],
            alpha=best_params['alpha'],
            activation=best_params['activation'],
            solver=best_params['solver'],
            learning_rate=best_params['learning_rate'],
            random_state=42
        )
    elif classifier_name == 'XGBoost':
        clf = XGBClassifier(
            n_estimators=best_params['n_estimators'],
            learning_rate=best_params['learning_rate'],
            max_depth=best_params['max_depth'],
            gamma=best_params['gamma'],
            subsample=best_params['subsample'],
            colsample_bytree=best_params['colsample_bytree'],
            random_state=42
        )
    elif classifier_name == 'CatBoost':
        clf = CatBoostClassifier(
            iterations=best_params['iterations'],
            learning_rate=best_params['learning_rate'],
            depth=best_params['depth'],
            l2_leaf_reg=best_params['l2_leaf_reg'],
            random_state=42,
            verbose=0
        )

    return clf

In [25]:
X_test = final_test_df.copy()

X_train = final_df.copy()
y_train = transformed_df['Overall_Experience'].copy()

In [26]:
# Store the best parameters for each classifier in a dictionary
best_params_dict = {
    'RandomForest': best_rf,
    'MLPClassifier': best_mlp,
    'XGBoost': best_xgb,
    'CatBoost': best_cat
}

# Store results
result = {}

# Evaluate test set accuracy with the best parameters for each classifier
for classifier_name in best_params_dict:
    # Build the classifier with the best parameters
    clf = build_classifier(classifier_name, best_params_dict[classifier_name])

    # Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = clf.predict(X_test)

    result[classifier_name] = y_pred
    

In [None]:
result_final = pd.DataFrame(data={'ID': range(99900001,99935603), 'Overall_Experience': result['CatBoost']}).set_index('ID').sort_index(ascending=True)
result_final.to_csv('Submission.csv')