In [111]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, KBinsDiscretizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization, Dropout, Dense,  ReLU
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


In [2]:
class CustomOrdinalEncoder:
    def __init__(self, categories):
        self.categories = categories
        self.cat_to_int = {}
        self.int_to_cat = {}
        for i, cat in enumerate(self.categories):
            self.cat_to_int[cat] = i
            self.int_to_cat[i] = cat

    def transform(self, data):
        return np.array([self.cat_to_int[cat] if cat in self.cat_to_int else np.nan for cat in data])

    def inverse_transform(self, data):
        return np.array([self.int_to_cat[int(cat)] for cat in data])

def encode_ordinal_columns(df, ordinal_columns, n_classes):
    encoders = {}
    encoded_df = df.copy()
    for col in ordinal_columns:
        unique_values = sorted(df[col].dropna().unique())
        categories = unique_values + [f"extra_class_{i}" for i in range(n_classes - len(unique_values))]
        encoder = CustomOrdinalEncoder(categories)
        encoded_df[col] = encoder.transform(df[col])
        encoders[col] = encoder
    return encoded_df, encoders

def impute_missing_ordinal_records(df, ordinal_columns, n_classes=5, max_iter=10, random_state=42):
    encoded_df, encoders = encode_ordinal_columns(df, ordinal_columns, n_classes)
    
    imputer = IterativeImputer(max_iter=max_iter, estimator=RandomForestRegressor(random_state=random_state), random_state=random_state)
    imputed_array = imputer.fit_transform(encoded_df)

    imputed_df = pd.DataFrame(imputed_array, columns=df.columns)
    imputed_df[ordinal_columns] = np.round(imputed_df[ordinal_columns])

    for col in ordinal_columns:
        imputed_df[col] = encoders[col].inverse_transform(imputed_df[col])

    return imputed_df

def encode_non_ordinal_columns(df, non_ordinal_columns):
    encoded_df = pd.get_dummies(df, columns=non_ordinal_columns, drop_first=True)
    return encoded_df

def impute_missing_non_ordinal_records(df, max_iter=10, random_state=42):
    imputer = IterativeImputer(max_iter=max_iter, estimator=RandomForestRegressor(random_state=random_state), random_state=random_state)
    imputed_array = imputer.fit_transform(df)

    imputed_df = pd.DataFrame(imputed_array, columns=df.columns)
    return imputed_df

def impute_most_common(df):
    for column in df.columns:
        most_common_value = df[column].mode()[0]
        df[column].fillna(most_common_value, inplace=True)
    return df

# Importing

In [3]:
survey_df = pd.read_csv('Surveydata_train.csv', )
survey_df_test = pd.read_csv('Surveydata_test.csv')

travel_df = pd.read_csv('Traveldata_train.csv')
travel_df_test = pd.read_csv('Traveldata_test.csv')

# Preprocessing

In [4]:
merged_df = pd.merge(survey_df, travel_df, on= 'ID')
merged_df_test = pd.merge(survey_df_test, travel_df_test, on= 'ID')

In [5]:
merged_df['Type_Travel'].value_counts()

Business Travel    58617
Personal Travel    26536
Name: Type_Travel, dtype: int64

In [6]:
transformed_df = (
    merged_df
    # 'Seat_comfort', 'Arrival_time_convenient', 'Catering', 'Onboardwifi_service', 'Onboard_entertainment', 'Online_support',
    # 'Onlinebooking_Ease', 'Onboard_service', 'Leg_room', 'Checkin_service', 'Cleanliness', 'Online_boarding'
    .replace(['Excellent', 'Good', 'Acceptable', 'Needs Improvement', 'Poor', 'Extremely Poor'], [5, 4, 3, 2, 1, 0])
    # Platform_location
    .replace(['Very Convenient', 'Convenient', 'Manageable', 'Needs Improvement', 'Inconvenient', 'Very Inconvenient'], [5, 4, 3, 2, 1, 0])
    # Seat_Class
    .replace(['Ordinary', 'Green Car'], [0, 1])
    # Gender
    .replace(['Male', 'Female'], [0, 1])
    # CustomerType
    .replace(['Disloyal Customer', 'Loyal Customer'], [0, 1])
    # TypeTravel
    .replace(['Personal Travel', 'Business Travel'], [0, 1])
    # Travel_Class
    .replace(['Eco', 'Business'], [0, 1])
)

In [7]:
transformed_test_df = (
    merged_df_test
    .replace(['Excellent', 'Good', 'Acceptable', 'Needs Improvement', 'Poor', 'Extremely Poor'], [5, 4, 3, 2, 1, 0])
    .replace(['Very Convenient', 'Convenient', 'Manageable', 'Needs Improvement', 'Inconvenient', 'Very Inconvenient'], [5, 4, 3, 2, 1, 0])
    .replace(['Ordinary', 'Green Car'], [0, 1])
    .replace(['Male', 'Female'], [0, 1])
    .replace(['Disloyal Customer', 'Loyal Customer'], [0, 1])
    .replace(['Personal Travel', 'Business Travel'], [0, 1])
    .replace(['Eco', 'Business'], [0, 1])
)

In [8]:
transformed_df['Age'] = pd.cut(transformed_df['Age'], 5, labels = ['25', '35', '45', '60', '80'])
transformed_test_df['Age'] = pd.cut(transformed_test_df['Age'], 5, labels = ['25', '35', '45', '60', '80'])

### Imputing data

In [9]:
ordinal_columns = [
    'Seat_Comfort', 'Arrival_Time_Convenient', 'Catering', 'Platform_Location', 'Onboard_Wifi_Service', 
    'Onboard_Entertainment', 'Online_Support', 'Ease_of_Online_Booking', 'Onboard_Service', 
    'Legroom', 'Baggage_Handling', 'CheckIn_Service', 'Cleanliness', 'Online_Boarding'
    ]

In [10]:
categorical_columns = [
    'Seat_Class', 'Gender', 'Customer_Type', 'Type_Travel', 'Travel_Class', 'Age'
]

##### Train data imputation

In [11]:
ordinal_imputed = impute_missing_ordinal_records(transformed_df[ordinal_columns], ordinal_columns, n_classes=6)



In [12]:
encoded_non_ordinal_df = encode_non_ordinal_columns(transformed_df[categorical_columns], categorical_columns)
ordinal_imputed['ID'] = transformed_df['ID']
encoded_non_ordinal_df['ID'] = transformed_df['ID']

encoded_df = pd.merge(encoded_non_ordinal_df, ordinal_imputed, on= 'ID')

categorical_imputed = impute_missing_non_ordinal_records(encoded_df)

In [69]:
final_df = encode_non_ordinal_columns(categorical_imputed.copy().drop(['ID'], axis=1), ordinal_columns)

In [70]:
display(final_df.info())
display(len(final_df.columns))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 78 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Seat_Class_1                 94379 non-null  float64
 1   Gender_1.0                   94379 non-null  float64
 2   Customer_Type_1.0            94379 non-null  float64
 3   Type_Travel_1.0              94379 non-null  float64
 4   Travel_Class_1               94379 non-null  float64
 5   Age_35                       94379 non-null  float64
 6   Age_45                       94379 non-null  float64
 7   Age_60                       94379 non-null  float64
 8   Age_80                       94379 non-null  float64
 9   Seat_Comfort_1.0             94379 non-null  uint8  
 10  Seat_Comfort_2.0             94379 non-null  uint8  
 11  Seat_Comfort_3.0             94379 non-null  uint8  
 12  Seat_Comfort_4.0             94379 non-null  uint8  
 13  Seat_Comfort_5.0

None

78

##### Test data imputation

In [15]:
ordinal_test_imputed = impute_missing_ordinal_records(transformed_test_df[ordinal_columns], ordinal_columns, n_classes=6)



In [16]:
encoded_test_non_ordinal_df = encode_non_ordinal_columns(transformed_test_df[categorical_columns], categorical_columns)
ordinal_test_imputed['ID'] = transformed_test_df['ID']
encoded_test_non_ordinal_df['ID'] = transformed_test_df['ID']

encoded_test_df = pd.merge(encoded_test_non_ordinal_df, ordinal_test_imputed, on= 'ID')

categorical_test_imputed = impute_missing_non_ordinal_records(encoded_test_df)

In [73]:
final_test_df = (
    encode_non_ordinal_columns(categorical_test_imputed.copy().drop(['ID'], axis=1), ordinal_columns)
)

final_test_df['CheckIn_Service_1.0'] = 0
final_test_df['Cleanliness_1.0'] = 0
final_test_df['Onboard_Service_1.0'] = 0
final_test_df['Online_Support_1.0'] = 0
final_test_df['Platform_Location_1.0'] = 0

In [74]:
display(final_test_df.info())
display(len(final_test_df.columns))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35602 entries, 0 to 35601
Data columns (total 78 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Seat_Class_1                 35602 non-null  float64
 1   Gender_1.0                   35602 non-null  float64
 2   Customer_Type_1.0            35602 non-null  float64
 3   Type_Travel_1.0              35602 non-null  float64
 4   Travel_Class_1               35602 non-null  float64
 5   Age_35                       35602 non-null  float64
 6   Age_45                       35602 non-null  float64
 7   Age_60                       35602 non-null  float64
 8   Age_80                       35602 non-null  float64
 9   Seat_Comfort_1.0             35602 non-null  uint8  
 10  Seat_Comfort_2.0             35602 non-null  uint8  
 11  Seat_Comfort_3.0             35602 non-null  uint8  
 12  Seat_Comfort_4.0             35602 non-null  uint8  
 13  Seat_Comfort_5.0

None

78

### Normalizing

In [75]:
standard_scaler = StandardScaler()
standard_scaled_data = standard_scaler.fit_transform(final_df)

In [76]:
standard_scaler.fit(final_df)

X_train_standard_scaled = standard_scaler.transform(final_df)
X_test_standard_scaled = standard_scaler.transform(final_test_df)

Feature names must be in the same order as they were in fit.



# Modelling

In [118]:
def create_mlp_model(num_features, num_classes):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(num_features,)),
        Dropout(0.4),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])

    # Compile the model with Adam optimizer and a learning rate of 0.01
    model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [119]:
model = create_mlp_model(78, 2)

X = X_train_standard_scaled
y = transformed_df['Overall_Experience']

In [120]:
# Convert target labels to one-hot encoded categorical format
y_categorical = to_categorical(y, num_classes=2)

# Split the data into training and validation sets
split_index = int(94380 * 0.8)
X_train, X_val = X[:split_index], X[split_index:]
y_train, y_val = y_categorical[:split_index], y_categorical[split_index:]

# Train the model
epochs = 200
batch_size = 32

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, min_lr=1e-5)

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 17: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 37: ReduceLROnPlateau reducing learning rate to 9.999999310821295e-05.
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 51: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200


In [121]:
# Make predictions on the test data
predictions = model.predict(X_test_standard_scaled)

# The predictions variable contains probability distributions over the classes for each test sample
# You can convert these probabilities to class labels by choosing the class with the highest probability
predicted_labels = np.argmax(predictions, axis=1)



In [122]:
result = pd.DataFrame(data={'ID': range(99900001,99935603), 'Overall_Experience': predicted_labels}).set_index('ID').sort_index(ascending=True)
result

Unnamed: 0_level_0,Overall_Experience
ID,Unnamed: 1_level_1
99900001,0
99900002,1
99900003,1
99900004,0
99900005,1
...,...
99935598,0
99935599,0
99935600,0
99935601,1


In [123]:
result.to_csv('Sample_Submission.csv')