In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder, KBinsDiscretizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
class CustomOrdinalEncoder:
    """Map a fixed list of categories to integer codes and back.

    The code of a category is its position in ``categories``.

    Parameters
    ----------
    categories : sequence of hashable
        Category labels; the i-th label is encoded as ``i``.
    """

    def __init__(self, categories):
        self.categories = categories
        self.cat_to_int = {}
        self.int_to_cat = {}
        for i, cat in enumerate(self.categories):
            self.cat_to_int[cat] = i
            self.int_to_cat[i] = cat

    def transform(self, data):
        """Encode ``data`` as a NumPy array of category codes.

        Values that are not in ``categories`` (which includes missing
        values) are encoded as ``np.nan``.
        """
        return np.array([self.cat_to_int[cat] if cat in self.cat_to_int else np.nan for cat in data])

    def inverse_transform(self, data):
        """Decode an iterable of category codes back to category labels.

        Codes are truncated with ``int()`` before the lookup. Missing codes
        (NaN / None), such as the ones ``transform`` emits for unknown
        values, are decoded to ``np.nan`` instead of raising.

        Raises
        ------
        KeyError
            If a non-missing code does not correspond to any category.
        """
        codes = list(data)
        missing = [pd.isna(code) for code in codes]
        if not any(missing):
            # Common case: identical to a plain list-to-array conversion.
            return np.array([self.int_to_cat[int(code)] for code in codes])

        # An object array keeps NaN as a real missing value; a plain
        # np.array() of strings plus NaN would turn it into the text 'nan'.
        decoded = np.empty(len(codes), dtype=object)
        for position, (code, is_missing) in enumerate(zip(codes, missing)):
            decoded[position] = np.nan if is_missing else self.int_to_cat[int(code)]
        return decoded

def encode_ordinal_columns(df, ordinal_columns, n_classes):
    """Integer-encode the given columns of ``df``.

    For every column the observed non-missing values are sorted and used as
    the category list, so the codes follow the sort order of the values
    (lexicographic for strings). If fewer than ``n_classes`` values are
    observed, the list is padded with ``extra_class_<k>`` placeholders.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    ordinal_columns : iterable of str
        Names of the columns to encode.
    n_classes : int
        Minimum number of categories each encoder should know about.

    Returns
    -------
    tuple
        ``(encoded_df, encoders)`` where ``encoders`` maps each column name
        to the ``CustomOrdinalEncoder`` used for it.
    """
    result = df.copy()
    fitted = {}
    for name in ordinal_columns:
        observed = list(df[name].dropna().unique())
        observed.sort()
        n_placeholders = n_classes - len(observed)
        padding = [f"extra_class_{k}" for k in range(n_placeholders)]
        fitted[name] = CustomOrdinalEncoder(observed + padding)
        result[name] = fitted[name].transform(df[name])
    return result, fitted

def impute_missing_ordinal_records(df, ordinal_columns, n_classes=5, max_iter=10, random_state=42):
    """Fill missing values of ordinal columns with a multivariate imputer.

    The ordinal columns are integer-encoded, an ``IterativeImputer`` backed
    by a ``RandomForestRegressor`` is fitted on every column of ``df``, the
    imputed ordinal codes are rounded to the nearest integer and finally
    decoded back to their original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is not modified.
    ordinal_columns : list of str
        Columns to encode before, and decode after, the imputation.
    n_classes : int, default 5
        Forwarded to ``encode_ordinal_columns``.
    max_iter : int, default 10
        Maximum number of imputation rounds.
    random_state : int, default 42
        Seed for both the imputer and the random forest.

    Returns
    -------
    pandas.DataFrame
        Imputed frame with the same columns and the same index as ``df``.
    """
    encoded_df, encoders = encode_ordinal_columns(df, ordinal_columns, n_classes)

    imputer = IterativeImputer(
        max_iter=max_iter,
        estimator=RandomForestRegressor(random_state=random_state),
        random_state=random_state,
    )
    imputed_array = imputer.fit_transform(encoded_df)

    # Keep the caller's row labels: without index=df.index the result gets a
    # fresh 0..n-1 index and no longer aligns with frames derived from `df`
    # (e.g. after a dropna()) in a later concat or column assignment.
    imputed_df = pd.DataFrame(imputed_array, columns=df.columns, index=df.index)
    imputed_df[ordinal_columns] = np.round(imputed_df[ordinal_columns])

    for col in ordinal_columns:
        imputed_df[col] = encoders[col].inverse_transform(imputed_df[col])

    return imputed_df

def encode_non_ordinal_columns(df, non_ordinal_columns):
    """One-hot encode the given columns, keeping missing values missing.

    ``pd.get_dummies`` encodes a missing value as an all-zero row, which
    with ``drop_first=True`` is indistinguishable from the dropped reference
    category. To let a downstream imputer see those gaps, the dummy columns
    of every row whose source value was missing are set back to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    non_ordinal_columns : list of str
        Columns to one-hot encode (first category dropped).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the listed columns replaced by their dummy columns.
        Dummy columns of source columns that contain missing values are
        returned as float so they can hold NaN.
    """
    encoded_df = pd.get_dummies(df, columns=non_ordinal_columns, drop_first=True)
    for col in non_ordinal_columns:
        missing = df[col].isna()
        if not missing.any():
            continue
        # Re-derive the exact dummy column names produced for this column.
        dummy_names = pd.get_dummies(df[col], prefix=col, drop_first=True).columns
        for name in dummy_names:
            encoded_df[name] = encoded_df[name].astype(float).mask(missing.to_numpy())
    return encoded_df

def impute_missing_non_ordinal_records(df, max_iter=10, random_state=42):
    """Fill every missing value in ``df`` with a multivariate imputer.

    An ``IterativeImputer`` backed by a ``RandomForestRegressor`` is fitted
    on all columns of ``df``. Imputed values are the regressor's predictions
    and are not rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is not modified.
    max_iter : int, default 10
        Maximum number of imputation rounds.
    random_state : int, default 42
        Seed for both the imputer and the random forest.

    Returns
    -------
    pandas.DataFrame
        Imputed frame with the same columns and the same index as ``df``.
    """
    imputer = IterativeImputer(
        max_iter=max_iter,
        estimator=RandomForestRegressor(random_state=random_state),
        random_state=random_state,
    )
    imputed_array = imputer.fit_transform(df)

    # Preserve the row labels so the result aligns with frames that share
    # df's index in a later concat(axis=1).
    imputed_df = pd.DataFrame(imputed_array, columns=df.columns, index=df.index)
    return imputed_df

def impute_most_common(df):
    """Fill the missing values of every column with that column's mode.

    The frame is updated in place and also returned. When a column has
    several modes, the first one reported by ``Series.mode()`` is used.
    Columns that contain no non-missing value have no mode and are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute (modified in place).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for column in df.columns:
        modes = df[column].mode()
        if modes.empty:
            # Entirely-missing column: nothing to impute with.
            continue
        # Assign the result back instead of `df[column].fillna(..., inplace=True)`:
        # the in-place call acts on a temporary Series and is a no-op under
        # pandas Copy-on-Write.
        df[column] = df[column].fillna(modes.iloc[0])
    return df

# Custom colors
class clr:
    """ANSI escape sequences for highlighting printed notebook output."""

    # Start of a highlighted span: bold (1) followed by bright green (92).
    S = '\033[1m\033[92m'
    # End of a highlighted span: reset all attributes (0).
    E = '\033[0m'

# The custom colors chosen here were generated with the 'I want hue' app.
# They were chosen to remain distinguishable for color-blind readers.
our_colors = ["#af953c", "#6971c9", "#56ae6c",
             "#a24f99", "#ba4a4f"]

# Optionally, create a color map from the palette, for use in later plots.
CMAP1 = ListedColormap(our_colors)

# Display our own color scheme, as a reference.
# The heading is wrapped in clr.S / clr.E so it prints highlighted.
print(clr.S+'Notebook Color Scheme:\n'+clr.E)
sns.palplot(sns.color_palette(our_colors))
plt.show()

In [None]:
survey_df = pd.read_csv("Surveydata_train.csv")
survey_df_test = pd.read_csv("Surveydata_test.csv")
display(survey_df.head())
display(survey_df_test.head())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() would also render a stray "None".
survey_df.info()
survey_df_test.info()

In [None]:
display(survey_df.iloc[:,2:17].describe(include = 'all'))
display(survey_df_test.iloc[:,1:16].describe(include = 'all'))

In [None]:
display(survey_df.isna().sum())
display(survey_df_test.isna().sum())

In [None]:
travel_df = pd.read_csv("Traveldata_train.csv")
display(travel_df.head())
travel_df_test = pd.read_csv("Traveldata_test.csv")
display(travel_df_test.head())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() would also render a stray "None".
travel_df.info()
travel_df_test.info()

In [None]:
display(travel_df.iloc[:,1:9].describe(include ='all'))
display(travel_df_test.iloc[:,1:9].describe(include ='all'))

In [None]:
display(travel_df.isna().sum())
display(travel_df_test.isna().sum())

In [None]:
# Merge the survey and travel datasets on their shared 'ID' column.
# No `how` argument is passed, so pd.merge uses its default inner join:
# only IDs present in both frames end up in the merged result.

merged_df = pd.merge(survey_df, travel_df, on= 'ID')
display(merged_df.head())

merged_df_test = pd.merge(survey_df_test, travel_df_test, on= 'ID')
display(merged_df_test.head())

In [None]:
# Shape of the original and merged data
display(travel_df.shape)
display(survey_df.shape)
display(merged_df.shape)

## Inspecting the merged data

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() would also render a stray "None".
merged_df.info()
merged_df_test.info()

In [None]:
display(merged_df.isna().sum())
display(merged_df_test.isna().sum())

In [None]:
# Encode every categorical answer of the training data as an integer.
# Each .replace() call below is applied to the WHOLE frame, not to a single
# column; the column names in the comments only say which column(s) the
# mapping is intended for.
transformed_df = (
    merged_df
    # Rating scale (5 = best ... 0 = worst), intended for:
    # 'Seat_comfort', 'Arrival_time_convenient', 'Catering', 'Onboardwifi_service', 'Onboard_entertainment', 'Online_support',
    # 'Onlinebooking_Ease', 'Onboard_service', 'Leg_room', 'Checkin_service', 'Cleanliness', 'Online_boarding'
    .replace(['excellent', 'good', 'acceptable', 'need improvement', 'poor', 'extremely poor'], [5, 4, 3, 2, 1, 0])
    # Platform_location ('need improvement' was already mapped to 2 by the call above)
    .replace(['very convinient', 'Convinient', 'manageable', 'need improvement', 'Inconvinient', 'very inconvinient'], [5, 4, 3, 2, 1, 0])
    # Seat_Class
    .replace(['Ordinary', 'Green Car'], [0, 1])
    # Baggage_handling (all five labels were already mapped, to the same codes,
    # by the first call, so this call has nothing left to replace)
    .replace(['need improvement', 'poor', 'excellent', 'acceptable', 'good'], [2, 1, 5, 3, 4])
    # Gender
    .replace(['Male', 'Female'], [0, 1])
    # CustomerType
    .replace(['disloyal Customer', 'Loyal Customer'], [0, 1])
    # TypeTravel
    .replace(['Personal Travel', 'Business travel'], [0, 1])
    # Travel_Class
    .replace(['Eco', 'Business'], [0, 1])
)

In [None]:
# Encode the test data with the same value-to-integer mappings as the
# training data. Each .replace() call is applied to the whole frame.
transformed_test_df = (
    merged_df_test
    # Rating scale: 5 = 'excellent' ... 0 = 'extremely poor'
    .replace(['excellent', 'good', 'acceptable', 'need improvement', 'poor', 'extremely poor'], [5, 4, 3, 2, 1, 0])
    # Platform_location
    .replace(['very convinient', 'Convinient', 'manageable', 'need improvement', 'Inconvinient', 'very inconvinient'], [5, 4, 3, 2, 1, 0])
    # Seat_Class
    .replace(['Ordinary', 'Green Car'], [0, 1])
    # Baggage_handling (labels already mapped, to the same codes, by the first call)
    .replace(['need improvement', 'poor', 'excellent', 'acceptable', 'good'], [2, 1, 5, 3, 4])
    # Gender
    .replace(['Male', 'Female'], [0, 1])
    # CustomerType
    .replace(['disloyal Customer', 'Loyal Customer'], [0, 1])
    # TypeTravel
    .replace(['Personal Travel', 'Business travel'], [0, 1])
    # Travel_Class
    .replace(['Eco', 'Business'], [0, 1])
)

## Checking for distinct values of the data.

Looking at the heatmap, we see the correlation among the various features. Grouping together the values of these correlated features gives us an indication of how we may impute the missing values.

A better idea would probably be to plot these...


In [None]:
# Overall_Experience and Onboard_entertainment
# Count the rows for every (Overall_Experience, Onboard_entertainment) pair.
# .size() is used instead of selecting the grouping columns again and calling
# value_counts(): selecting several columns with a bare tuple of names raises
# on pandas >= 2.0, and reset_index() would collide with the group keys.
merged_df.groupby(['Overall_Experience', 'Onboard_entertainment']).size().reset_index(name='count')

## Drop data or impute?

In [None]:
display(len(merged_df))
display(len(merged_df.dropna()))

**Conclusion:** By dropping data, we'd lose almost half the dataset. Ramona's research suggested a multivariate iterative imputer.

In [None]:
ordinal_columns = [
    'Seat_comfort', 'Arrival_time_convenient', 'Catering', 'Platform_location', 'Onboardwifi_service', 
    'Onboard_entertainment', 'Online_support', 'Onlinebooking_Ease', 'Onboard_service', 
    'Leg_room', 'Baggage_handling', 'Checkin_service', 'Cleanliness', 'Online_boarding'
    ]

In [None]:
ordinal_imputed = impute_missing_ordinal_records(transformed_df[ordinal_columns], ordinal_columns, n_classes=6)

In [None]:
categorical_columns = [
    'Seat_Class', 'Gender', 'CustomerType', 'TypeTravel', 'Travel_Class'
]

In [None]:
encoded_non_ordinal_df = encode_non_ordinal_columns(transformed_df[categorical_columns], categorical_columns)

encoded_df = pd.concat([encoded_non_ordinal_df, ordinal_imputed], axis=1)

categorical_imputed = impute_missing_non_ordinal_records(encoded_df)

In [None]:
numerical_data = [
    'ID', 'Overall_Experience', 'Age', 'Travel_Distance', 'DepartureDelay_in_Mins', 'ArrivalDelay_in_Mins'
]

In [None]:
transformed_df[numerical_data].isna().sum()

In [None]:
final_df = impute_most_common(pd.concat([categorical_imputed, transformed_df[numerical_data]], axis=1))

**Conclusions**: 
 - Ordinal and non-ordinal categorical data were imputed with a multivariate imputer.
 - Remaining missing numerical values (approx. 500) were imputed with the most common value.

### Imputing Cleanliness

In [None]:
merged_df[merged_df['Cleanliness'].isna()]

**Conclusion**: Missing Cleanliness values are predominantly for fairly poor experiences: 
 - Overall_Experience 1
 - Seat_comfort extremely poor
 - Seat_Class Green Car
 - Arrival_time_convenient poor
 - Catering extremely poor
 - Platform_location manageable
 - Onboard_entertainment extremely poor
 - TypeTravel Personal Travel
 - Travel_Class Eco


In [None]:
similar_cleanliness = merged_df.query("Overall_Experience == 1 & Seat_comfort == 'extremely poor' & Seat_Class == 'Green Car' & Arrival_time_convenient == 'poor' & Catering == 'extremely poor' & Platform_location == 'manageable' & Onboard_entertainment == 'extremely poor' & TypeTravel == 'Personal Travel' & Travel_Class == 'Eco'")

In [None]:
similar_cleanliness['Cleanliness'].value_counts()

In [None]:
similar_cleanliness.dropna(subset=['Cleanliness'])[['Online_support', 'Onboardwifi_service', 'Age', 'Cleanliness']]

In [None]:
tech_cleanliness = transformed_df[['Online_support', 'Onboardwifi_service', 'Cleanliness']]
sns.barplot(data=tech_cleanliness, x='Online_support', y='Cleanliness', hue='Onboardwifi_service', palette=our_colors)
plt.show()

In [None]:
merged_df[merged_df['Cleanliness'].isna()][['Online_support', 'Onboardwifi_service', 'Cleanliness']]

In [None]:
# .assign() returns a new frame, so the binned Age column is never written
# into a slice of transformed_df (which raises SettingWithCopyWarning).
# pd.cut with an integer splits the observed Age range into 5 equal-width
# bins; the labels only name those bins.
# NOTE(review): the labels need not match the actual bin edges - confirm.
gender_age_cleanliness = (
    transformed_df[['Age', 'Gender', 'Cleanliness']]
    .assign(Age=pd.cut(merged_df['Age'], 5, labels = ['25', '35', '45', '60', '80']))
)
sns.barplot(data=gender_age_cleanliness, x='Age', y='Cleanliness', hue='Gender')
plt.show()

**Conclusion:** 
From exploring the dataset, it seems that:
 - Cleanliness rating is independent of Gender and Age.
 - Cleanliness depends more on tech services, like wifi and online support.
 - Similar reviews to the ones that need to be imputed are Acceptable/Good in a 1:1 ratio.
 - Based on the bar chart comparing Cleanliness values per wifi and online support rating, I'd suggest imputing 'good' for all but one missing value (the one is a combination of 'need improvement' for online support and 'acceptable' for wifi).

### Imputing Online boarding

In [None]:
merged_df[merged_df['Online_boarding'].isna()]

In [None]:
similar_boarding = merged_df.query("Overall_Experience == 1 & Seat_comfort == 'extremely poor' & Seat_Class == 'Green Car' & Arrival_time_convenient == 'poor' & Catering == 'extremely poor' & Platform_location == 'manageable' & Onboard_entertainment == 'extremely poor' & TypeTravel == 'Personal Travel' & Travel_Class == 'Eco'")

In [None]:
similar_boarding['Online_boarding'].value_counts()

In [None]:
tech_boarding = transformed_df[['Online_support', 'Onboardwifi_service', 'Online_boarding']]
sns.barplot(data=tech_boarding, x='Online_support', y='Online_boarding', hue='Onboardwifi_service', palette=our_colors)
plt.show()

In [None]:
merged_df[merged_df['Online_boarding'].isna()][['Online_support', 'Onboardwifi_service', 'Online_boarding']]

In [None]:
# .assign() returns a new frame, so the binned Age column is never written
# into a slice of transformed_df (which raises SettingWithCopyWarning).
# pd.cut with an integer splits the observed Age range into 5 equal-width
# bins; the labels only name those bins.
# NOTE(review): the labels need not match the actual bin edges - confirm.
gender_age_boarding = (
    transformed_df[['Age', 'Gender', 'Online_boarding']]
    .assign(Age=pd.cut(merged_df['Age'], 5, labels = ['25', '35', '45', '60', '80']))
)
sns.barplot(data=gender_age_boarding, x='Age', y='Online_boarding', hue='Gender')
plt.show()

**Conclusions**:
 - Online boarding depends highly on other tech services.
 - Doesn't depend that much on age or gender.
 - I'd suggest an imputation based on the bar chart of other tech services: 'excellent' for both excellent rows, 'poor' for both poor rows, 'good' for the good row and 'acceptable' for the remaining row.

### Imputing Onboard entertainment

In [None]:
sns.clustermap(transformed_df.corr(), cmap="rocket_r")
plt.show()

In [None]:
sns.barplot(data=transformed_df[['Online_support', 'Seat_comfort', 'Onboard_entertainment']], x='Online_support', y='Onboard_entertainment', hue='Seat_comfort', palette=our_colors)
plt.show()

In [None]:
# Build the imputation input from the integer-encoded frame rather than from
# merged_df. encode_ordinal_columns() orders categories by sorting the observed
# values, so the raw string labels would be coded alphabetically
# ('acceptable' < 'excellent' < 'extremely poor' < ...) and the regression
# plus rounding in the imputer would run on a scale that is not ordinal.
# The imputed result therefore holds the integer rating codes (0-5).
fun_df = (
    transformed_df[['Overall_Experience', 'Online_support', 'Seat_comfort']]
    .dropna()
    .assign(Onboard_entertainment=transformed_df['Onboard_entertainment'])
)

# n_classes=6 matches the six-level rating scale used for the encoding.
fun_imputed = impute_missing_ordinal_records(fun_df, list(fun_df.columns), n_classes=6)
fun_imputed

**Conclusions**:
 - Onboard entertainment is correlated with Overall experience, Online Support and Seat Comfort
 - Imputation can use all three parameters to fill in the missing values
 - An imputer of categorical ordinal data was created

### Imputing Platform location

In [None]:
merged_df['Platform_location'].value_counts()

In [None]:
sns.clustermap(transformed_df.corr(), cmap="rocket_r")
plt.show()

**Heatmap** for `transformed_df`.

In [None]:
sns.heatmap(transformed_df.corr(), cmap=CMAP1)
plt.show()

Analyzing the heatmap we can conclude:
  - Strong correlation between `Overall_Experience` and `Onboard_entertainment`, `Onlinebooking_Ease`, `Onboard_service`, `Online_support`.
    - I want to analyze this further.

In [None]:
sns.barplot(data=transformed_df, x='Overall_Experience', y='Onboard_entertainment', hue='Onlinebooking_Ease', palette=our_colors)
plt.show()

In [None]:
sns.barplot(data=transformed_df, x='Arrival_time_convenient', y='Platform_location', hue='Catering', palette=our_colors)
plt.show()

**Conclusions:**
 - Platform location has its own set of categories: 'very convinient', 'Convinient', 'manageable', 'need improvement', 'Inconvinient', 'very inconvinient'
 - Platform location is correlated with Arrival time convenience and Catering
 - Imputer was created to impute missing data