# Setup

Importing the Necessary Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import preprocessing

ModuleNotFoundError: No module named 'pandas'

In [None]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

Data import

In [None]:
# Read the survey export; the path is relative to the notebook's working directory.
airline_data = pd.read_csv("data/test.csv")
airline_data.head(n=6)

# Data Cleanup

In [None]:
# Row identifiers carry no information about satisfaction, so remove them.
columns_to_drop = ['Unnamed: 0', 'id']
airline_data = airline_data.drop(labels=columns_to_drop, axis=1)
airline_data.head(n=6)

I dropped unnecessary columns, namely `id` and the unnamed numeric index column.

In [None]:
# Frame dimensions first, then the dtype of every column.
print(airline_data.shape, airline_data.dtypes, sep="\n")

In [None]:
# True when at least one cell anywhere in the frame is missing.
airline_data.isna().to_numpy().any()

In [None]:
# Number of missing values per column.
airline_data.isna().sum()

Here I can see that there are some null values in "Arrival Delay in Minutes". Normally we would remove all rows containing nulls, but intuition suggests that a null might simply mean a delay of 0 (no delay). I therefore checked the data and found that there are rows where 0 is recorded explicitly, so a null does not necessarily mean 0. Since the Kaggle dataset description does not explain what the missing values represent, the rows with null values need to be dropped.

In [None]:
# Keep only the rows that have a recorded arrival delay, then confirm no nulls remain.
has_arrival_delay = airline_data['Arrival Delay in Minutes'].notna()
airline_data = airline_data[has_arrival_delay]
airline_data.isna().to_numpy().any()

The data is now clean, now I will divide the features and the label, in different data-frames, to be used later

In [None]:
# Separate predictors from the target by column name rather than by position,
# so the split keeps working if columns are added, removed or reordered upstream.
airline_features = airline_data.drop(columns=['satisfaction'])  # Independent variables
airline_satisfaction = airline_data['satisfaction']  # Outcome variable
airline_features.head()

In [None]:
# Preview the first few target labels.
airline_satisfaction.head(n=5)

The data has been cleaned now

# Exploratory Data Analysis

The central goal for EDA would be to examine the overall dataset and perform feature reduction and data inspection, to build a clean model. To perform this, I will follow the following steps, in this order to get the best result:

## Analysis for Target

In [None]:
# Bar chart of how many responses fall into each satisfaction class.
sns.countplot(data=airline_data, x='satisfaction')

This plot reveals that the final outcome variable of satisfaction is almost equally distributed. Therefore, no new false dummy value creation of sorts is required

## Univariate Analysis

Goal: I will examine each feature individually to understand its distribution, variability, and potential for feature reduction.

### "Continuous Numerical Features" Data Analysis

In [None]:
# Light grid background for this and the following figures.
sns.set_style("whitegrid")

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
plt.subplots_adjust(hspace=0.4, wspace=0.3)

# (column, bar colour, panel title, x-axis label); panels are filled row by row.
histogram_specs = [
    ('Age', '#2ecc71', 'Age Distribution', 'Age'),
    ('Flight Distance', '#3498db', 'Flight Distance Distribution', 'Distance (km)'),
    ('Departure Delay in Minutes', '#2ecc71', 'Departure Delay Distribution', 'Delay (minutes)'),
    ('Arrival Delay in Minutes', '#3498db', 'Arrival Delay Distribution', 'Delay (minutes)'),
]
for panel, (column, colour, panel_title, x_label) in zip(axes.flat, histogram_specs):
    sns.histplot(airline_data[column], color=colour, fill=True, ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

fig.suptitle('Airline Numeric Data Histograms', fontsize=16)

plt.show()


In [None]:
# One scatter panel per delay column, each plotted against the row index.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

delay_panels = [
    ('Arrival Delay in Minutes', 'Arrival Delay Distribution (Plot 1)'),
    ('Departure Delay in Minutes', 'Departure Delay Distribution (Plot 2)'),
]
for panel, (column, panel_title) in zip(axes, delay_panels):
    panel.scatter(x=airline_data.index, y=airline_data[column], color='#3498db', alpha=0.6)
    panel.set_title(panel_title)
    panel.set_xlabel('Index')
    panel.set_ylabel(column)

# Keep titles and axis labels from overlapping between the two panels.
plt.tight_layout()

plt.show()


### "Discrete Numerical Features" Data Analysis

In [None]:
sns.set_style("whitegrid")

fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(14, 20))
plt.subplots_adjust(hspace=0.6, wspace=0.4)
palette = sns.color_palette("Set2")

# (column name in the data, title shown above the panel)
attributes = [
    ('Inflight wifi service', 'Inflight Wifi Service'),
    ('Departure/Arrival time convenient', 'Departure/Arrival Time Convenient'),
    ('Ease of Online booking', 'Ease of Online Booking'),
    ('Gate location', 'Gate Location'),
    ('Food and drink', 'Food and Drink'),
    ('Online boarding', 'Online Boarding'),
    ('Seat comfort', 'Seat Comfort'),
    ('Inflight entertainment', 'Inflight Entertainment'),
    ('On-board service', 'On-board Service'),
    ('Leg room service', 'Leg room Service'),
    ('Baggage handling', 'Baggage Handling'),
    ('Checkin service', 'Check-in Service'),
    ('Inflight service', 'Inflight Service'),
    ('Cleanliness', 'Cleanliness')
]

# axes.flat walks the 7x2 grid row by row, matching the order of `attributes`;
# colours cycle through the palette when there are more panels than colours.
for position, (panel, (column, panel_title)) in enumerate(zip(axes.flat, attributes)):
    bar_colour = palette[position % len(palette)]
    sns.histplot(airline_data[column], color=bar_colour, fill=True, ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel('Rating')
    panel.set_ylabel('Frequency')

fig.suptitle('Customer Satisfaction Ratings Across Various Categories', fontsize=16, y=1.02)
plt.show()

## Multivariate Analysis

Goal: This part of the analysis includes looking at the correlation between multiple variables, so that I may remove some, to prevent over-fitting

In [None]:
plt.figure(figsize=(12, 8))  # Set the figure size (width, height in inches)

# Pearson correlation is only defined for numeric columns. The feature frame
# still contains categorical columns at this point (they are one-hot encoded
# later), and recent pandas versions raise instead of silently skipping them,
# so restrict the frame to numeric columns explicitly.
numeric_features = airline_features.select_dtypes(include='number')

sns.heatmap(numeric_features.corr(),
            cmap="YlGnBu",                # Colormap for vibrant colors
            annot=True,                   # Annotate with correlation values
            fmt=".2f",                    # Format values to 2 decimal places
            linewidths=0.5,               # Add space between cells
            annot_kws={"size": 10},       # Set annotation font size
            cbar_kws={"shrink": 0.8})     # Shrink color bar for better fit

plt.title('Correlation Matrix of Airline Features', fontsize=16, pad=15)
plt.show()


Based on this analysis, the highly correlated groups of variables were examined one by one in a sandbox environment, and redundant variables were dropped as part of feature selection.

## Sandbox Testing of Each variable with the correlation pairs

In [None]:
# Outlier markers are hidden (showfliers=False) purely to keep the boxes readable;
# the underlying data is untouched and the printed statistics include every row.

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# (column, panel title, heading printed above that column's statistics)
delay_panels = [
    ('Departure Delay in Minutes',
     'Departure Delay vs Satisfaction (No Outliers)',
     "Departure Delay Statistics by Satisfaction Level:"),
    ('Arrival Delay in Minutes',
     'Arrival Delay vs Satisfaction (No Outliers)',
     "\nArrival Delay Statistics by Satisfaction Level:"),
]

delay_summaries = {}
for panel, (column, panel_title, heading) in zip(axes, delay_panels):
    sns.boxplot(x='satisfaction', y=column, data=airline_data, ax=panel, showfliers=False)
    panel.set_title(panel_title)
    panel.set_xlabel('Satisfaction')
    panel.set_ylabel(column)

    # Per-class summary (count, mean, quartiles, ...) for this delay column.
    summary = airline_data.groupby('satisfaction')[column].describe()
    delay_summaries[column] = summary

    print(heading)
    for level, row in summary.iterrows():
        print(f"{level}: Mean: {row['mean']:.1f}, Median: {row['50%']:.1f}, Q1: {row['25%']:.1f}, Q3: {row['75%']:.1f}")

# Keep the per-column summaries available under their original names.
dep_delay_stats = delay_summaries['Departure Delay in Minutes']
arr_delay_stats = delay_summaries['Arrival Delay in Minutes']

plt.tight_layout()
plt.show()


Analysis: Both variables have a similar distribution and a similar relationship with satisfaction, so either one could be chosen. As arrival delay tends to have slightly fewer outliers, it was chosen for the model. Chosen variable: "Arrival Delay in Minutes"

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 12))
plt.subplots_adjust(hspace=0.4, wspace=0.4)

# One box plot per rating, split by satisfaction level; the grid fills row by row.
rating_columns = [
    'Inflight wifi service',
    'Ease of Online booking',
    'Gate location',
    'Departure/Arrival time convenient',
]
for panel, rating in zip(axes.flat, rating_columns):
    sns.boxplot(x='satisfaction', y=rating, data=airline_data, ax=panel)
    panel.set_title(rating)

plt.show()

Based on the plots, I could see that "Ease of Online booking" and "Inflight wifi service" are better predictors, as their values are distributed differently for satisfied vs dissatisfied passengers. For example, for both of them, people who are neutral or dissatisfied have voted primarily between 2 and 3, while people who are satisfied range from 2 to 4, with an even greater spread across all values. Therefore both of them were chosen and kept, while the other two were dropped.

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 12))
plt.subplots_adjust(hspace=0.4, wspace=0.4)

# Compare three related ratings side by side, split by satisfaction level.
rating_columns = ['Food and drink', 'Seat comfort', 'Inflight entertainment']
for panel, rating in zip(axes, rating_columns):
    sns.boxplot(x='satisfaction', y=rating, data=airline_data, ax=panel)
    panel.set_title(rating)

plt.show()

The first variable dropped from this group is "Food and drink": compared to the other two variables it is not the strongest predictor, as its distribution is similar for dissatisfied and satisfied passengers. The other two were kept.

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 12))
plt.subplots_adjust(hspace=0.4, wspace=0.4)

# Same comparison as before with 'Cleanliness' added; the grid fills row by row.
rating_columns = ['Food and drink', 'Cleanliness', 'Seat comfort', 'Inflight entertainment']
for panel, rating in zip(axes.flat, rating_columns):
    sns.boxplot(x='satisfaction', y=rating, data=airline_data, ax=panel)
    panel.set_title(rating)

plt.show()

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 12))
plt.subplots_adjust(hspace=0.4, wspace=0.4)

# Service-related ratings, each split by satisfaction level.
rating_columns = ['On-board service', 'Baggage handling', 'Inflight service']
for panel, rating in zip(axes, rating_columns):
    sns.boxplot(x='satisfaction', y=rating, data=airline_data, ax=panel)
    panel.set_title(rating)

plt.show()

In [None]:
# Features removed after the sandbox comparison of correlated groups.
# 'Food and drink' was listed twice in the original list; each column is now named once.
# NOTE(review): the narrative above says 'Inflight wifi service' and 'Seat comfort'
# were kept, yet both are dropped here - confirm which of the two is intended.
columns_to_drop = [
    'Departure Delay in Minutes',
    'Gate location',
    'Departure/Arrival time convenient',
    'Food and drink',
    'Online boarding',
    'Cleanliness',
    'Seat comfort',
    'On-board service',
    'Inflight wifi service',
]
airline_data = airline_data.drop(columns=columns_to_drop)
airline_data.head()

In [None]:
# Independent variables: every remaining column except the target. Selecting by
# name avoids a hard-coded column count, which silently leaves out trailing
# feature columns whenever it does not match the frame left by the previous drop.
airline_features = airline_data.drop(columns=['satisfaction'])

plt.figure(figsize=(12, 8))

# Correlation is only defined for numeric columns; the categorical columns are
# still present here, and recent pandas versions raise on them instead of
# skipping them, so restrict the frame to numeric columns explicitly.
numeric_features = airline_features.select_dtypes(include='number')

sns.heatmap(numeric_features.corr(),
            cmap="YlGnBu",                # Colormap for vibrant colors
            annot=True,                   # Annotate with correlation values
            fmt=".2f",                    # Format values to 2 decimal places
            linewidths=0.5,               # Add space between cells
            annot_kws={"size": 10},       # Set annotation font size
            cbar_kws={"shrink": 0.8})     # Shrink color bar for better fit

plt.title('Correlation Matrix of Airline Features', fontsize=16, pad=15)
plt.show()

## Encoding of Categorical Values

In [None]:
# One-hot encode the categorical predictors; drop_first removes one level per
# category so the dummy columns are not perfectly collinear.
airline_features_enc = pd.get_dummies(data=airline_features, drop_first=True)

In [None]:
# Preview the encoded feature matrix.
airline_features_enc.head(n=5)

In [None]:
# Dimensions and column dtypes after encoding.
print(airline_features_enc.shape, airline_features_enc.dtypes, sep="\n")

In [None]:
# Correlation heatmap of the one-hot encoded feature matrix.
# The positional re-slice of `airline_features` that used to open this cell was
# removed: the plot only reads `airline_features_enc`, and `airline_features`
# already holds the reduced feature frame from the earlier cell.
plt.figure(figsize=(12, 8))

sns.heatmap(airline_features_enc.corr(),
            cmap="YlGnBu",                # Colormap for vibrant colors
            annot=True,                   # Annotate with correlation values
            fmt=".2f",                    # Format values to 2 decimal places
            linewidths=0.5,               # Add space between cells
            annot_kws={"size": 10},       # Set annotation font size
            cbar_kws={"shrink": 0.8})     # Shrink color bar for better fit

plt.title('Correlation Matrix of Airline Features', fontsize=16, pad=15)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Show the raw string labels before encoding.
target = airline_data["satisfaction"]
print(target.head())

# LabelEncoder numbers the classes in sorted order; fit() returns the encoder itself.
label_encoder = LabelEncoder().fit(target)
target = label_encoder.transform(target)
print(target)

# Decision Tree

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Model inputs: the one-hot encoded predictors and the raw string target.
X, y = airline_features_enc, airline_data["satisfaction"]

I have stored the variables in x and y, for easier calculation.

I am using a standard scaler to scale the values of X and transforming it, to be used for the model

In [None]:
# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test split in
# the next cell, so test-set statistics influence the scaling.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

I have built the decision tree model and stored it in a variable named `clf`. The random state ensures that the output is the same each time I run the cell. The class weight lets me compensate for the problem identified earlier during EDA, namely that the target variable counts are not balanced.

In [None]:
# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)
# Base estimator for the grid search; class weights are balanced automatically.
clf = DecisionTreeClassifier(class_weight="balanced", random_state=42)

I will be using grid search, with a cv of 5 to find the optimal values for Decision tree. 

I have now initialised the parameter grid for the decision tree. ***Max Depth*** controls how deep the tree may grow; the grid tests low to high complexity (5, 10, 15 and 20). By using this, we check whether a shallow tree is sufficient or whether more depth is required. Next, ***Min Samples Split*** has a lowest value of 2, which allows the tree to split a node as long as there are at least two samples in it. Larger values such as 5 or 10 require a node to contain at least 5 or 10 samples before it may be split, which enforces more conservative splits. For ***Min Samples Leaf***, a value of 1 allows leaf nodes to contain just one sample, which lets the tree grow without much restriction. Values of 2 and 4 add the constraint that leaf nodes must contain at least 2 or 4 samples, respectively. This smooths the model in some cases by preventing it from creating very small, potentially unreliable leaves. Finally, ***Criterion*** compares the Gini impurity and entropy split-quality measures.

In [None]:
# Candidate hyper-parameters; every combination is scored with 5-fold CV.
param_grid = dict(
    max_depth=[5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# fit() returns the fitted search object, so construction and fitting can be chained.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy').fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

In [None]:
# Final tree with hand-set hyper-parameters (they are not read from grid_search).
decision_tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42
)

# Fit on the training split only. Fitting on the full matrix would let the model
# see the X_test rows that later cells use for the ROC curve, which makes every
# "test" figure optimistic.
decision_tree.fit(X_train, y_train)

# Predictions for every row, kept the same length as `y` for code that compares the two.
y_pred = decision_tree.predict(X_scaled)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Score the tree on the held-out split. Comparing whole-dataset predictions with
# `y` mostly measures how well the tree reproduces rows it was fitted on, and it
# is not comparable with the other models, which are all scored on X_test.
y_pred = decision_tree.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label="satisfied")
precision = precision_score(y_test, y_pred, pos_label="satisfied")
recall = recall_score(y_test, y_pred, pos_label="satisfied")
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['neutral or dissatisfied', 'satisfied']))

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc

# Turn the string labels into integer codes (classes are numbered in sorted order).
label_encoder = LabelEncoder().fit(y_test)
y_test_encoded = label_encoder.transform(y_test)

# Probability assigned to the second class, used as the ROC score.
y_prob = decision_tree.predict_proba(X_test)[:, 1]

# False/true positive rates at every threshold, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_prob)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
roc_ax.set_title('ROC Curve for Decision Tree')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc='lower right')
roc_ax.grid()
plt.show()

In [None]:
# Visualize the decision tree
plt.figure(figsize=(70, 30))
plot_tree(
    decision_tree,
    # A plain list is accepted by every scikit-learn version; a pandas Index is not.
    feature_names=list(X.columns),
    # Label nodes with the real class names in the order the fitted tree uses,
    # instead of generic 'No'/'Yes' labels that do not appear in the data.
    class_names=[str(label) for label in decision_tree.classes_],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title("Decision Tree Visualization")
plt.show()

# Random Forest Model

I have imported the Random Forest Classifier from sklearn's ensemble module, which is a learning method that operates by constructing multiple decision trees during training. I maintained consistency with our previous analysis by using the same 80-20 train-test split and random state of 42. This ensures we can make fair comparisons between our models.

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Same inputs as the decision tree section: encoded predictors and string target.
X, y = airline_features_enc, airline_data["satisfaction"]

I have stored the variables in x and y, for easier calculation.

I am using a standard scaler to scale the values of X and transforming it, to be used for the model

In [None]:
# Standardise the features again so this section can run on its own.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

I have built the random forest model and stored it in a variable named `rf`. The random state ensures that the output is the same each time I run the cell. Unlike the decision tree above, no class weighting is applied to this model.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same 80/20 split and seed as the decision tree, so results are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)
# Base estimator handed to the grid search below.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Candidate hyper-parameters for the forest; each combination gets 5-fold CV.
param_grid = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# fit() returns the fitted search object, so construction and fitting can be chained.
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy').fit(X_train, y_train)

# The winning configuration, already refitted on the whole training split.
best_rf = grid_search.best_estimator_
print(best_rf)

For the Random Forest model, I implemented a grid search over multiple hyperparameters. The n_estimators parameter tests varying numbers of trees from 10 to 200, allowing us to find the sweet spot between computational efficiency and model robustness. The max_depth values mirror our decision tree analysis, while min_samples_split and min_samples_leaf parameters help control the complexity of individual trees within the forest. By using GridSearchCV with 5-fold cross-validation, it ensures that the parameter selection is robust and generalizable.  

In [None]:
# Hand-set hyper-parameters for the final forest (not read from grid_search).
rf_settings = {
    "n_estimators": 200,
    "max_depth": 20,
    "min_samples_split": 10,
    "random_state": 42,
}
rf_model = RandomForestClassifier(**rf_settings)

# Learn from the training split, then label the held-out rows.
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Held-out performance; "satisfied" is treated as the positive class.
accuracy = accuracy_score(y_test, y_pred)
f1, precision, recall = (
    metric(y_test, y_pred, pos_label="satisfied")
    for metric in (f1_score, precision_score, recall_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)

for heading, value in (
    ("Accuracy:", accuracy),
    ("F1 Score:", f1),
    ("Precision:", precision),
    ("Recall:", recall),
    ("Confusion Matrix:\n", conf_matrix),
):
    print(heading, value)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['neutral or dissatisfied', 'satisfied']))

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc

# Encode the labels
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)  # 'neutral or dissatisfied' -> 0, 'satisfied' -> 1

# Predict probabilities for ROC.
# This section evaluates the random forest, so the scores must come from
# rf_model; the cell previously re-used the decision tree from the section above.
y_prob = rf_model.predict_proba(X_test)[:, 1]

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_prob)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
plt.title('ROC Curve for Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid()
plt.show()

# Bagging Ensemble

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Standardise the encoded predictors and pick up the string target.
scaler = StandardScaler().fit(airline_features_enc)
X_scaled = scaler.transform(airline_features_enc)
y = airline_data["satisfaction"]

# 80/20 hold-out split with the same seed as the earlier tree-based sections.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# Bagging ensemble built from seeded decision trees.
base_tree = DecisionTreeClassifier(random_state=42)
bagging = BaggingClassifier(estimator=base_tree, random_state=42)

In [None]:
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False]
}

# Grid search for best parameters
grid_search = GridSearchCV(estimator=bagging, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best Bagging Classifier.
# With the default refit=True, GridSearchCV has already refitted this estimator
# on the whole training split, so fitting it again would only repeat the most
# expensive step of the cell and produce the same seeded model.
best_bagging = grid_search.best_estimator_
print(best_bagging)

# Predictions
y_pred = best_bagging.predict(X_test)

In [None]:
# Held-out performance of the bagging ensemble; "satisfied" is the positive class.
accuracy = accuracy_score(y_test, y_pred)
f1, precision, recall = (
    metric(y_test, y_pred, pos_label="satisfied")
    for metric in (f1_score, precision_score, recall_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)

for heading, value in (
    ("Accuracy:", accuracy),
    ("F1 Score:", f1),
    ("Precision:", precision),
    ("Recall:", recall),
    ("Confusion Matrix:\n", conf_matrix),
):
    print(heading, value)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['neutral or dissatisfied', 'satisfied']))

In [None]:
# Integer-code the labels: 'neutral or dissatisfied' -> 0, 'satisfied' -> 1.
label_encoder = LabelEncoder().fit(y_test)
y_test_encoded = label_encoder.transform(y_test)

# Score each held-out row with the probability of the second class.
y_prob = best_bagging.predict_proba(X_test)[:, 1]

# False/true positive rates at every threshold, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_prob)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
roc_ax.set_title('ROC Curve for Bagging Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc='lower right')
roc_ax.grid()
plt.show()

# Boosting Ensemble

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Standardise the encoded predictors and pick up the string target.
scaler = StandardScaler().fit(airline_features_enc)
X_scaled = scaler.transform(airline_features_enc)
y = airline_data["satisfaction"]

# 80/20 hold-out split with the same seed as the other ensemble sections.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# AdaBoost over seeded decision trees; the tree depth is tuned by the grid search.
base_tree = DecisionTreeClassifier(random_state=42)
boosting = AdaBoostClassifier(estimator=base_tree, random_state=42)

In [None]:
# Search space for AdaBoost; `estimator__max_depth` is routed to the base tree.
param_grid = dict(
    n_estimators=[10, 50, 100, 200],
    learning_rate=[0.01, 0.1, 0.5, 1.0],
    estimator__max_depth=[1, 2, 3, None],
)

# fit() returns the fitted search object, so construction and fitting can be chained.
grid_search = GridSearchCV(boosting, param_grid, cv=5, scoring='accuracy').fit(X_train, y_train)

# The winning configuration, already refitted on the whole training split.
best_boosting = grid_search.best_estimator_
print(best_boosting)

In [None]:
# best_boosting comes from GridSearchCV.best_estimator_, which (with the default
# refit=True) is already fitted on the whole training split, so a second fit
# would only repeat the training work and produce the same seeded model.

# Predictions
y_pred = best_boosting.predict(X_test)

In [None]:
# Held-out performance of the AdaBoost model; "satisfied" is the positive class.
# The result must be bound to `accuracy`: a misspelt name here left the print
# below reporting the accuracy computed for the previous model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label="satisfied")
precision = precision_score(y_test, y_pred, pos_label="satisfied")
recall = recall_score(y_test, y_pred, pos_label="satisfied")
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['neutral or dissatisfied', 'satisfied']))

In [None]:
# Integer-code the labels: 'neutral or dissatisfied' -> 0, 'satisfied' -> 1.
label_encoder = LabelEncoder().fit(y_test)
y_test_encoded = label_encoder.transform(y_test)

# Score each held-out row with the probability of the second class.
y_prob = best_boosting.predict_proba(X_test)[:, 1]

# False/true positive rates at every threshold, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_prob)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
roc_ax.set_title('ROC Curve for AdaBoost Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc='lower right')
roc_ax.grid()
plt.show()

# Gaussian Naive Bayes Model

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Unscaled encoded predictors and the string target.
X, y = airline_features_enc, airline_data["satisfaction"]

# 80/20 hold-out split (this section uses seed 23).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=23,
)

GNBclassifier = GaussianNB()

# 5-fold cross-validated accuracy on the training split.
cv_scores = cross_val_score(GNBclassifier, X_train, y_train, cv=5)

# fit() returns the classifier itself, so GNBmodel and GNBclassifier are the same object.
GNBmodel = GNBclassifier.fit(X_train, y_train)
GNBpreds = GNBmodel.predict(X_test)

# Held-out metrics with "satisfied" as the positive class.
precision, recall, f1 = (
    metric(y_test, GNBpreds, pos_label="satisfied")
    for metric in (precision_score, recall_score, f1_score)
)

# Accuracies are reported as percentages.
cv_accuracy_pct = cv_scores.mean() * 100
train_accuracy_pct = GNBmodel.score(X_train, y_train) * 100
test_accuracy_pct = accuracy_score(y_test, GNBpreds) * 100

print(f"Cross-validation training accuracy (mean): {cv_accuracy_pct:.2f}")
print(f"Training accuracy on full training set: {train_accuracy_pct:.2f}")
print(f"Testing accuracy: {test_accuracy_pct:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"f1: {f1:.2f}")
print(confusion_matrix(y_test, GNBpreds))

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Integer-code the labels: 'neutral or dissatisfied' -> 0, 'satisfied' -> 1.
label_encoder = LabelEncoder().fit(y_test)
y_test_encoded = label_encoder.transform(y_test)

# Score each held-out row with the probability of the second class.
y_prob = GNBmodel.predict_proba(X_test)[:, 1]

# False/true positive rates at every threshold, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_prob)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
roc_ax.set_title('ROC Curve for Gaussian Naive Bayes Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc='lower right')
roc_ax.grid()
plt.show()

# Linear Discriminant Model

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Unscaled encoded predictors and the string target.
X, y = airline_features_enc, airline_data["satisfaction"]

# 80/20 hold-out split (seed 23, as in the Naive Bayes section).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=23,
)

LDAmodel = LinearDiscriminantAnalysis()

# 5-fold cross-validated accuracy on the training split.
cv_scores_lda = cross_val_score(LDAmodel, X_train, y_train, cv=5)

# Fit on the training split, then label the held-out rows.
LDAmodel.fit(X_train, y_train)
LDApreds = LDAmodel.predict(X_test)

# Accuracies are reported as percentages.
cv_accuracy_pct = cv_scores_lda.mean() * 100
train_accuracy_pct = LDAmodel.score(X_train, y_train) * 100
test_accuracy_pct = accuracy_score(y_test, LDApreds) * 100

print(f"Cross-validation training accuracy (LDA mean): {cv_accuracy_pct:.2f}%")
print(f"Training accuracy (LDA) on full training set: {train_accuracy_pct:.2f}%")
print(f"Testing accuracy (LDA): {test_accuracy_pct:.2f}%")

# Held-out metrics with "satisfied" as the positive class.
precision, recall, f1 = (
    metric(y_test, LDApreds, pos_label="satisfied")
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision (LDA): {precision:.2f}")
print(f"Recall (LDA): {recall:.2f}")
print(f"F1 Score (LDA): {f1:.2f}")

print("Confusion Matrix (LDA):")
conf_matrix = confusion_matrix(y_test, LDApreds)
print(conf_matrix)


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Encode the true labels as integers so roc_curve can treat 1 as the positive class
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)  # 'neutral or dissatisfied' -> 0, 'satisfied' -> 1

# Predicted probability of the positive class from the fitted LDA model.
# NOTE(review): column 1 is assumed to be 'satisfied', matching the encoding above.
y_prob = LDAmodel.predict_proba(X_test)[:, 1]

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_prob)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve against the chance diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
# The title names the model actually plotted (LDA)
plt.title('ROC Curve for Linear Discriminant Analysis Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid()
plt.show()

# Quadratic Discriminant Model

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

X = airline_features_enc
y = airline_data["satisfaction"]

# NOTE: this cell does not re-split the data. It uses the X_train / X_test /
# y_train / y_test currently held by the kernel (the LDA cell's split when the
# notebook is run top to bottom).

# Quadratic Discriminant Analysis (QDA)
QDAmodel = QuadraticDiscriminantAnalysis()

# 5-fold cross-validated accuracy for QDA on the training set
cv_scores_qda = cross_val_score(QDAmodel, X_train, y_train, cv=5)

# Fitting the model on the full training set
QDAmodel.fit(X_train, y_train)

# Making predictions on the test set
QDApreds = QDAmodel.predict(X_test)

# Every figure below is computed from the QDA model and its predictions,
# so this section reports QDA rather than repeating the LDA results.
print(f"Cross-validation training accuracy (QDA mean): {cv_scores_qda.mean() * 100:.2f}%")
print(f"Training accuracy (QDA) on full training set: {QDAmodel.score(X_train, y_train) * 100:.2f}%")
print(f"Testing accuracy (QDA): {accuracy_score(y_test, QDApreds) * 100:.2f}%")

# Precision, recall and F1 with "satisfied" treated as the positive class
precision = precision_score(y_test, QDApreds, pos_label="satisfied")
recall = recall_score(y_test, QDApreds, pos_label="satisfied")
f1 = f1_score(y_test, QDApreds, pos_label="satisfied")

print(f"Precision (QDA): {precision:.2f}")
print(f"Recall (QDA): {recall:.2f}")
print(f"F1 Score (QDA): {f1:.2f}")

# Confusion matrix for QDA
print("Confusion Matrix (QDA):")
conf_matrix = confusion_matrix(y_test, QDApreds)
print(conf_matrix)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Encode the true labels as integers so roc_curve can treat 1 as the positive class
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)  # 'neutral or dissatisfied' -> 0, 'satisfied' -> 1

# Predicted probability of the positive class from the fitted QDA model.
# NOTE(review): column 1 is assumed to be 'satisfied', matching the encoding above.
y_prob = QDAmodel.predict_proba(X_test)[:, 1]

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_prob)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve against the chance diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
# The title names the model actually plotted (QDA)
plt.title('ROC Curve for Quadratic Discriminant Analysis Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid()
plt.show()

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

# Encoded feature matrix and satisfaction target for the KNN section
X, y = airline_features_enc, airline_data["satisfaction"]

In [None]:
# Split first so the held-out rows never influence the scaling statistics.
# Same test_size and seed as before, so the same rows land in each partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN is distance-based, so put every feature on a comparable scale.
# The scaler learns mean/std from the training rows only and is then applied
# unchanged to the test rows, which avoids train/test leakage.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so any cell that refers to X_scaled keeps working; it is the full
# feature matrix scaled with the training-set statistics.
X_scaled = scaler.transform(X)

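I imported the KNeighborsClassifier from sklearn, standardized the features (KNN is distance-based, so features on larger scales would otherwise dominate the distance computation), and created an 80/20 train/test split. Note that this split uses `random_state=42`, whereas the discriminant-analysis cells above use `random_state=23`, so the KNN model is not evaluated on exactly the same test rows as those models; keep this in mind when comparing scores.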

In [None]:
# Untuned KNN estimator that the hyperparameter search starts from
knn = KNeighborsClassifier()

# Candidate neighbour counts: 1 through 24
nn_list = [*range(1, 25)]
print(nn_list)

# Search space: neighbour count, vote weighting scheme and distance metric
param_grid = dict(
    n_neighbors=nn_list,
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'cosine'],
)
print(param_grid)

I initialized the KNN model and defined a parameter grid for hyperparameter tuning. The grid includes a range of `n_neighbors` from 1 to 24, and tests different `weights` ('uniform' and 'distance') and `metrics` ('euclidean', 'manhattan', 'cosine') to find the best configuration. The neighbors parameter is the most important one, as it determines how many nearest neighbors are considered for classification. A higher number of neighbors smooths the decision boundary, while a lower number makes it more complex. However, since the data is linearly separable, the model is not required to have a high number of neighbors. When looking at the weights, the uniform weights are used when all points in each neighborhood are weighted equally, while the distance weights are used when the points are weighted by the inverse of their distance. The metric parameter determines the distance metric used to find nearest neighbors. Euclidean distance is the most commonly used metric, as it is the most intuitive and straightforward. However, Manhattan distance is also a good choice, as it is more robust to outliers and is easier to understand. Cosine distance is used when the data is sparse and the angle between vectors is more important than their magnitude.

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, scored by accuracy
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Keep the winning configuration and report its mean CV score
best_knn = grid_search.best_estimator_
print(f"Best KNN Score: {grid_search.best_score_ * 100:.2f}%")
print("Best Parameters:", grid_search.best_params_)

Using GridSearchCV, I performed a thorough search over the parameter grid with 5-fold cross-validation. This process identifies the best combination of parameters for the KNN model, ensuring it is well-tuned for the dataset. As expected, the best score was found when using 1 neighbor, uniform weights and euclidean distance, as it is the simplest model and the data is linearly separable. 

In [None]:
# Initialize the KNN classifier with the chosen hyperparameters
knn = KNeighborsClassifier(
    metric="manhattan",
    n_neighbors=17,
    weights="distance",
)

# Train the model on the training data
knn.fit(X_train, y_train)

# Predict with the KNN model fitted above, so the metrics that follow
# describe KNN and not a model from another section of the notebook
y_pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Score y_pred against the held-out labels; "satisfied" is the positive class
positive_label = "satisfied"
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=positive_label)
precision = precision_score(y_test, y_pred, pos_label=positive_label)
recall = recall_score(y_test, y_pred, pos_label=positive_label)
conf_matrix = confusion_matrix(y_test, y_pred)

# Scalar metrics first, then the confusion matrix and per-class report
for caption, value in (
    ("Accuracy:", accuracy),
    ("F1 Score:", f1),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, value)
print("Confusion Matrix:\n", conf_matrix)
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, target_names=['neutral or dissatisfied', 'satisfied']),
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the true labels for roc_curve
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)  # 'neutral or dissatisfied' -> 0, 'satisfied' -> 1

# Second column of predict_proba is used as the positive-class score
y_prob = knn.predict_proba(X_test)[:, 1]

# ROC points and area under the curve
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_prob)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the chance diagonal
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
ax.set_title('ROC Curve for KNN')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
ax.grid()
plt.show()

# Clustering using K-Means

In [None]:
# Show the first rows of airline_data as a quick look at the frame used for clustering
airline_data.head()

In [None]:
# Clustering is unsupervised, so the target column is left out of the features
airline_features_clust = airline_data.drop(columns=["satisfaction"])
airline_features_clust.head()

In [None]:
import pandas as pd

# Categorical columns of airline_features_clust that need numeric encoding
categorical_columns = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

# Dummy-encode them; drop_first=True omits the first level of each category
airline_data_num = pd.get_dummies(
    airline_features_clust,
    columns=categorical_columns,
    drop_first=True,
)

# Preview the encoded frame
print(airline_data_num.head())

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Standardize the encoded features before clustering
scaler = StandardScaler()
scaled_data = scaler.fit_transform(airline_data_num)

# Within-cluster sum of squares (inertia) for k = 1..10, used for the elbow plot
k_values = range(1, 11)
wcss = [
    KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
    for k in k_values
]

In [None]:
# Elbow plot: WCSS against the number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, wcss, marker='o', linestyle='--')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('WCSS')
ax.grid()
plt.show()

In [None]:
# Silhouette score is undefined for k=1, so start from the second candidate
silhouette_ks = k_values[1:]
silhouette_scores = [
    silhouette_score(
        scaled_data,
        KMeans(n_clusters=k, random_state=42).fit(scaled_data).labels_,
    )
    for k in silhouette_ks
]

# Plot Silhouette Scores
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(silhouette_ks, silhouette_scores, marker='o', linestyle='--')
ax.set_title('Silhouette Scores')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Silhouette Score')
ax.grid()
plt.show()

In [None]:
optimal_k = 4  # Adjust this based on observed results

# Fit the final model and attach each row's cluster id to the encoded frame
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42).fit(scaled_data)
cluster_labels = kmeans_final.labels_
airline_data_num['Cluster'] = cluster_labels

In [None]:
# Every column except the last one, which is expected to be 'Cluster'
features = airline_data_num.columns[:-1]

# Grid layout: three plots per row, with enough rows for all features
n_features = len(features)
n_cols = 3
n_rows = -(-n_features // n_cols)  # ceiling division

# One axis per grid slot, flattened for simple iteration
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows))
axes = axes.flatten()

# One bar chart per feature showing its mean within each cluster
for ax, feature in zip(axes, features):
    sns.barplot(
        data=airline_data_num,
        x='Cluster',
        y=feature,
        ci=None,
        estimator=np.mean,
        ax=ax
    )
    ax.set_title(f'Mean {feature} by Cluster')
    ax.set_ylabel(f'Average {feature}')
    ax.set_xlabel('Cluster')
    ax.grid()

# Remove the grid slots that have no feature to show
for unused_ax in axes[n_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Mean of every feature within each cluster
cluster_means = airline_data_num.groupby('Cluster').mean()

# Text report: one section per cluster, one line per feature mean
separator = "-" * 40
for cluster, feature_means in cluster_means.iterrows():
    print(f"Cluster {cluster}:")
    for feature, value in feature_means.items():
        print(f"  Feature: {feature}: {value:.2f}")
    print(separator)