In [1]:
import pandas as pd

# Read the Invistico Airline dataset
df = pd.read_csv('invistico_airline.csv')


In [2]:
# Convert categorical variables to binary using get_dummies
df = pd.get_dummies(df, columns=['satisfaction', 'Customer Type', 'Type of Travel', 'Class'], drop_first=True)


In [4]:
# Convert boolean columns to integers (True -> 1, False -> 0)
boolean_columns = df.select_dtypes(include=['bool']).columns
df[boolean_columns] = df[boolean_columns].astype(int)


In [5]:
df.head()

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction_satisfied,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus
0,65,265,0,0,0,2,2,4,2,3,...,5,3,2,0,0.0,1,0,1,1,0
1,47,2464,0,0,0,3,0,2,2,3,...,2,3,2,310,305.0,1,0,1,0,0
2,15,2138,0,0,0,3,2,0,2,2,...,4,4,2,0,0.0,1,0,1,1,0
3,60,623,0,0,0,3,3,4,3,1,...,4,1,3,0,0.0,1,0,1,1,0
4,70,354,0,0,0,3,4,3,4,2,...,4,2,5,0,0.0,1,0,1,1,0


In [9]:
#drop missing values
df = df.dropna()


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Prepare feature matrix X and target vector y
X = df.drop('satisfaction_satisfied', axis=1)
y = df['satisfaction_satisfied']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the logistic regression model
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

# Print model accuracy on the test set
print("Test Set Accuracy:", model.score(X_test, y_test))




Test Set Accuracy: 0.8264730867248437


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
predictions = model.predict(X_test)

# Create confusion matrix using pandas crosstab
conf_mat = pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'])
print("Confusion Matrix:")
print(conf_mat)

# Extract values from the confusion matrix
TP = conf_mat.loc[1,1] if (1 in conf_mat.index and 1 in conf_mat.columns) else 0
FP = conf_mat.loc[0,1] if (0 in conf_mat.index and 1 in conf_mat.columns) else 0
FN = conf_mat.loc[1,0] if (1 in conf_mat.index and 0 in conf_mat.columns) else 0
TN = conf_mat.loc[0,0] if (0 in conf_mat.index and 0 in conf_mat.columns) else 0

# Compute metrics
accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP+TN+FP+FN) else 0
precision = TP / (TP + FP) if (TP+FP) else 0
recall = TP / (TP + FN) if (TP+FN) else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision+recall) else 0

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")


Confusion Matrix:
Predicted     0      1
Actual                
0          9500   2321
1          2173  11904
Accuracy: 0.8264730867248437
Precision: 0.8368365553602812
Recall: 0.8456347233075229
F1 Score: 0.8412126351494594


In [16]:
# INSERT_YOUR_CODE
from xgboost import XGBClassifier
# Create and fit an XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
predictions_xgb = xgb_model.predict(X_test)

# Create confusion matrix
conf_mat_xgb = pd.crosstab(y_test, predictions_xgb, rownames=['Actual'], colnames=['Predicted'])
print("XGBoost Confusion Matrix:")
print(conf_mat_xgb)

# Extract values from the confusion matrix
TP_xgb = conf_mat_xgb.loc[1,1] if (1 in conf_mat_xgb.index and 1 in conf_mat_xgb.columns) else 0
FP_xgb = conf_mat_xgb.loc[0,1] if (0 in conf_mat_xgb.index and 1 in conf_mat_xgb.columns) else 0
FN_xgb = conf_mat_xgb.loc[1,0] if (1 in conf_mat_xgb.index and 0 in conf_mat_xgb.columns) else 0
TN_xgb = conf_mat_xgb.loc[0,0] if (0 in conf_mat_xgb.index and 0 in conf_mat_xgb.columns) else 0

# Compute metrics
accuracy_xgb = (TP_xgb + TN_xgb) / (TP_xgb + TN_xgb + FP_xgb + FN_xgb) if (TP_xgb + TN_xgb + FP_xgb + FN_xgb) else 0
precision_xgb = TP_xgb / (TP_xgb + FP_xgb) if (TP_xgb + FP_xgb) else 0
recall_xgb = TP_xgb / (TP_xgb + FN_xgb) if (TP_xgb + FN_xgb) else 0
f1_score_xgb = 2 * (precision_xgb * recall_xgb) / (precision_xgb + recall_xgb) if (precision_xgb + recall_xgb) else 0

# Print the results
print(f"XGBoost Accuracy: {accuracy_xgb}")
print(f"XGBoost Precision: {precision_xgb}")
print(f"XGBoost Recall: {recall_xgb}")
print(f"XGBoost F1 Score: {f1_score_xgb}")



XGBoost Confusion Matrix:
Predicted      0      1
Actual                 
0          11400    421
1            686  13391
XGBoost Accuracy: 0.9572553865163334
XGBoost Precision: 0.9695192586156965
XGBoost Recall: 0.9512680258577823
XGBoost F1 Score: 0.9603069310480835


Parameters: { "use_label_encoder" } are not used.

