### Model Training

In [1]:
import pandas as pd

In [2]:
# Read the training data from a CSV file resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="data_training.csv")

### XGBoost

In [3]:
# Use the %pip magic rather than !pip: %pip installs into the environment of the
# running kernel, while !pip runs whichever pip is first on the shell PATH.
%pip install -q xgboost

You should consider upgrading via the '/Users/asa/Desktop/ITDS/venv/bin/python -m pip install --upgrade pip' command.

In [4]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [5]:
# Features: every column except the target.
X = df.drop(columns='Recommended')

# Target: encode the 'yes' / 'no' labels as 1 / 0.
y = df["Recommended"].map({'yes': 1, 'no': 0})

# Series.map leaves NaN for any value that is not a key of the mapping, so stop
# here with a clear message instead of handing NaN labels to the training step.
if y.isna().any():
    raise ValueError(
        "Column 'Recommended' contains values other than 'yes'/'no': "
        f"{df.loc[y.isna(), 'Recommended'].unique().tolist()}"
    )

In [6]:
# Hold out 5% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [7]:
# Column names in training order; reused below as the DMatrix feature names.
cols = X_train.columns.tolist()

cols

['AirlineName',
 'CabinType',
 'EntertainmentRating',
 'FoodRating',
 'GroundServiceRating',
 'OriginCountry',
 'OverallScore',
 'SeatComfortRating',
 'ServiceRating',
 'ValueRating',
 'WifiRating',
 'Day',
 'Month',
 'Year']

In [8]:
# Booster configuration passed to xgb.train.
params = {
    # Learning task: binary classification, evaluated with log loss.
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    # Boosting step size and tree shape.
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 1,
    # Row / column sampling applied when growing each tree.
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # Split penalty and L2 / L1 weight regularization.
    "gamma": 0,
    "lambda": 1,
    "alpha": 0,
}

In [9]:
# Number of boosting rounds to run.
num_round = 100

# Wrap both splits in DMatrix objects, naming the features explicitly.
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=cols)
dtest = xgb.DMatrix(data=X_test, label=y_test, feature_names=cols)

# Fit the booster on the training split only.
model = xgb.train(params, dtrain, num_boost_round=num_round)

In [10]:
# Persist the trained booster; run() reads this same file back.
model.save_model(fname="model.json")

In [11]:
# Model output for the held-out split, one value per row.
y_pred_prob = model.predict(dtest)
# Label a row 1 only when its value is strictly above 0.5.
y_pred = [int(p > 0.5) for p in y_pred_prob]

In [12]:
# Each entry: (label to print, metric function, predictions it is scored on).
# ROC AUC is computed from the raw model output; the others use the 0/1 labels.
for label, metric, predicted in (
    ("Accuracy:", accuracy_score, y_pred),
    ("Precision:", precision_score, y_pred),
    ("Recall:", recall_score, y_pred),
    ("F1 Score:", f1_score, y_pred),
    ("ROC AUC Score:", roc_auc_score, y_pred_prob),
):
    print(label, metric(y_test, predicted))

Accuracy: 0.9574537718867616
Precision: 0.9529644268774704
Recall: 0.9447492163009404
F1 Score: 0.9488390397481307
ROC AUC Score: 0.9917296517901104


In [13]:
import xgboost as xgb
import pandas as pd


def run(features, threshold=0.5, model_path="model.json") -> int:
    """Predict a binary label for a single observation with a saved booster.

    Parameters
    ----------
    features : mapping
        Feature name -> value for one row. Key order does not matter: when the
        saved model carries feature names, the columns are rearranged to match
        them before prediction.
    threshold : float, default 0.5
        The model output is mapped to 1 when it is ``>= threshold``, else 0.
    model_path : str, default "model.json"
        Path of the saved XGBoost model to load.

    Returns
    -------
    int
        1 or 0.

    Raises
    ------
    ValueError
        If the saved model carries feature names and the keys of ``features``
        do not match them (missing or unexpected names).
    """
    # Load the model (re-read on every call, so a re-saved file is picked up).
    loaded_model = xgb.Booster()
    loaded_model.load_model(model_path)

    # One-row frame: the mapping's keys become the column names.
    input_df = pd.DataFrame([features])

    # Without this alignment the column order would follow the dict's key order,
    # and a correctly-keyed dict in a different order would be rejected.
    expected = loaded_model.feature_names
    if expected is not None:
        missing = [name for name in expected if name not in input_df.columns]
        unexpected = [name for name in input_df.columns if name not in expected]
        if missing or unexpected:
            raise ValueError(
                "features do not match the model's feature names "
                f"(missing: {missing}, unexpected: {unexpected})"
            )
        input_df = input_df[list(expected)]

    # Convert the DataFrame to DMatrix
    data_dmatrix = xgb.DMatrix(input_df, feature_names=list(input_df.columns))

    # Single-row input, so only the first prediction is relevant.
    score = loaded_model.predict(data_dmatrix)[0]

    # Convert the model output to a binary outcome.
    return 1 if score >= threshold else 0

In [15]:
# Take one held-out row (position 2285) as a plain dict, together with its true label.
sample, sample_y = X_test.iloc[2285].to_dict(), y_test.iloc[2285]

# Show the prediction from run() next to the true label.
(run(sample), sample_y)

(1, 1)

In [17]:
# Display the feature dict that was passed to run() above.
sample

{'AirlineName': 5.742218139656313,
 'CabinType': 0.0,
 'EntertainmentRating': 0.0,
 'FoodRating': 3.0,
 'GroundServiceRating': 0.0,
 'OriginCountry': 5.367744393124137,
 'OverallScore': 9.0,
 'SeatComfortRating': 0.0,
 'ServiceRating': 0.0,
 'ValueRating': 4.0,
 'WifiRating': 0.0,
 'Day': 30.0,
 'Month': 11.0,
 'Year': 2017.0}