In [3]:
## Imports
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import pickle
warnings.simplefilter(action='ignore')

In [2]:
## Load training data
train = pd.read_csv('train_preprocessed.csv')
labels = pd.read_csv('train_labels.csv')

train['session_id'] = train['session_id'].astype(str)
labels['session'] = labels['session'].astype(str)

In [None]:
## Initialize dictionaries to store models and best parameters for each question for each model type
stored_models = {}
best_params_dict = {}

In [None]:
## Function to train models and evaluate on validation data
def train_model(train, labels, model_name, model_type, hyperparams, random_search):
    ## Initialize vars to store model scores
    overall_acc, overall_f1 = 0, 0

    ## Initialize array to store best parameters for each question model
    best_params_list = []

    ## Print model name
    # print("model:", model_name)
    
    ## Train model for each of the 18 questions
    for q_no in range(1, 19):
        if q_no <= 3:
            select = train[train['level_group'] == '0-4']
        elif q_no <= 13:
            select = train[train['level_group'] == '5-12']
        else:
            select = train[train['level_group'] == '13-22']

        ## Create train and validation split
        train_X, validate_X = train_test_split(select, test_size=0.2)
        train_y = labels.loc[(labels['session'].isin(train_X['session_id'])) & (labels['q'] == q_no)]
        validate_y = labels.loc[(labels['session'].isin(validate_X['session_id'])) & (labels['q'] == q_no)]

        train_X = train_X.drop(['session_id', 'level_group'], axis=1)
        train_y = train_y['correct']

        validate_X = validate_X.drop(['session_id', 'level_group'], axis=1)
        validate_y = validate_y['correct']

        ## Instantiate the model
        model = model_type

        ## If boolean random_search is true
        if random_search:
            ## Create the RandomizedSearchCV object
            rs = RandomizedSearchCV(model, hyperparams, n_iter=20, cv=3, scoring='f1', n_jobs=-1, verbose=0, random_state=42)

            ## Fit the RandomizedSearchCV object to the data
            rs.fit(train_X, train_y)

            ## Obtain the best scoring hyperparameters
            best_params = rs.best_params_
            best_params_list.append(best_params)

            ## Uncomment if you want to see the best parameters for each question
            print("Best hyperparameters for question", q_no, ":", best_params)

            ## Get the best estimator
            model = rs.best_estimator_
            
        model.fit(train_X, train_y)

        if model_name in stored_models:
            stored_models[model_name].append(pickle.dumps(model))
        else:
            stored_models[model_name] = [pickle.dumps(model)]

        ## Calculate the accuracy and F1 score using cross_val_score
        question_acc = np.mean(cross_val_score(model, validate_X, validate_y, cv=5))
        question_f1 = np.mean(cross_val_score(model, validate_X, validate_y, cv=5, scoring='f1'))
        overall_acc += question_acc
        overall_f1 += question_f1
        print("Question", q_no, "accuracy:", question_acc, "F1 score:", question_f1)

    overall_acc /= 18
    overall_f1 /= 18
    print("Overall accuracy:", overall_acc)
    print("Overall F1 score:", overall_f1)

    ## If randomize search executed return best params else return null
    return best_params_list if random_search else None

In [21]:
########## XGBoost ##########

## Define hyperparameter dict
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

## Set last argument to true if want to carry out randomized search for best hyperparams
res = train_model(
    train, 
    labels, 
    'XGBoost', 
    XGBClassifier(eval_metric='error', use_label_encoder=False), 
    params, 
    True)

if res:
    best_params_dict['XGBoost'] = res

Best hyperparameters for question 1: {'subsample': 1, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 1}
Question 1 accuracy: 0.7273500347853105 f1: 0.8420765776953882
Best hyperparameters for question 2: {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}
Question 2 accuracy: 0.980267610485576 f1: 0.9900354266090516
Best hyperparameters for question 3: {'subsample': 1, 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 1}
Question 3 accuracy: 0.9359220809045532 f1: 0.9669005007379587
Best hyperparameters for question 4: {'subsample': 0.8, 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 1}
Question 4 accuracy: 0.7901545188257199 f1: 0.8827222850315637
Best hyperparameters for question 5: {'subsample

In [24]:
########## RandomForest ##########

## Define hyperparameter dict
params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(1, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'criterion': ['gini', 'entropy']
}

## Set last argument to true if want to carry out randomized search for best hyperparams
res = train_model(
    train, 
    labels, 
    'RandomForest', 
    RandomForestClassifier(), 
    params, 
    True)

if res:
    best_params_dict['RandomForest'] = res

Best hyperparameters for question 1: {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 12, 'min_samples_split': 8, 'n_estimators': 237}
Question 1 accuracy: 0.7235308553584012 f1: 0.8395913603637742
Best hyperparameters for question 2: {'criterion': 'entropy', 'max_depth': 37, 'min_samples_leaf': 11, 'min_samples_split': 12, 'n_estimators': 312}
Question 2 accuracy: 0.9779335048958353 f1: 0.9888436167402521
Best hyperparameters for question 3: {'criterion': 'gini', 'max_depth': 29, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 135}
Question 3 accuracy: 0.9310417806476596 f1: 0.9642896281516811
Best hyperparameters for question 4: {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 351}
Question 4 accuracy: 0.8016127325493694 f1: 0.8900139804255083
Best hyperparameters for question 5: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 19, 'min_samples_split': 7, 'n_estimators': 331}
Question 5 accura

In [1]:
########## CatBoost ##########

## Load alternate train data for catboost
cat_train = pd.read_csv('cat_train_preprocessed.csv')
cat_train['session_id'] = cat_train['session_id'].astype(str)

# Specify the categorical columns to be used by the CatBoost model
cat_features = ['event_name_mode', 'fqid_mode', 'room_fqid_mode', 'text_mode']

## Define hyperparameter dict
params = {
    'iterations': [100, 300, 500, 700],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'depth': [3, 4, 6, 8],
    'verbose': [0]
}

## Set last argument to true if want to carry out randomized search for best hyperparams
res = train_model(
    cat_train, 
    labels, 
    'CatBoost', 
    CatBoostClassifier(cat_features=cat_features, verbose=0), 
    params, 
    True)

## Add best hyperparameters to dictionary
if res:
    best_params_dict['CatBoost'] = res

Best hyperparameters for question 1: {'verbose': 0, 'learning_rate': 0.01, 'iterations': 300, 'depth': 3}
Question 1 accuracy: 0.7315933923670446 f1: 0.8449944300999128
Best hyperparameters for question 2: {'verbose': 0, 'learning_rate': 0.05, 'iterations': 100, 'depth': 8}
Question 2 accuracy: 0.978357908198301 f1: 0.9890605103363157
Best hyperparameters for question 3: {'verbose': 0, 'learning_rate': 0.01, 'iterations': 500, 'depth': 8}
Question 3 accuracy: 0.932739393857522 f1: 0.9651992717529035
Best hyperparameters for question 4: {'verbose': 0, 'learning_rate': 0.01, 'iterations': 300, 'depth': 6}
Question 4 accuracy: 0.8026734030840725 f1: 0.8905366867595094
Best hyperparameters for question 5: {'verbose': 0, 'learning_rate': 0.01, 'iterations': 500, 'depth': 4}
Question 5 accuracy: 0.5503941209448097 f1: 0.7075917013795741
Best hyperparameters for question 6: {'verbose': 0, 'learning_rate': 0.01, 'iterations': 100, 'depth': 4}
Question 6 accuracy: 0.7727562349010363 f1: 0.87181

In [4]:
########## LightGBM ##########

## Define hyperparameter dict
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 6, 10],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

## Set last argument to true if want to carry out randomized search for best hyperparams
res = train_model(
    train, 
    labels, 
    'LightGBM', 
    LGBMClassifier(), 
    params, 
    True)

if res:
    best_params_dict['LightGBM'] = res

Best hyperparameters for question 1: {'subsample': 0.8, 'n_estimators': 200, 'min_child_samples': 20, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 1}
Question 1 accuracy: 0.72501592919557 f1: 0.8404331703415556
Best hyperparameters for question 2: {'subsample': 1, 'n_estimators': 500, 'min_child_samples': 10, 'max_depth': 10, 'learning_rate': 0.2, 'colsample_bytree': 0.8}
Question 2 accuracy: 0.9794188038806448 f1: 0.9896023367008606
Best hyperparameters for question 3: {'subsample': 0.8, 'n_estimators': 200, 'min_child_samples': 30, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 1}
Question 3 accuracy: 0.9374071547417218 f1: 0.967692466878107
Best hyperparameters for question 4: {'subsample': 1, 'n_estimators': 100, 'min_child_samples': 10, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 1}
Question 4 accuracy: 0.802037135851835 f1: 0.8901188324594734
Best hyperparameters for question 5: {'subsample': 0.8, 'n_estimators': 200, 'min_child_samples': 

In [19]:
## Train ensemble with final estimator
final_estimator = LogisticRegression()

## Initialize vars to store model scores
overall_acc, overall_f1 = 0, 0

## Train model for each of the 18 questions
for q_no in range(1, 19):
    if q_no <= 3:
        select = train[train['level_group'] == '0-4']
        cat_select = cat_train[cat_train['level_group'] == '0-4']
    elif q_no <= 13:
        select = train[train['level_group'] == '5-12']
        cat_select = cat_train[cat_train['level_group'] == '5-12']
    else:
        select = train[train['level_group'] == '13-22']
        cat_select = cat_train[cat_train['level_group'] == '13-22']

    ## Create train and validation splits
    train_X, validate_X = train_test_split(select, test_size=0.2)
    cat_train_X, cat_validate_X = train_test_split(cat_select, test_size=0.2)

    train_y = labels.loc[(labels['session'].isin(train_X['session_id'])) & (labels['q'] == q_no)]
    validate_y = labels.loc[(labels['session'].isin(validate_X['session_id'])) & (labels['q'] == q_no)]

    train_X = train_X.drop(['session_id', 'level_group'], axis=1)
    cat_train_X = cat_train_X.drop(['session_id', 'level_group'], axis=1)
    train_y = train_y['correct']

    validate_X = validate_X.drop(['session_id', 'level_group'], axis=1)
    cat_validate_X = cat_validate_X.drop(['session_id', 'level_group'], axis=1)
    validate_y = validate_y['correct']

    ## Train the four models with the optimized hyperparameters
    xgb_model = XGBClassifier(**best_params_dict['XGBoost'][q_no - 1]).fit(train_X, train_y)
    rf_model = RandomForestClassifier(**best_params_dict['RandomForest'][q_no - 1]).fit(train_X, train_y)
    cat_model = CatBoostClassifier(**best_params_dict['CatBoost'][q_no - 1], cat_features=cat_features).fit(cat_train_X, train_y)
    lgbm_model = LGBMClassifier(**best_params_dict['LightGBM'][q_no - 1]).fit(train_X, train_y)

    ## Obtain the predicted probabilities for each class for each model
    rf_pred = rf_model.predict_proba(validate_X)[:, 1]
    xgb_pred = xgb_model.predict_proba(validate_X)[:, 1]
    cat_pred = cat_model.predict_proba(cat_validate_X)[:, 1]
    lgbm_pred = lgbm_model.predict_proba(validate_X)[:, 1]

    ## Stack the predictions
    stacked_predictions = np.column_stack((xgb_pred, rf_pred, cat_pred, lgbm_pred))

    ## Train the final_estimator
    final_estimator.fit(stacked_predictions, validate_y)

    ## Use the final_estimator to make the final prediction
    ensemble_pred = final_estimator.predict(stacked_predictions)
    
    model_name = 'ensemble_' + str(final_estimator)
    if model_name in stored_models:
        stored_models[model_name].append(pickle.dumps(final_estimator))
    else:
        stored_models[model_name] = [pickle.dumps(final_estimator)]

    ## Calculate the accuracy and F1 score using cross_val_score
    question_acc = np.mean(ensemble_pred == validate_y)
    question_f1 = f1_score(validate_y, ensemble_pred)
    overall_acc += question_acc
    overall_f1 += question_f1
    print("Question", q_no, "accuracy:", question_acc, "F1 score:", question_f1)

    print("Overall accuracy:", overall_acc)
    print("Overall F1 score:", overall_f1)

Question 1 accuracy: 0.7254402715892213 F1 score: 0.8408755533694049
Question 2 accuracy: 0.9813282410354339 F1 score: 0.990576140501178
Question 3 accuracy: 0.9329514109908763 F1 score: 0.9653128430296377
Question 4 accuracy: 0.7937619350732018 F1 score: 0.8850248403122781
Question 5 accuracy: 0.5461489497135582 F1 score: 0.7064635652531907
Question 6 accuracy: 0.7721196690006366 F1 score: 0.8714080459770115
Question 7 accuracy: 0.7307447485677913 F1 score: 0.8444280985656492
Question 8 accuracy: 0.61001485253554 F1 score: 0.7577754348972061
Question 9 accuracy: 0.7362614046255039 F1 score: 0.8480997189294879
Question 10 accuracy: 0.5077445363887121 F1 score: 0.5387673956262427
Question 11 accuracy: 0.6397199236155315 F1 score: 0.7802795031055901
Question 12 accuracy: 0.862295777636325 F1 score: 0.9260567392047396
Question 13 accuracy: 0.7383831954169319 F1 score: 0.0
Question 14 accuracy: 0.7158922130277955 F1 score: 0.834425621367627
Question 15 accuracy: 0.5138977296838532 F1 score

In [23]:
## Train ensemble with final estimator
final_estimator = DecisionTreeClassifier(max_depth=7)

## Initialize vars to store model scores
overall_acc, overall_f1 = 0, 0

## Train model for each of the 18 questions
for q_no in range(1, 19):
    if q_no <= 3:
        select = train[train['level_group'] == '0-4']
        cat_select = cat_train[cat_train['level_group'] == '0-4']
    elif q_no <= 13:
        select = train[train['level_group'] == '5-12']
        cat_select = cat_train[cat_train['level_group'] == '5-12']
    else:
        select = train[train['level_group'] == '13-22']
        cat_select = cat_train[cat_train['level_group'] == '13-22']

    ## Create train and validation splits
    train_X, validate_X = train_test_split(select, test_size=0.2)
    cat_train_X, cat_validate_X = train_test_split(cat_select, test_size=0.2)

    train_y = labels.loc[(labels['session'].isin(train_X['session_id'])) & (labels['q'] == q_no)]
    validate_y = labels.loc[(labels['session'].isin(validate_X['session_id'])) & (labels['q'] == q_no)]

    train_X = train_X.drop(['session_id', 'level_group'], axis=1)
    cat_train_X = cat_train_X.drop(['session_id', 'level_group'], axis=1)
    train_y = train_y['correct']

    validate_X = validate_X.drop(['session_id', 'level_group'], axis=1)
    cat_validate_X = cat_validate_X.drop(['session_id', 'level_group'], axis=1)
    validate_y = validate_y['correct']

    ## Train the four models with the optimized hyperparameters
    xgb_model = XGBClassifier(**best_params_dict['XGBoost'][q_no - 1]).fit(train_X, train_y)
    rf_model = RandomForestClassifier(**best_params_dict['RandomForest'][q_no - 1]).fit(train_X, train_y)
    cat_model = CatBoostClassifier(**best_params_dict['CatBoost'][q_no - 1], cat_features=cat_features).fit(cat_train_X, train_y)
    lgbm_model = LGBMClassifier(**best_params_dict['LightGBM'][q_no - 1]).fit(train_X, train_y)

    ## Obtain the predicted probabilities for each class for each model
    rf_pred = rf_model.predict_proba(validate_X)[:, 1]
    xgb_pred = xgb_model.predict_proba(validate_X)[:, 1]
    cat_pred = cat_model.predict_proba(cat_validate_X)[:, 1]
    lgbm_pred = lgbm_model.predict_proba(validate_X)[:, 1]

    ## Stack the predictions
    stacked_predictions = np.column_stack((xgb_pred, rf_pred, cat_pred, lgbm_pred))

    ## Train the final_estimator
    final_estimator.fit(stacked_predictions, validate_y)

    ## Use the final_estimator to make the final prediction
    ensemble_pred = final_estimator.predict(stacked_predictions)
    
    model_name = 'ensemble_' + str(final_estimator)
    if model_name in stored_models:
        stored_models[model_name].append(pickle.dumps(final_estimator))
    else:
        stored_models[model_name] = [pickle.dumps(final_estimator)]

    ## Calculate the accuracy and F1 score using cross_val_score
    question_acc = np.mean(ensemble_pred == validate_y)
    question_f1 = f1_score(validate_y, ensemble_pred)
    overall_acc += question_acc
    overall_f1 += question_f1
    print("Question", q_no, "accuracy:", question_acc, "F1 score:", question_f1)

    print("Overall accuracy:", overall_acc)
    print("Overall F1 score:", overall_f1)

Question 1 accuracy: 0.7504774029280713 F1 score: 0.8535491905354919
Question 2 accuracy: 0.9838743899851474 F1 score: 0.9918507398670383
Question 3 accuracy: 0.9352853808614471 F1 score: 0.966398589842459
Question 4 accuracy: 0.8014003819223424 F1 score: 0.8882521489971347
Question 5 accuracy: 0.571398260131551 F1 score: 0.7083453652902107
Question 6 accuracy: 0.7829408020369192 F1 score: 0.8775583482944345
Question 7 accuracy: 0.7555697008274984 F1 score: 0.8573551263001485
Question 8 accuracy: 0.6437513261192447 F1 score: 0.7768770764119601
Question 9 accuracy: 0.7523870146403565 F1 score: 0.8535575354498683
Question 10 accuracy: 0.5465733078718439 F1 score: 0.4078692158492657
Question 11 accuracy: 0.6643326967960959 F1 score: 0.7896276595744681
Question 12 accuracy: 0.8703585826437513 F1 score: 0.9300515168860906
Question 13 accuracy: 0.7407171652875026 F1 score: 0.11449275362318839
Question 14 accuracy: 0.728835136855506 F1 score: 0.8375699034062023
Question 15 accuracy: 0.5690642