In [63]:
## Imports
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import pickle
warnings.simplefilter(action='ignore')

In [64]:
## Load training data
train = pd.read_csv('train_preprocessed.csv')
labels = pd.read_csv('train_labels.csv')

labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]))
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]))

train['session_id'] = train['session_id'].astype(str)
labels['session'] = labels['session'].astype(str)

In [81]:
## Initialize dictionaries to store models and best parameters for each question for each model type
stored_models = {}
best_params_dict = {}

In [82]:
## Function to train models and evaluate on validation data
def train_model(train, labels, model_name, model_type, hyperparams, random_search):
    ## Initialize vars to store model scores
    overall_acc, overall_f1 = 0, 0

    ## Initialize array to store best parameters for each question model
    best_params_list = []

    ## Print model name
    print("model:", model_name)
    
    ## Train model for each of the 18 questions
    for q_no in range(1, 19):
        if q_no <= 3:
            select = train[train['level_group'] == '0-4']
        elif q_no <= 13:
            select = train[train['level_group'] == '5-12']
        else:
            select = train[train['level_group'] == '13-22']

        ## Create train and validation split
        train_X, validate_X = train_test_split(select, test_size=0.2)
        train_y = labels.loc[(labels['session'].isin(train_X['session_id'])) & (labels['q'] == q_no)]
        validate_y = labels.loc[(labels['session'].isin(validate_X['session_id'])) & (labels['q'] == q_no)]

        train_X = train_X.drop(['session_id', 'level_group'], axis=1)
        train_y = train_y['correct']

        validate_X = validate_X.drop(['session_id', 'level_group'], axis=1)
        validate_y = validate_y['correct']

        ## Instantiate the model
        model = model_type

        ## If boolean random_search is true
        if random_search:
            ## Create the RandomizedSearchCV object
            rs = RandomizedSearchCV(model, hyperparams, n_iter=20, cv=3, scoring='f1', n_jobs=-1, verbose=0, random_state=42)

            ## Fit the RandomizedSearchCV object to the data
            rs.fit(train_X, train_y)

            ## Obtain the best scoring hyperparameters
            best_params = random_search.best_params_
            best_params_list.append(best_params)
            # print("Best hyperparameters for question", q_no, ":", best_params)

            ## Get the best estimator
            model = random_search.best_estimator_
            
        model.fit(train_X, train_y)

        if model_name in stored_models:
            stored_models[model_name].append(pickle.dumps(model))
        else:
            stored_models[model_name] = [pickle.dumps(model)]

        ## Calculate the accuracy and F1 score using cross_val_score
        question_acc = np.mean(cross_val_score(model, validate_X, validate_y, cv=5))
        question_f1 = np.mean(cross_val_score(model, validate_X, validate_y, cv=5, scoring='f1'))
        overall_acc += question_acc
        overall_f1 += question_f1
        print("Question", q_no, "accuracy:", question_acc, "f1:", question_f1)

    overall_acc /= 18
    overall_f1 /= 18
    print("Overall accuracy:", overall_acc)
    print("Overall F1 score:", overall_f1)

    ## If randomize search executed return best params else return null
    return best_params_list if random_search else None
    


In [83]:
########## XGBoost ##########

## Define hyperparameter dict
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

## Set last argument to true if want to carry out randomized search for best hyperparams
res = train_model(
    train, 
    labels, 
    'XGBoost', 
    XGBClassifier(eval_metric='error', use_label_encoder=False), 
    params, 
    False)

if res:
    best_params_dict['XGBoost'] = res

model: XGBoost
Question 1 accuracy: 0.6851244953878506 f1: 0.8082134630470378
Question 2 accuracy: 0.9789946257258197 f1: 0.9893856768072089
Question 3 accuracy: 0.9374073798893626 f1: 0.9676924707543394
Question 4 accuracy: 0.7801827298250827 f1: 0.8755394760658388
Question 5 accuracy: 0.5088077756989146 f1: 0.5723452997113391
Question 6 accuracy: 0.7651174257519369 f1: 0.8657647032268541
Question 7 accuracy: 0.6976463065655303 f1: 0.8171270315241876
Question 8 accuracy: 0.5756408264719589 f1: 0.697825399240227
Question 9 accuracy: 0.7052828642382243 f1: 0.8228891381217327
Question 10 accuracy: 0.4860987092285766 f1: 0.491729945304916
Question 11 accuracy: 0.5951647292712196 f1: 0.7200174884469124
Question 12 accuracy: 0.8574164758540412 f1: 0.9231116057437465
Question 13 accuracy: 0.6982791965831595 f1: 0.14717664908184286
Question 14 accuracy: 0.6596652504880075 f1: 0.7870382708249072
Question 15 accuracy: 0.5113490171179751 f1: 0.4834194080501323
Question 16 accuracy: 0.69361278658

In [None]:
########## RandomForest ##########

## Define hyperparameter dict
params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(1, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'criterion': ['gini', 'entropy']
}

## Set last argument to true if want to carry out randomized search for best hyperparams
res = train_model(
    train, 
    labels, 
    'RandomForest', 
    RandomForestClassifier(), 
    params, 
    False)

if res:
    best_params_dict['RandomForest'] = res

In [None]:
########## CatBoost ##########

## Load alternate train data for catboost
cat_train = pd.read_csv('CatBoostData.csv')

# Specify the categorical columns to be used by the CatBoost model
cat_features = ['event_name_mode', 'fqid_mode', 'room_fqid_mode', 'text_mode']

## Define hyperparameter dict
params = {
    'iterations': [100, 300, 500, 700],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'depth': [3, 4, 6, 8],
    'verbose': [0]
}

## Set last argument to true if want to carry out randomized search for best hyperparams
res = train_model(
    cat_train, 
    labels, 
    'CatBoost', 
    CatBoostClassifier(cat_features=cat_features), 
    params, 
    False)

## Add best hyperparameters to dictionary
if res:
    best_params_dict['CatBoost'] = res

In [None]:
########## LightGBM ##########

## Define hyperparameter dict
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 6, 10],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

## Set last argument to true if want to carry out randomized search for best hyperparams
res = train_model(
    train, 
    labels, 
    'LightGBM', 
    LGBMClassifier(), 
    params, 
    False)

if res:
    best_params_dict['LightGBM'] = res

In [26]:
## Train ensemble with final estimators
final_estimators = [LogisticRegression(), DecisionTreeClassifier(max_depth=7)]

for final_estimator in final_estimators:

    ## Print the final estimator being used for the ensemble
    print("Final estimator:", str(final_estimator))

    ## Initialize vars to store model scores
    overall_acc, overall_f1 = 0, 0

    ## Train model for each of the 18 questions
    for q_no in range(1, 19):
        if q_no <= 3:
            select = train[train['level_group'] == '0-4']
            cat_select = cat_train[cat_train['level_group'] == '0-4']
        elif q_no <= 13:
            select = train[train['level_group'] == '5-12']
            cat_select = cat_train[cat_train['level_group'] == '5-12']
        else:
            select = train[train['level_group'] == '13-22']
            cat_select = cat_train[cat_train['level_group'] == '13-22']

        ## Create train and validation splits
        train_X, validate_X = train_test_split(select, test_size=0.2)
        cat_train_X, cat_validate_X = train_test_split(cat_select, test_size=0.2)

        train_y = labels.loc[(labels['session'].isin(train_X['session_id'])) & (labels['q'] == q_no)]
        validate_y = labels.loc[(labels['session'].isin(validate_X['session_id'])) & (labels['q'] == q_no)]

        train_X = train_X.drop(['session_id', 'level_group'], axis=1)
        cat_train_X = cat_train_X.drop(['session_id', 'level_group'], axis=1)
        train_y = train_y['correct']

        validate_X = validate_X.drop(['session_id', 'level_group'], axis=1)
        cat_validate_X = cat_validate_X.drop(['session_id', 'level_group'], axis=1)
        validate_y = validate_y['correct']

        # Train the four models with the optimized hyperparameters
        xgb_model = XGBClassifier(best_params_dict['XGBoost'][q_no - 1]).fit(train_X, train_y)
        rf_model = RandomForestClassifier(best_params_dict['RandomForest'][q_no - 1]).fit(train_X, train_y)
        cat_model = CatBoostClassifier(best_params_dict['CatBoost'][q_no - 1], cat_features=cat_features).fit(cat_train_X, train_y)
        lgbm_model = LGBMClassifier(best_params_dict['LightGBM'][q_no - 1]).fit(train_X, train_y)

        ## Obtain the predicted probabilities for each class for each model
        rf_pred = rf_model.predict_proba(validate_X)[:, 1]
        xgb_pred = xgb_model.predict_proba(validate_X)[:, 1]
        cat_pred = cat_model.predict_proba(cat_validate_X)[:, 1]
        lgbm_pred = lgbm_model.predict_proba(validate_X)[:, 1]

        ## Stack the predictions
        stacked_predictions = np.column_stack((xgb_pred, rf_pred, cat_pred, lgbm_pred))

        ## Train the final_estimator
        final_estimator.fit(stacked_predictions, validate_y)

        ## Use the meta-learner to make the final prediction
        ensemble_pred = final_estimator.predict(stacked_predictions)

        ## Calculate the accuracy and F1 score using cross_val_score
        question_acc = np.mean(ensemble_pred == validate_y)
        question_f1 = f1_score(validate_y, ensemble_pred)
        overall_acc += question_acc
        overall_f1 += question_f1
        print("Question", q_no, "accuracy:", question_acc, "f1:", question_f1)

    print("Overall accuracy:", overall_acc)
    print("Overall F1 score:", overall_f1)

    

LogisticRegression()


NameError: name 'cat_train' is not defined

In [137]:
test = pd.read_csv('test_preprocessed.csv')


In [138]:
preds = pd.DataFrame()

def append_qnum(col, qnum):
    return str(col) + '_q' + str(qnum)

for q_no in range(1, 19):
    if q_no <= 3:
        select = test[test['level_group'] == '0-4']
    elif q_no <= 13:
        select = test[test['level_group'] == '5-12']
    else:
        select = test[test['level_group'] == '13-22']

    session_id = select['session_id'].apply(append_qnum, qnum=q_no).tolist()
    select = select.drop(['session_id','level_group'], axis=1)

    model = pickle.loads(stored_models['XGBoost'][q_no-1])
    pred = model.predict(select)
    
    df = pd.DataFrame({'session_id': session_id, 'correct': pred})
    preds = pd.concat([preds, df], ignore_index=True)

In [131]:
preds
preds.to_csv('submission.csv', index=False)