<a href="https://colab.research.google.com/github/rowanmacy/text-classifier/blob/main/Problem_2_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [72]:
# Import libraries
import pandas as pd
import numpy as np
import os
import textwrap

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8')

import seaborn as sns
sns.set('notebook', font_scale=1.25, style='whitegrid')

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve

from scipy.stats import uniform

In [73]:
# TODOs

In [74]:
# Clone repo
# Remove any previous checkout first so that re-running this cell starts from a clean copy.
# NOTE(review): `git clone` writes into the current working directory, while the `rm`
# targets /content/text-classifier -- this assumes the notebook runs from /content
# (the Colab default); confirm if running elsewhere.
!rm -rf /content/text-classifier  # clear any existing clone
!git clone "https://github.com/rowanmacy/text-classifier/"  # clone repo

Cloning into 'text-classifier'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 72 (delta 17), reused 30 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (72/72), 31.50 MiB | 13.52 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [75]:
# Check repo contents
# List the reading-level data directory to confirm the files read by the loading cell
# (BERT embedding .npz archives and y_train.csv) are present in the clone.
!ls /content/text-classifier/starter_code/data_readinglevel/

x_test_BERT_embeddings.npz  x_train_BERT_embeddings.npz  y_train.csv
x_test.csv		    x_train.csv


## Read in data

In [76]:
# Read in training data
# np.load on an .npz archive returns a lazily-read NpzFile that holds an open file
# handle; use it as a context manager so the handle is closed once the array has
# been copied into memory with np.array(...).
with np.load("/content/text-classifier/starter_code/data_readinglevel/x_train_BERT_embeddings.npz") as train_npz:
    x_dev = np.array(train_npz['arr_0'])

# Labels for the development set (converted to a binary target in a later cell)
y_dev = pd.read_csv("/content/text-classifier/starter_code/data_readinglevel/y_train.csv")

# Held-out test embeddings (no labels are loaded for these)
with np.load("/content/text-classifier/starter_code/data_readinglevel/x_test_BERT_embeddings.npz") as test_npz:
    x_test = np.array(test_npz['arr_0'])

In [77]:
# Convert coarse label into binary classification:
# 1 when the coarse label is 'Key Stage 4-5', 0 for every other label.
# The resulting Series is named 'Class' and replaces the label DataFrame.
is_upper_stage = y_dev['Coarse Label'] == 'Key Stage 4-5'
y_dev = is_upper_stage.astype(int).rename('Class')

## Pre-defined split for proxy test set (validation)

In [78]:
# Hold out 20% of the development data as a validation (proxy test) set.
# The fixed random_state keeps the split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    x_dev,
    y_dev,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## Set up pipeline

In [79]:
# Define pipeline for the classifier.
# The inputs are precomputed BERT embeddings, so there is no preprocessing step;
# the pipeline wrapper keeps the `problem_2_model__<param>` naming used by the
# hyperparameter search.
# The default solver ('lbfgs') does not support penalty='l1', so an l1-capable
# solver is set explicitly to keep the pipeline fittable on its own. Both `penalty`
# and `solver` are overridden by the search, so its results are unaffected.
problem_1_pipeline = Pipeline([
     ('problem_2_model', LogisticRegression(max_iter=500, random_state=101, penalty='l1', solver='liblinear')),
])

## Create RandomizedSearchCV

In [80]:
# Define hyperparameter search space and scoring metric.
#
# The space is a list of sub-spaces rather than one flat dict because not every
# solver supports every penalty: 'lbfgs' only handles 'l2', while 'saga' and
# 'liblinear' handle both 'l1' and 'l2'. A flat dict lets the sampler draw
# ('lbfgs', 'l1'), which makes LogisticRegression raise and -- with
# error_score='raise' -- aborts the whole search.

# Inverse regularisation strength: 1000 log-spaced values from 1e-6 to 1e6
c_values = np.logspace(-6, 6, 1000)

distributions = [
    {
        'problem_2_model__C': c_values,
        'problem_2_model__solver': ['lbfgs'],
        'problem_2_model__penalty': ['l2'],
    },
    {
        'problem_2_model__C': c_values,
        'problem_2_model__solver': ['saga', 'liblinear'],
        'problem_2_model__penalty': ['l1', 'l2'],
    },
]

# Metric used to rank candidates during cross-validation
scoring_metric = 'roc_auc'

In [81]:
# Define the randomized hyperparameter search (fitted in a later cell).
# Both the candidate sampling (random_state on the searcher) and the fold
# assignment (random_state on KFold) are seeded so the search is reproducible;
# a shuffled KFold without a seed produces different folds on every run.
random_searcher = RandomizedSearchCV(
    problem_1_pipeline,
    distributions,
    n_iter=50,  # number of parameter combinations sampled
    scoring=scoring_metric,
    cv=KFold(n_splits=5, shuffle=True, random_state=101),  # shuffled, seeded folds
    refit=True,  # automatically retrain the best-performing model on all data passed to fit
    verbose=1,
    random_state=101,
    error_score='raise',  # surface failing parameter combinations instead of scoring them NaN
)

In [None]:
# Fit random_searcher object to training data
# Only the training split is used here; X_val / y_val stay untouched so they can
# serve as a proxy test set in the evaluation cells.
random_searcher.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [None]:
# Tabulate the search results, one row per sampled candidate
# Result columns holding the searched hyperparameters are prefixed with 'param_'
param_keys = [f'param_{key}' for key in random_searcher.best_params_]

# Order rows by hyperparameter values so similar candidates sit together
problem_1_hyp_results = pd.DataFrame(random_searcher.cv_results_).sort_values(param_keys)

# Show each candidate's hyperparameters with its mean CV score and rank
problem_1_hyp_results[param_keys + ['mean_test_score', 'rank_test_score']]

In [None]:
# Keep the winning hyperparameter combination and print it, one setting per line
problem_1_params = random_searcher.best_params_

for name in problem_1_params:
  print(name, ':', problem_1_params[name])

## Make predictions and plot ROC curve

In [None]:
# Score the refit best model on the training and validation splits.
# predict_proba returns one column per class; column 1 is the positive class.
proba_tr = random_searcher.predict_proba(X_train)
proba_val = random_searcher.predict_proba(X_val)
yhat_tr, yhat_val = proba_tr[:, 1], proba_val[:, 1]

# AUROC on each split; a large train/validation gap would suggest overfitting
roc_auc_tr = roc_auc_score(y_train, yhat_tr)
roc_auc_val = roc_auc_score(y_val, yhat_val)

print('Training score:', round(roc_auc_tr, 3))
print('Validation score:', round(roc_auc_val, 3))

In [None]:
# Confusion matrices with predicted probabilities thresholded at 0.5
pred_tr = yhat_tr >= 0.5
pred_val = yhat_val >= 0.5

print('Training CF:\n', confusion_matrix(y_train, pred_tr))
print('\nValidation CF:\n', confusion_matrix(y_val, pred_val))

In [None]:
# Plot ROC curves for the training and validation splits
tr_fpr, tr_tpr, tr_thresholds = roc_curve(y_train, yhat_tr)
val_fpr, val_tpr, val_thresholds = roc_curve(y_val, yhat_val)

plt.figure(figsize=(5,5))
plt.plot(tr_fpr, tr_tpr, 'g.-', label='Training')
plt.plot(val_fpr, val_tpr, 'r.-', label='Validation')

plt.xlabel('FPR')
plt.ylabel('TPR')
# This notebook covers Problem 2 (BERT embeddings); the title previously said
# "Problem 1", which mislabels the figure.
plt.title('Problem 2 ROC Curve')
plt.legend(loc='lower right')
plt.show()  # render the figure without echoing the Legend object's repr