# Loading Data


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, roc_curve, auc, roc_auc_score, accuracy_score, f1_score
)

# course helpers (already in your repo)
from cross_validation import make_train_and_test_row_ids_for_n_fold_cv, train_models_and_calc_scores_for_n_fold_cv

# Array printing options for NumPy output (see the np.set_printoptions docs for threshold/linewidth/suppress).
np.set_printoptions(threshold=50, linewidth=160, suppress=True)
# import plotting libraries
# NOTE(review): matplotlib.pyplot is already imported as plt earlier in this cell, so the re-import below is redundant.
import matplotlib
import matplotlib.pyplot as plt
# Apply the 'seaborn-v0_8' matplotlib style sheet to subsequent plots.
plt.style.use('seaborn-v0_8')


In [20]:
# Read the training passages; the raw strings live in the 'text' column.
x_train_df = pd.read_csv('x_train.csv')
tr_list_of_text = x_train_df['text'].to_numpy().tolist()

# Quick inspection of the frame.
# NOTE: head() is not the last expression of the cell, so its result is not rendered.
x_train_df.head()
x_train_df.info()  # column types & non-null counts
print(x_train_df.loc[:, 'text'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5557 entries, 0 to 5556
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   author                         5557 non-null   object 
 1   title                          5557 non-null   object 
 2   passage_id                     5557 non-null   int64  
 3   text                           5557 non-null   object 
 4   char_count                     5557 non-null   float64
 5   word_count                     5557 non-null   float64
 6   sentence_count                 5557 non-null   float64
 7   avg_word_length                5557 non-null   float64
 8   avg_sentence_length            5557 non-null   float64
 9   type_token_ratio               5557 non-null   float64
 10  pronoun_freq                   5557 non-null   float64
 11  function_words_count           5557 non-null   float64
 12  punctuation_frequency          5557 non-null   f

#### Declare tokenize text function from lab 10

In [48]:
def tokenize_text(raw_text, punctuation=('?', '!', '_', '.', ',', '"', '/', '--')):
    ''' Transform a plain-text string into a list of lower-case tokens

    We assume that *whitespace* divides tokens. Every substring listed in
    `punctuation` is deleted from each token, and the token is lower-cased.
    Tokens that become empty after stripping (e.g. a lone "--" or "...")
    are dropped, so callers never see empty strings.

    Note: the default punctuation list does not cover every mark; characters
    such as ';', "'" and the em dash are kept inside tokens.

    Args
    ----
    raw_text : string
    punctuation : iterable of strings, optional
        Substrings removed from every token. Defaults to the original
        hard-coded list, so existing callers get the same cleaning.

    Returns
    -------
    list_of_tokens : list of strings
        Each element is one non-empty, cleaned token in the provided text
    '''
    list_of_tokens = []
    for raw_token in raw_text.split():  # split method divides on whitespace by default
        clean_token = raw_token
        # Remove punctuation
        for punc in punctuation:
            clean_token = clean_token.replace(punc, "")
        # Turn to lower case
        clean_token = clean_token.lower()
        # Keep only tokens that still have content after cleaning
        if clean_token:
            list_of_tokens.append(clean_token)
    return list_of_tokens
     

### Add column "tokens" to x_train
#### - each row now contains the list of tokens produced from that sample's text

In [161]:
# Tokenize every passage; each cell of the new column holds that row's token list.
x_train_df["tokens"] = x_train_df["text"].astype(str).apply(tokenize_text)

# Show the token lists of the first five samples as a sanity check.
for sample_tokens in x_train_df["tokens"].iloc[:5]:
    print("\nTokenized text per sample:")
    print(sample_tokens)


Tokenized text per sample:
['yes', 'what', 'sort', 'of', 'terms', 'was', 'he', 'on', 'with', 'the', 'guests—you', 'and', 'miss', 'norris', 'and', 'all', 'of', 'them', 'just', 'polite', 'and', 'rather', 'silent', 'you', 'know', 'keeping', 'himself', 'to', 'himself', 'we', "didn't", 'see', 'so', 'very', 'much', 'of', 'him', 'except', 'at', 'meals', 'we', 'were', 'here', 'to', 'enjoy', 'ourselves', 'and—well', 'he', "wasn't", 'he', "wasn't", 'there', 'when', 'the', 'ghost', 'walked', 'no', 'i', 'heard', 'mark', 'calling', 'for', 'him', 'when', 'he', 'went', 'back', 'to', 'the', 'house', 'i', 'expect', 'cayley', 'stroked', 'down', 'his', 'feathers', 'a', 'bit', 'and', 'told', 'him', 'that', 'girls', 'will', 'be', 'girls—hallo', 'here', 'we', 'are']

Tokenized text per sample:
['perhaps', 'i', 'should', 'say', 'that', 'it', 'was', "mark's", 'private', 'plan', 'my', 'own', 'was', 'different', 'the', 'announcement', 'at', 'breakfast', 'went', 'well', 'after', 'the', 'golfing-party', 'had', '

In [101]:
# Count how often each token occurs across all training rows.
token_count_dict = {}
for row_tokens in x_train_df['tokens']:
    for tok in row_tokens:
        if tok:  # empty tokens are left over from punctuation stripping; ignore them
            token_count_dict[tok] = token_count_dict.get(tok, 0) + 1

# Tokens ordered from most to least frequent.
sorted_tokens = sorted(token_count_dict, key=lambda tok: token_count_dict[tok], reverse=True)

print("\n Number of unique Tokens: ")
print(len(sorted_tokens))

# Print both ends of the frequency ranking with the same formatting.
for header, shown_tokens in (
    ("\n TOP 10 TOKENS (with count)", sorted_tokens[:10]),
    ("\n BOTTOM 10 TOKENS:", sorted_tokens[-10:]),
):
    print(header)
    for w in shown_tokens:
        print(f"{token_count_dict[w]:5d} {w}")


 Number of unique Tokens: 
34791

 TOP 10 TOKENS (with count)
24175 the
14488 and
12029 of
11904 to
 9227 a
 6779 in
 6730 i
 5982 he
 5501 that
 5258 was

 BOTTOM 10 TOKENS:
    1 honorable;
    1 saucy
    1 brandied
    1 absinthe
    1 teased
    1 coquenard
    1 madinier
    1 hearse
    1 lumpy
    1 monumental


#### Create our vocabulary list (currently unfiltered: every observed token is kept, ordered by descending count)

In [102]:
# Keep every observed token; sorted_tokens is already ordered by descending count.
vocab = list(sorted_tokens)
print(f"\n Vocab size (unfiltered): {len(vocab)}")


 Vocab size (unfiltered)): 34791


# Featurizing Text

In [166]:
def transform_text_into_feature_vector(token_list, vocab_dict):
    """Turn a *tokenized* text into a vector of per-token counts.

    Args
    ----
    token_list : list of strings
        Tokens of one document.
    vocab_dict : dict
        Maps each vocabulary token to its position in the output vector.

    Returns
    -------
    counts_vector : 1D integer array with one entry per key of vocab_dict
        Entry j is how many times the token mapped to j occurs in token_list.
        Tokens that are not in vocab_dict are ignored.
    """
    counts_vector = np.zeros(len(vocab_dict), dtype=int)
    # Look up the column of every in-vocabulary token, then tally one at a time.
    known_columns = (vocab_dict[tok] for tok in token_list if tok in vocab_dict)
    for column in known_columns:
        counts_vector[column] += 1
    return counts_vector


#### Test featurization function with first row

In [167]:
# Map each token to its column index, i.e. its position in the frequency-sorted `vocab` list.
# Defined here so the cell also runs on a fresh kernel.
vocab_dict = {token: column for column, token in enumerate(vocab)}
print(transform_text_into_feature_vector(x_train_df["tokens"].iloc[0], vocab_dict))


[3 4 3 3 1 0 2 4 1 1 0 1 1 1 0 0 1 0 0 0 0 0 1 1 1 3 0 0 0 0 1 0 0 0 0 0 1 1 0 0 3 1 1 0 0 1 2 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1
 0 0 0 0 0 0 2 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 

# Using Classifier

#### Load y_train labels

In [None]:
import pandas as pd
import numpy as np

# Load labels
y_train_df = pd.read_csv("y_train.csv")

# Basic column checks
assert "Coarse Label" in y_train_df.columns, "Missing 'Coarse Label' in y_train.csv"
assert len(y_train_df) == len(x_train_df), "Row counts differ; are files aligned the same way?"

# Peek unique coarse labels
print("Unique coarse labels in y_train:", y_train_df["Coarse Label"].unique())

# Map to {0,1}
label_map = {"Key Stage 2-3": 0, "Key Stage 4-5": 1}
# Select by name (validated above) rather than by position, so a changed
# column order in the CSV cannot silently pick the wrong column.
y_coarse = y_train_df["Coarse Label"]
y_tr_N = y_coarse.map(label_map).to_numpy() # labels missing from label_map become NaN

# Sanity checks
assert y_tr_N.shape[0] == len(x_train_df), "Label length mismatch"
assert not np.isnan(y_tr_N).any(), "Found NaNs after mapping — unexpected label values?"

# Class counts + preview
print("y_tr_N shape:", y_tr_N.shape)
print("Class counts:", (y_tr_N == 0).sum(), "(KS2-3),", (y_tr_N == 1).sum(), "(KS4-5)")
print("First 10 labels:", y_tr_N[:10])


Unique coarse labels in y_train: ['Key Stage 2-3' 'Key Stage 4-5']
y_tr_N shape: (5557,)
Class counts: 2509 (KS2-3), 3048 (KS4-5)
First 10 labels: [0 0 0 0 0 0 0 0 0 0]


# Building BOW from only Training Data

In [None]:

def build_vocab_from_ids(train_ids, min_count=4, df=None):
    """Build a vocabulary from the given rows only.

    Args
    ----
    train_ids : iterable of row labels
        Rows whose tokens are counted; each is looked up with df.loc[id, 'tokens'].
    min_count : int
        A token is kept only if it occurs at least this many times
        across the selected rows.
    df : DataFrame with a 'tokens' column, optional
        Frame to read token lists from. Defaults to the notebook-level
        x_train_df, which is what the original implementation always used.

    Returns
    -------
    vocab : list of strings
        Kept tokens in ascending sorted order.
    """
    if df is None:
        df = x_train_df

    tok_count = {}
    for i in train_ids:
        # grab the tokenized list of words from each row
        for t in df.loc[i, 'tokens']:
            # skip empty strings/tokens
            if t:
                tok_count[t] = tok_count.get(t, 0) + 1

    # only keep tokens that meet the frequency threshold, sorted for a stable column order
    return sorted(token for token, c in tok_count.items() if c >= min_count)

# Turns a vocab list into a lookup dict: {token: column index}
def make_vocab_dict(vocab):
    """Map every token in `vocab` to its position in the list.

    If a token appears more than once, its last position wins.
    """
    return {tok: vocab_id for vocab_id, tok in enumerate(vocab)}

# Function to build feature matrix
def featurize_ids_to_matrix(row_ids, vocab_dict, df=None):
    """Build a (num rows, V) count matrix using transform_text_into_feature_vector.

    Args
    ----
    row_ids : sequence of row labels
        Rows to featurize; each is looked up with df.loc[id, 'tokens'].
    vocab_dict : dict
        Maps token -> column index; V is its number of entries.
    df : DataFrame with a 'tokens' column, optional
        Frame to read token lists from. Defaults to the notebook-level
        x_train_df, which is what the original implementation always used.
        Passing another frame (e.g. the test set) lets it share this code
        instead of duplicating the loop.

    Returns
    -------
    X : 2D integer array, shape (len(row_ids), V)
        Row rr holds the token counts for row_ids[rr].
    """
    if df is None:
        df = x_train_df

    # Num features (num unique tokens)
    V = len(vocab_dict)
    # Initialize feature matrix
    X = np.zeros((len(row_ids), V), dtype=int)

    # iterate over input rows with their position in X -> build feature matrix
    for rr, idx in enumerate(row_ids):
        # grab token list for this document
        tokens = df.loc[idx, 'tokens']
        # use helper function to convert to feature vector
        X[rr] = transform_text_into_feature_vector(tokens, vocab_dict)
    return X


# Finding Best Model: K-Fold Validation

In [189]:
from sklearn.metrics import roc_auc_score
from cross_validation import make_train_and_test_row_ids_for_n_fold_cv

num_examples = len(x_train_df)
all_row_indices = np.arange(num_examples)

# C grid of values to sweep
C_grid = np.r_[1e-5, 3e-5, 1e-4, 3e-4, np.logspace(-3, 1, 9)]

# Using our homework function to make K folds
train_ids_per_fold, val_ids_per_fold = make_train_and_test_row_ids_for_n_fold_cv(
    n_examples=num_examples, n_folds=5, random_state=0
)

# One list of validation AUROCs per C value, filled in fold order.
per_C_fold_aurocs = {c_value: [] for c_value in C_grid}

# Folds are the OUTER loop: the vocabulary and feature matrices depend only on
# the fold, not on C, so they are built once per fold and reused for every C.
for train_indices, val_indices in zip(train_ids_per_fold, val_ids_per_fold):
    # vocabulary from training data only (doesn't leak w/ validation)
    vocabulary  = build_vocab_from_ids(train_indices, min_count=4)
    vocab_index = make_vocab_dict(vocabulary)

    # featurize train/val with this fold's vocabulary
    X_train_fold = featurize_ids_to_matrix(train_indices, vocab_index)
    X_val_fold   = featurize_ids_to_matrix(val_indices,   vocab_index)

    y_train_fold = y_tr_N[train_indices]
    y_val_fold   = y_tr_N[val_indices]

    # Sweep across c_values on this fold's features
    for c_value in C_grid:
        # train logistic regression
        clf = sklearn.linear_model.LogisticRegression(
            C=c_value, penalty="l2", solver="lbfgs",
            max_iter=5000, tol=1e-3, random_state=0
        )
        clf.fit(X_train_fold, y_train_fold)

        # AUROC on validation fold (project metric)
        val_prob_pos = clf.predict_proba(X_val_fold)[:, 1]
        per_C_fold_aurocs[c_value].append(roc_auc_score(y_val_fold, val_prob_pos))

# Summarize per C value, in grid order.
cv_summary = []
for c_value in C_grid:
    fold_aurocs = per_C_fold_aurocs[c_value]
    # Used chat to make clean print display per C_Val
    mean_val_auroc = float(np.mean(fold_aurocs))
    cv_summary.append((c_value, mean_val_auroc))
    print(
        f"C={c_value: .5g} | mean AUROC={mean_val_auroc:.4f} "
        f"| folds={np.array2string(np.array(fold_aurocs), precision=4)}"
    )

# Pick the C with the highest mean validation AUROC.
best_C, best_mean_auroc = max(cv_summary, key=lambda item: item[1])
print("\nBest C:", best_C, "| best mean val AUROC:", best_mean_auroc)


C= 1e-05 | mean AUROC=0.6870 | folds=[0.664  0.6711 0.7041 0.7225 0.6733]
C= 3e-05 | mean AUROC=0.6926 | folds=[0.6715 0.6748 0.7098 0.7281 0.6789]
C= 0.0001 | mean AUROC=0.7034 | folds=[0.687  0.6831 0.7201 0.7361 0.6908]
C= 0.0003 | mean AUROC=0.7162 | folds=[0.7052 0.6947 0.7316 0.7454 0.7041]
C= 0.001 | mean AUROC=0.7311 | folds=[0.7236 0.7123 0.7458 0.7555 0.7185]
C= 0.0031623 | mean AUROC=0.7471 | folds=[0.74   0.7335 0.7607 0.767  0.7341]
C= 0.01 | mean AUROC=0.7650 | folds=[0.7567 0.7563 0.7767 0.783  0.7522]
C= 0.031623 | mean AUROC=0.7827 | folds=[0.7705 0.7756 0.7929 0.8019 0.7726]
C= 0.1 | mean AUROC=0.7944 | folds=[0.7796 0.7856 0.8016 0.8152 0.7902]
C= 0.31623 | mean AUROC=0.7974 | folds=[0.7809 0.7838 0.8025 0.8202 0.7996]
C= 1 | mean AUROC=0.7931 | folds=[0.7777 0.7775 0.7943 0.8185 0.7977]
C= 3.1623 | mean AUROC=0.7884 | folds=[0.7744 0.7704 0.7888 0.8151 0.7933]
C= 10 | mean AUROC=0.7881 | folds=[0.7722 0.7693 0.7907 0.8149 0.7935]

Best C: 0.31622776601683794 | best 

# Run best model on ALL Training Data

In [180]:
# Rebuild the vocabulary and count features from every training row.
all_indices = np.arange(len(x_train_df))
final_vocabulary = build_vocab_from_ids(all_indices, min_count=4)
final_vocab_index = make_vocab_dict(final_vocabulary)
X_all_train = featurize_ids_to_matrix(all_indices, final_vocab_index)

# Fit logistic regression at the C selected by cross-validation
# (fit returns the estimator itself, so the call can be chained).
clf_final = sklearn.linear_model.LogisticRegression(
    penalty="l2",
    C=best_C,
    solver="lbfgs",
    tol=1e-3,
    max_iter=5000,
    random_state=0,
).fit(X_all_train, y_tr_N)

# NOTE: both scores below are computed on the same rows the model was fit on.
print("Final accuracy:", clf_final.score(X_all_train, y_tr_N))
print("Final AUROC:", roc_auc_score(y_tr_N, clf_final.predict_proba(X_all_train)[:, 1]))
print("Final vocabulary size:", len(final_vocabulary))


Final accuracy: 0.9731869713874393
Final AUROC: 0.9969863086066015
Final vocabulary size: 8119


## Visualize top weights

In [185]:
# One learned coefficient per vocabulary column; argsort orders them from most negative to most positive.
weights = clf_final.coef_[0]
sorted_indices = np.argsort(weights)
top_k = 20

# Print the top_k tokens at each end of the ranking with identical formatting.
for heading, chosen_indices in (
    ("\nMost negative:", sorted_indices[:top_k]),
    ("\nMost positive:", sorted_indices[-top_k:][::-1]),
):
    print(heading)
    for index in chosen_indices:
        print(f"{weights[index]:9.4f}  {final_vocabulary[index]}")



Most negative:
  -1.4321  myles
  -1.2932  robin
  -1.2238  margaret
  -1.1637  philip
  -1.1391  alice
  -1.1160  oblomov
  -1.0481  pinocchio
  -1.0357  bambi
  -0.9444  alessandro
  -0.8904  west
  -0.8376  smith
  -0.8299  carrie
  -0.8244  arkady
  -0.8086  kim
  -0.8060  dick
  -0.8007  elnora
  -0.7984  story
  -0.7958  barbicane
  -0.7944  minutes
  -0.7729  d'artagnan

Most positive:
   1.1240  julien
   0.9864  trina
   0.9640  k
   0.9410  ann
   0.9043  'i
   0.9019  jurgis
   0.7750  gervaise
   0.7621  peterkin
   0.7611  dedalus
   0.7007  hath
   0.6934  coupeau
   0.6848  husband
   0.6798  unto
   0.6777  therefore
   0.6754  charles
   0.6680  noble
   0.6665  amongst
   0.6552  asleep
   0.6499  martin
   0.6350  orders


# Run Best Model on Test Data -> Outputs yproba1_test.txt

In [187]:
# Score x_test with the final model that was already fitted on all training data.
# Assumes the following already exist from the previous cells:
# final_vocab_index (token -> column index) and clf_final (fitted on X_all_train, y_tr_N with C=best_C).
# Also assumes tokenize_text and transform_text_into_feature_vector are defined.

# 1) Load and tokenize x_test exactly like x_train
x_test_df = pd.read_csv("x_test.csv")
x_test_df["tokens"] = x_test_df["text"].astype(str).apply(tokenize_text)

# 2) Featurize x_test using the SAME final_vocab_index
num_test = len(x_test_df)
V = len(final_vocab_index)
X_all_test = np.zeros((num_test, V), dtype=int)
for row_pos in range(num_test):
    tokens_row = x_test_df.loc[row_pos, "tokens"]
    X_all_test[row_pos] = transform_text_into_feature_vector(tokens_row, final_vocab_index)

# 3) Reuse the already-fitted final model instead of refitting an identical one
#    (same data, same hyperparameters). The final_clf name is kept as an alias.
final_clf = clf_final
assert X_all_test.shape[1] == final_clf.coef_.shape[1], "Test features do not match the fitted model's vocabulary"
yproba_test = final_clf.predict_proba(X_all_test)[:, 1]

# 4) Save to disk (one probability per line)
np.savetxt("yproba1_test.txt", yproba_test, fmt="%.6f")

print("Saved yproba1_test.txt", "| test shape:", yproba_test.shape, "| first 5:", yproba_test[:5])



Saved yproba1_test.txt | test shape: (1197,) | first 5: [0.94926825 0.98407203 0.94275232 0.93411521 0.97585852]
