# Loading Data


In [272]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, roc_curve, auc, roc_auc_score, accuracy_score, f1_score
)

# course helpers (already in your repo)
from cross_validation import make_train_and_test_row_ids_for_n_fold_cv, train_models_and_calc_scores_for_n_fold_cv

np.set_printoptions(threshold=50, linewidth=160, suppress=True)
# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8')


In [273]:
x_train_df = pd.read_csv('x_train.csv')
tr_list_of_text = x_train_df['text'].values.tolist()


x_train_df.head()          # first rows
x_train_df.info()          # column types & non-null counts
print(x_train_df['text'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5557 entries, 0 to 5556
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   author                         5557 non-null   object 
 1   title                          5557 non-null   object 
 2   passage_id                     5557 non-null   int64  
 3   text                           5557 non-null   object 
 4   char_count                     5557 non-null   float64
 5   word_count                     5557 non-null   float64
 6   sentence_count                 5557 non-null   float64
 7   avg_word_length                5557 non-null   float64
 8   avg_sentence_length            5557 non-null   float64
 9   type_token_ratio               5557 non-null   float64
 10  pronoun_freq                   5557 non-null   float64
 11  function_words_count           5557 non-null   float64
 12  punctuation_frequency          5557 non-null   f

#### Declare tokenize text function from lab 10

In [270]:
def tokenize_text(raw_text):
    ''' Transform a plain-text string into a list of tokens
    
    We assume that *whitespace* divides tokens.
    
    Args
    ----
    raw_text : string
    
    Returns
    -------
    list_of_tokens : list of strings
        Each element is one token in the provided text
    '''
    list_of_tokens = raw_text.split() # split method divides on whitespace by default
    for pp in range(len(list_of_tokens)):
        cur_token = list_of_tokens[pp]
        # Remove punctuation
        for punc in ['?', '!', '_', '.', ',', '"', '/', '--']:
            cur_token = cur_token.replace(punc, "")
        # Turn to lower case
        clean_token = cur_token
        # Replace the cleaned token into the original list
        list_of_tokens[pp] = clean_token
    return list_of_tokens
     

In [None]:
import re

# simple default stop list (tweak as needed)
DEFAULT_STOPWORDS = {
    "the", "or", "was", "is", "and", "I", "c"
}

# precompile a lightweight cleaner: keep letters/numbers and internal apostrophes/hyphens
_CLEAN_RE = re.compile(r"[^\w'-]+")  # removes chars other than letters/digits/_/'/-

def tokenize_text(raw_text, stopwords=DEFAULT_STOPWORDS, remove_stopwords=True):
    """
    Transform plain text into a list of tokens:
      - lowercase
      - strip simple punctuation
      - optionally drop stop words

    Args
    ----
    raw_text : str
    stopwords : set[str]         # custom stop list (lowercase)
    remove_stopwords : bool

    Returns
    -------
    list[str]
    """
    if not raw_text:
        return []

    # split on whitespace, then clean each token
    toks = []
    for tok in raw_text.split():
        tok = _CLEAN_RE.sub("", tok)    # remove punctuation clusters
        tok = tok.strip("-'")           # trim stray leading/trailing dashes/apostrophes
        if not tok:
            continue
        if remove_stopwords and tok in stopwords:
            continue
        toks.append(tok)
    return toks


### Add column "tokens" to x_train
#### - each row now contains a list of each tokenized word from that samples text

In [303]:
x_train_df["tokens"] = x_train_df["text"].astype(str).apply(tokenize_text)

## Attempt to filter passage specific words

In [304]:
unique_dict = dict()
x_train_df["uniq_tokens"] = x_train_df["tokens"].apply(set)

doc_freq = x_train_df["uniq_tokens"].explode().value_counts()

only_once = set(doc_freq[doc_freq == 1].index)

unique_dict = {tok: (1 if tok in only_once else 0) for tok in doc_freq.index}


x_train_df["tokens_noUnique"] = x_train_df["tokens"].apply(
    lambda toks: [t for t in toks if t not in only_once]
)

# for i in x_train_df:
#     for token in np.unique(x_train_df['tokens']):
#         if token in unique_dict:
#             unique_dict[token] = 0
#         else:
#             unique_dict[token] = 1



# Take dup-free text np.
    # Take Word
        # check if in Unique
            # if in Unique already set value = 0
            # if not add to Unique set value = 1 

import pandas as pd

# tokens column keeps original casing: list[str] per row
s = x_train_df["tokens"]          # or your column name

# count, per lowercase form, how often it appears capitalized vs total
def is_cap(t): 
    return t[:1].isupper() and not t.isupper()   # avoid killing acronyms like "USA"

tot = {}
cap = {}
for toks in s.dropna():
    for t in toks:
        tl = t.lower()
        tot[tl] = tot.get(tl, 0) + 1
        cap[tl] = cap.get(tl, 0) + (1 if is_cap(t) else 0)

# mark terms to drop if mostly capitalized (tunable threshold)
THRESH = 0.8
drop_names = {t for t, c in cap.items() if c / tot[t] >= THRESH and len(t) > 1}

# filter each doc; preserve original order
x_train_df["tokens_noProper"] = s.apply(
    lambda toks: [t for t in toks if t.lower() not in drop_names]
)






In [305]:
token_count_dict = dict()

for row in x_train_df['tokens_noProper']:
    for token in row:
        if not token:   # skip empty tokens caused by punctuation stripping
            continue
        if token in token_count_dict:
            token_count_dict[token] += 1
        else:
            token_count_dict[token] = 1


sorted_tokens = list(sorted(token_count_dict, key=token_count_dict.get, reverse=True))

print("\n Number of unique Tokens: ")
print(len(sorted_tokens))

print("\n TOP 10 TOKENS (with count)")
for w in sorted_tokens[:10]:
    print(f"{token_count_dict[w]:5d} {w}")

print("\n BOTTOM 10 TOKENS:")
for w in sorted_tokens[-10:]:
    print(f"{token_count_dict[w]:5d} {w}")


 Number of unique Tokens: 
28667

 TOP 10 TOKENS (with count)
11876 of
11697 to
 8847 a
 6418 in
 5255 that
 4688 he
 4184 his
 4132 it
 3437 with
 3378 had

 BOTTOM 10 TOKENS:
    1 whole-hearted
    1 squinted
    1 undressing
    1 saucy
    1 brandied
    1 absinthe
    1 teased
    1 hearse
    1 lumpy
    1 monumental


#### Create our finite list of vocab (only tokens that appear > 4 times)

In [307]:
vocab = [w for w in sorted_tokens ]#if token_count_dict[w] >= 4]
print(f"\n Vocab size (unfiltered)): {len(vocab)}")


 Vocab size (unfiltered)): 28667


# Featurizing Text

In [308]:
def transform_text_into_feature_vector(token_list, vocab_dict):
    """Return a count vector for a provided *tokenized* text (list of tokens)."""
    vocabulary_size = len(vocab_dict)
    counts_vector = np.zeros(vocabulary_size, dtype=int)
    
    for token in token_list:
        if token in vocab_dict:
            vocab_index = vocab_dict[token]
            counts_vector[vocab_index] += 1
    return counts_vector


#### Test featurization function with first row

In [309]:
print(transform_text_into_feature_vector(x_train_df["tokens_noProper"].iloc[0], token_count_dict))


[0 3 0 ... 0 0 0]


# Using Classifier

#### Load y_train labels

In [310]:
import pandas as pd
import numpy as np

# Load labels
y_train_df = pd.read_csv("y_train.csv")

# Basic column checks
assert "Coarse Label" in y_train_df.columns, "Missing 'Coarse Label' in y_train.csv"
assert len(y_train_df) == len(x_train_df), "Row counts differ; are files aligned the same way?"

# Peek unique coarse labels
print("Unique coarse labels in y_train:", y_train_df["Coarse Label"].unique())

# Map to {0,1}
label_map = {"Key Stage 2-3": 0, "Key Stage 4-5": 1}
y_coarse = y_train_df.iloc[:, 3]            # 4th column = 'Coarse Label'
y_tr_N = y_coarse.map(label_map).to_numpy() # int labels

# Sanity checks
assert y_tr_N.shape[0] == len(x_train_df), "Label length mismatch"
assert not np.isnan(y_tr_N).any(), "Found NaNs after mapping — unexpected label values?"

# Class counts + preview
print("y_tr_N shape:", y_tr_N.shape)
print("Class counts:", (y_tr_N == 0).sum(), "(KS2-3),", (y_tr_N == 1).sum(), "(KS4-5)")
print("First 10 labels:", y_tr_N[:10])


Unique coarse labels in y_train: ['Key Stage 2-3' 'Key Stage 4-5']
y_tr_N shape: (5557,)
Class counts: 2509 (KS2-3), 3048 (KS4-5)
First 10 labels: [0 0 0 0 0 0 0 0 0 0]


# Building BOW from only Training Data

In [312]:

def build_vocab_from_ids(train_ids, min_count=1):
    """Build a vocabulary from TRAIN rows only."""
    tok_count = {}
    for i in train_ids:
        # grab the tokenized list of words from each row
        toks = x_train_df.loc[i, 'tokens_noProper']
        for t in toks:
            # skip empty strings/tokens
            if t:
                tok_count[t] = tok_count.get(t, 0) + 1

    # only keep tokens that meet frequency threshold
    vocab = [token for token, c in tok_count.items() if c >= min_count]
    vocab.sort()
    # return vocab as list of tokens
    return vocab

# makes vocab dict from vocab list, key value = {word, count}
def make_vocab_dict(vocab):
    # Map token -> column index
    vocab_dict = dict()
    for vocab_id, tok in enumerate(vocab):
        vocab_dict[tok] = vocab_id
    return vocab_dict

# Function to build feature matrix
def featurize_ids_to_matrix(row_ids, vocab_dict):
    # Build (num rows, V) matrix using transform_text_into_feature_vector

    # Num features (num unique tokens)
    V = len(vocab_dict)
    # Initialize feature matrix
    X = np.zeros((len(row_ids), V), dtype=int)

    # iterate over input rows with their position in X -> build feature matrix
    for rr, idx in enumerate(row_ids):
        # grab token list for this document
        tokens = x_train_df.loc[idx, 'tokens_noProper']
        # use helper function to convert to feature vector
        X[rr] = transform_text_into_feature_vector(tokens, vocab_dict)
    return X


# Finding Best Model: K-Fold Validation

In [313]:
from sklearn.metrics import roc_auc_score
from cross_validation import make_train_and_test_row_ids_for_n_fold_cv

num_examples = len(x_train_df)
all_row_indices = np.arange(num_examples)

# C grid of values to sweep
C_grid = np.r_[1e-5, 3e-5, 1e-4, 3e-4, 5e-5, np.logspace(-3, 1, 9)]

# Using our homework function to make K folds
train_ids_per_fold, val_ids_per_fold = make_train_and_test_row_ids_for_n_fold_cv(
    n_examples=num_examples, n_folds=5, random_state=0
)

cv_summary = []
per_C_fold_aurocs = {}

# Sweep across c_values using K-fold
for c_value in C_grid:
    fold_aurocs = []
    for fold_index in range(len(train_ids_per_fold)):
        train_indices = train_ids_per_fold[fold_index]
        val_indices   = val_ids_per_fold[fold_index]

        # vocabulary from training data only (doesn't leak w/ validation)
        vocabulary  = build_vocab_from_ids(train_indices, min_count=1)
        vocab_index = make_vocab_dict(vocabulary)

        # featurize train/val with this fold's vocabulary
        X_train_fold = featurize_ids_to_matrix(train_indices, vocab_index)
        X_val_fold   = featurize_ids_to_matrix(val_indices,   vocab_index)

        y_train_fold = y_tr_N[train_indices]
        y_val_fold   = y_tr_N[val_indices]

        # train logistic regression
        clf = sklearn.linear_model.LogisticRegression(
            C=c_value, penalty="l2", solver="lbfgs",
            max_iter=5000, tol=1e-3, random_state=0
        )
        clf.fit(X_train_fold, y_train_fold)

        # AUROC on validation fold (project metric)
        val_prob_pos = clf.predict_proba(X_val_fold)[:, 1]
        fold_aurocs.append(roc_auc_score(y_val_fold, val_prob_pos))


    # Used chat to make clean print display per C_Val
    mean_val_auroc = float(np.mean(fold_aurocs))
    per_C_fold_aurocs[c_value] = fold_aurocs
    cv_summary.append((c_value, mean_val_auroc))
    print(
        f"C={c_value: .5g} | mean AUROC={mean_val_auroc:.4f} "
        f"| folds={np.array2string(np.array(fold_aurocs), precision=4)}"
    )

best_C, best_mean_auroc = max(cv_summary, key=lambda item: item[1])
print("\nBest C:", best_C, "| best mean val AUROC:", best_mean_auroc)


C= 1e-05 | mean AUROC=0.6832 | folds=[0.6682 0.6714 0.6988 0.7004 0.6774]
C= 3e-05 | mean AUROC=0.6853 | folds=[0.6699 0.6736 0.7007 0.7027 0.6793]
C= 0.0001 | mean AUROC=0.6910 | folds=[0.6755 0.6799 0.706  0.7088 0.6851]
C= 0.0003 | mean AUROC=0.7029 | folds=[0.687  0.6926 0.7175 0.7205 0.6971]
C= 5e-05 | mean AUROC=0.6872 | folds=[0.6718 0.6756 0.7021 0.7049 0.6815]
C= 0.001 | mean AUROC=0.7209 | folds=[0.7074 0.7128 0.7341 0.7364 0.714 ]
C= 0.0031623 | mean AUROC=0.7392 | folds=[0.7298 0.7362 0.7497 0.7516 0.7286]
C= 0.01 | mean AUROC=0.7527 | folds=[0.7434 0.7542 0.76   0.764  0.7417]
C= 0.031623 | mean AUROC=0.7590 | folds=[0.7457 0.761  0.7659 0.7724 0.7502]
C= 0.1 | mean AUROC=0.7576 | folds=[0.7373 0.7579 0.7669 0.7723 0.7537]
C= 0.31623 | mean AUROC=0.7503 | folds=[0.7274 0.7448 0.7623 0.7648 0.7521]
C= 1 | mean AUROC=0.7419 | folds=[0.7156 0.7333 0.7546 0.7575 0.7484]
C= 3.1623 | mean AUROC=0.7369 | folds=[0.7092 0.7282 0.7518 0.7511 0.7444]
C= 10 | mean AUROC=0.7333 | folds

# Run best model on ALL Training Data

In [314]:
all_indices = np.arange(len(x_train_df))
final_vocabulary   = build_vocab_from_ids(all_indices, min_count=4)
final_vocab_index  = make_vocab_dict(final_vocabulary)
X_all_train        = featurize_ids_to_matrix(all_indices, final_vocab_index)

clf_final = sklearn.linear_model.LogisticRegression(
    C=best_C, penalty="l2", solver="lbfgs",
    max_iter=5000, tol=1e-3, random_state=0
)
clf_final.fit(X_all_train, y_tr_N)

print("Final accuracy:", clf_final.score(X_all_train, y_tr_N))
print("Final AUROC:",
      roc_auc_score(y_tr_N, clf_final.predict_proba(X_all_train)[:, 1]))
print("Final vocabulary size:", len(final_vocabulary))


Final accuracy: 0.8137484254093935
Final AUROC: 0.8990822017116334
Final vocabulary size: 7611


## Visualize top weights

In [315]:
weights = clf_final.coef_[0]
sorted_indices = np.argsort(weights)
top_k = 20

print("\nMost negative:")
for index in sorted_indices[:top_k]:
    print(f"{weights[index]:9.4f}  {final_vocabulary[index]}")

print("\nMost positive:")
for index in sorted_indices[-top_k:][::-1]:
    print(f"{weights[index]:9.4f}  {final_vocabulary[index]}")



Most negative:
  -0.3626  It's
  -0.3614  talk
  -0.3520  asked
  -0.3036  Doctor
  -0.3017  story
  -0.3002  It
  -0.2992  minutes
  -0.2943  We
  -0.2760  He
  -0.2692  years
  -0.2657  So
  -0.2653  right
  -0.2612  boys
  -0.2581  uncle
  -0.2581  There
  -0.2555  She
  -0.2529  knew
  -0.2515  school
  -0.2465  feet
  -0.2378  D'Artagnan

Most positive:
   0.3450  Sir
   0.3336  K
   0.3013  therefore
   0.2531  almost
   0.2442  unto
   0.2434  which
   0.2422  husband
   0.2079  court
   0.2052  hath
   0.2033  women
   0.2013  death
   0.2005  lawyer
   0.1995  noble
   0.1983  Who
   0.1956  When
   0.1926  lord
   0.1858  To
   0.1831  Thou
   0.1785  eye
   0.1750  o


# Run Best Model on Test Data -> Outputs yproba1__test.txt

In [316]:
# Train final model on all training data (already built in previous cell) and score x_test
# Assumes the following already exist from the previous cell:
# all_indices, final_vocabulary, final_vocab_index, X_all_train, y_tr_N, best_C
# Also assumes tokenize_text and transform_text_into_feature_vector are defined.

# 1) Load and tokenize x_test exactly like x_train
x_test_df = pd.read_csv("x_test.csv")
x_test_df["tokens"] = x_test_df["text"].astype(str).apply(tokenize_text)

# 2) Featurize x_test using the SAME final_vocab_index
num_test = len(x_test_df)
V = len(final_vocab_index)
X_all_test = np.zeros((num_test, V), dtype=int)
for row_pos in range(num_test):
    tokens_row = x_test_df.loc[row_pos, "tokens"]
    X_all_test[row_pos] = transform_text_into_feature_vector(tokens_row, final_vocab_index)

# 3) Fit final Logistic Regression and produce probabilities for leaderboard
final_clf = sklearn.linear_model.LogisticRegression(C=best_C, penalty="l2", solver="lbfgs", max_iter=5000, tol=1e-3, random_state=0)
final_clf.fit(X_all_train, y_tr_N)
yproba_test = final_clf.predict_proba(X_all_test)[:, 1]

# 4) Save to disk (one probability per line)
np.savetxt("yproba1_test.txt", yproba_test, fmt="%.6f")

print("Saved yproba1_test.txt", "| test shape:", yproba_test.shape, "| first 5:", yproba_test[:5])



Saved yproba1_test.txt | test shape: (1197,) | first 5: [0.8512277  0.88107835 0.84904821 0.79808213 0.88156887]
