# Loading Data


In [54]:
# Install Dependencies
import numpy as np
import pandas as pd
import re

import sklearn.linear_model
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.pipeline

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8')


In [20]:
x_train_df = pd.read_csv('x_train.csv')
tr_list_of_text = x_train_df['text'].values.tolist()


x_train_df.head()          # first rows
x_train_df.info()          # column types & non-null counts
print(x_train_df['text'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5557 entries, 0 to 5556
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   author                         5557 non-null   object 
 1   title                          5557 non-null   object 
 2   passage_id                     5557 non-null   int64  
 3   text                           5557 non-null   object 
 4   char_count                     5557 non-null   float64
 5   word_count                     5557 non-null   float64
 6   sentence_count                 5557 non-null   float64
 7   avg_word_length                5557 non-null   float64
 8   avg_sentence_length            5557 non-null   float64
 9   type_token_ratio               5557 non-null   float64
 10  pronoun_freq                   5557 non-null   float64
 11  function_words_count           5557 non-null   float64
 12  punctuation_frequency          5557 non-null   f

#### Declare tokenize text function from lab 10

In [48]:
def tokenize_text(raw_text):
    ''' Transform a plain-text string into a list of tokens
    
    We assume that *whitespace* divides tokens.
    
    Args
    ----
    raw_text : string
    
    Returns
    -------
    list_of_tokens : list of strings
        Each element is one token in the provided text
    '''
    list_of_tokens = raw_text.split() # split method divides on whitespace by default
    for pp in range(len(list_of_tokens)):
        cur_token = list_of_tokens[pp]
        # Remove punctuation
        for punc in ['?', '!', '_', '.', ',', '"', '/', '--']:
            cur_token = cur_token.replace(punc, "")
        # Turn to lower case
        clean_token = cur_token.lower()
        # Replace the cleaned token into the original list
        list_of_tokens[pp] = clean_token
    return list_of_tokens
     

### Add column "tokens" to x_train
#### - each row now contains a list of each tokenized word from that samples text

In [99]:
x_train_df["tokens"] = x_train_df["text"].astype(str).apply(tokenize_text)

In [100]:
for line in x_train_df['tokens'][:5]:
    print("\nTokenized text per sample:")
    print(line)  


Tokenized text per sample:
['yes', 'what', 'sort', 'of', 'terms', 'was', 'he', 'on', 'with', 'the', 'guests—you', 'and', 'miss', 'norris', 'and', 'all', 'of', 'them', 'just', 'polite', 'and', 'rather', 'silent', 'you', 'know', 'keeping', 'himself', 'to', 'himself', 'we', "didn't", 'see', 'so', 'very', 'much', 'of', 'him', 'except', 'at', 'meals', 'we', 'were', 'here', 'to', 'enjoy', 'ourselves', 'and—well', 'he', "wasn't", 'he', "wasn't", 'there', 'when', 'the', 'ghost', 'walked', 'no', 'i', 'heard', 'mark', 'calling', 'for', 'him', 'when', 'he', 'went', 'back', 'to', 'the', 'house', 'i', 'expect', 'cayley', 'stroked', 'down', 'his', 'feathers', 'a', 'bit', 'and', 'told', 'him', 'that', 'girls', 'will', 'be', 'girls—hallo', 'here', 'we', 'are']

Tokenized text per sample:
['perhaps', 'i', 'should', 'say', 'that', 'it', 'was', "mark's", 'private', 'plan', 'my', 'own', 'was', 'different', 'the', 'announcement', 'at', 'breakfast', 'went', 'well', 'after', 'the', 'golfing-party', 'had', '

In [101]:
token_count_dict = dict()

for row in x_train_df['tokens']:
    for token in row:
        if not token:   # skip empty tokens caused by punctuation stripping
            continue
        if token in token_count_dict:
            token_count_dict[token] += 1
        else:
            token_count_dict[token] = 1


sorted_tokens = list(sorted(token_count_dict, key=token_count_dict.get, reverse=True))

print("\n Number of unique Tokens: ")
print(len(sorted_tokens))

print("\n TOP 10 TOKENS (with count)")
for w in sorted_tokens[:10]:
    print(f"{token_count_dict[w]:5d} {w}")

print("\n BOTTOM 10 TOKENS:")
for w in sorted_tokens[-10:]:
    print(f"{token_count_dict[w]:5d} {w}")


 Number of unique Tokens: 
34791

 TOP 10 TOKENS (with count)
24175 the
14488 and
12029 of
11904 to
 9227 a
 6779 in
 6730 i
 5982 he
 5501 that
 5258 was

 BOTTOM 10 TOKENS:
    1 honorable;
    1 saucy
    1 brandied
    1 absinthe
    1 teased
    1 coquenard
    1 madinier
    1 hearse
    1 lumpy
    1 monumental


#### Create our finite list of vocab (only tokens that appear > 4 times)

In [102]:
vocab = [w for w in sorted_tokens ]#if token_count_dict[w] >= 4]
print(f"\n Vocab size (unfiltered)): {len(vocab)}")


 Vocab size (unfiltered)): 34791


# Featurizing Text

In [103]:
vocab_dict = dict()
for vocab_id, tok in enumerate(vocab):
    vocab_dict[tok] = vocab_id

In [106]:
def transform_text_into_feature_vector(token_list, vocab_dict):
    ''' Produce count feature vector for provided *tokenized* text (list of tokens) '''
    V = len(vocab_dict.keys())
    count_V = np.zeros(V, dtype=int)
    for token in token_list:
        if token in vocab_dict:
            vv = vocab_dict[token]
            count_V[vv] += 1
    return count_V


#### Test featurization function with common words

In [107]:
print(transform_text_into_feature_vector(x_train_df["tokens"].iloc[0], vocab_dict))


[3 4 3 3 1 0 2 4 1 1 0 1 1 1 0 0 1 0 0 0 0 0 1 1 1 3 0 0 0 0 1 0 0 0 0 0 1 1 0 0 3 1 1 0 0 1 2 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1
 0 0 0 0 0 0 2 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 

# Using Classifier

#### Load y_train labels

In [138]:
y_train_df = pd.read_csv('y_train.csv')

y_coarse = y_train_df.iloc[:, 3]  # Column 4 = 'Coarse Label'

# Map to {0,1}
label_map = {"Key Stage 2-3": 0, "Key Stage 4-5": 1}
y_tr_N = y_coarse.map(label_map).to_numpy()

# Sanity check
assert len(y) == len(x_train_df), "Label length mismatch"


In [139]:
import pandas as pd
import numpy as np

# 1) Load labels
y_train_df = pd.read_csv("y_train.csv")

# 2) Basic column checks
assert "Coarse Label" in y_train_df.columns, "Missing 'Coarse Label' in y_train.csv"
assert len(y_train_df) == len(x_train_df), "Row counts differ; are files aligned the same way?"

# 3) Peek unique coarse labels
print("Unique coarse labels in y_train:", y_train_df["Coarse Label"].unique())

# 4) Map to {0,1}
label_map = {"Key Stage 2-3": 0, "Key Stage 4-5": 1}
y_coarse = y_train_df.iloc[:, 3]            # 4th column = 'Coarse Label'
y_tr_N = y_coarse.map(label_map).to_numpy() # int labels

# 5) Sanity checks
assert y_tr_N.shape[0] == len(x_train_df), "Label length mismatch"
assert not np.isnan(y_tr_N).any(), "Found NaNs after mapping — unexpected label values?"

# 6) Class counts + preview
print("y_tr_N shape:", y_tr_N.shape)
print("Class counts:", (y_tr_N == 0).sum(), "(KS2-3),", (y_tr_N == 1).sum(), "(KS4-5)")
print("First 10 labels:", y_tr_N[:10])


Unique coarse labels in y_train: ['Key Stage 2-3' 'Key Stage 4-5']
y_tr_N shape: (5557,)
Class counts: 2509 (KS2-3), 3048 (KS4-5)
First 10 labels: [0 0 0 0 0 0 0 0 0 0]


In [140]:
# Build (N, V) feature matrix from the tokenized column
N = len(x_train_df)
V = len(vocab_dict)
x_tr_NV = np.zeros((N, V))

for nn, token_list in enumerate(x_train_df["tokens"]):
    x_tr_NV[nn] = transform_text_into_feature_vector(token_list, vocab_dict)

print(x_tr_NV.shape)  # (N, V)


(5557, 34791)


In [141]:
clf = sklearn.linear_model.LogisticRegression(C=1e-2, max_iter=5000, penalty="l2",solver="lbfgs") 
clf.fit(x_tr_NV, y_tr_N)

print("Training Accuracy:" , clf.score(x_tr_NV, y_tr_N))

Training Accuracy: 0.776858016915602


In [142]:
import numpy as np

# Ensure vocab_list is ordered by column index (0..V-1)
# (skip if you already have a correctly ordered list)
vocab_list = [None]*len(vocab_dict)
for tok, j in vocab_dict.items():
    vocab_list[j] = tok

# Get weights (binary LR)
weights_V = clf.coef_[0]

# Sort ids by weight (ascending = most negative first)
sorted_tok_ids_V = np.argsort(weights_V)

# Print bottom K (most negative → push class 0)
K = 25
print("\n=== Most negative weights (class 0) ===")
for vv in sorted_tok_ids_V[:K]:
    print("% 9.4f  %s" % (weights_V[vv], vocab_list[vv]))

# Print top K (most positive → push class 1)
print("\n=== Most positive weights (class 1) ===")
for vv in sorted_tok_ids_V[-K:][::-1]:
    print("% 9.4f  %s" % (weights_V[vv], vocab_list[vv]))





=== Most negative weights (class 0) ===
  -0.1869  robin
  -0.1865  philip
  -0.1786  talk
  -0.1759  asked
  -0.1750  said
  -0.1725  mr
  -0.1669  bambi
  -0.1664  myles
  -0.1642  oh
  -0.1525  pinocchio
  -0.1512  over
  -0.1497  white
  -0.1483  don't
  -0.1443  it
  -0.1413  she
  -0.1411  minutes
  -0.1407  mrs
  -0.1398  one
  -0.1397  doctor
  -0.1387  you're
  -0.1373  alice
  -0.1332  right
  -0.1318  years
  -0.1316  margaret
  -0.1315  dick

=== Most positive weights (class 1) ===
   0.2245  which
   0.1897  sir
   0.1467  julien
   0.1426  for
   0.1384  therefore
   0.1365  or
   0.1327  'i
   0.1296  thou
   0.1273  k
   0.1227  as
   0.1205  should
   0.1169  unto
   0.1157  whom
   0.1124  trina
   0.1112  almost
   0.1112  of
   0.1089  when
   0.1070  hath
   0.1065  shall
   0.1059  husband
   0.1057  by
   0.1033  o
   0.1026  jurgis
   0.1007  from
   0.0958  our


In [143]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_tr_N, clf.predict(X_bow_csr if 'X_bow_csr' in globals() else x_tr_NV)))


[[1759  750]
 [ 490 2558]]
