# Basic Model: Logistic Regression with SpaCy tokenization and CountVectorizer


## Data Loading and Preprocessing

In [None]:
!unzip pii-detection-removal-from-educational-data.zip

Archive:  pii-detection-removal-from-educational-data.zip
  inflating: sample_submission.csv   
  inflating: test.json               
  inflating: train.json              


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw training data.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so that reading
# does not depend on the platform's default locale encoding.
with open('train.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert data to DataFrame (one row per record in the JSON file)
df = pd.DataFrame(data)

# Display data structure
print(df.head())

# Flatten the per-record token and label lists
def extract_tokens_labels(data):
    """Concatenate the 'tokens' and 'labels' lists of every record.

    Args:
        data: iterable of mappings, each with a 'tokens' and a 'labels' entry.

    Returns:
        A ``(tokens, labels)`` tuple of two flat lists, in record order.
    """
    # Materialise once so that one-shot iterables can be traversed twice below.
    records = list(data)
    flat_tokens = [tok for rec in records for tok in rec['tokens']]
    flat_labels = [lab for rec in records for lab in rec['labels']]
    return flat_tokens, flat_labels

# Flatten all records into one token sequence and one parallel label sequence
tokens, labels = extract_tokens_labels(data)

# Hold out 20% of the individual tokens for validation (fixed seed for repeatability).
# NOTE(review): the split is per token, not per document, so tokens from the same
# record can end up on both sides of the split.
split_parts = train_test_split(tokens, labels, test_size=0.2, random_state=42)
tokens_train, tokens_val, labels_train, labels_val = split_parts

   document                                          full_text  \
0         7  Design Thinking for innovation reflexion-Avril...   
1        10  Diego Estrada\n\nDesign Thinking Assignment\n\...   
2        16  Reporting process\n\nby Gilberto Gamboa\n\nCha...   
3        20  Design Thinking for Innovation\n\nSindy Samaca...   
4        56  Assignment:  Visualization Reflection  Submitt...   

                                              tokens  \
0  [Design, Thinking, for, innovation, reflexion,...   
1  [Diego, Estrada, \n\n, Design, Thinking, Assig...   
2  [Reporting, process, \n\n, by, Gilberto, Gambo...   
3  [Design, Thinking, for, Innovation, \n\n, Sind...   
4  [Assignment, :,   , Visualization,  , Reflecti...   

                                 trailing_whitespace  \
0  [True, True, True, True, False, False, True, F...   
1  [True, False, False, True, True, False, False,...   
2  [True, False, False, True, True, False, False,...   
3  [True, True, True, False, False, True, 

## Feature Extraction: CountVectorizer over the Pre-Tokenized (SpaCy) Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# `tokens_train` and `tokens_val` are flat lists in which every sample is ONE token string.
# The tokenizer therefore has to wrap the token in a list: with an identity
# tokenizer the string itself is returned and CountVectorizer iterates over it
# character by character, producing per-character counts instead of token features.
# token_pattern=None silences the "token_pattern will not be used" warning that is
# emitted when a custom tokenizer is supplied.
vectorizer = CountVectorizer(
    tokenizer=lambda token: [token],
    token_pattern=None,
    lowercase=False,
    stop_words=None,
)
X_train = vectorizer.fit_transform(tokens_train)
X_val = vectorizer.transform(tokens_val)



## Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import fbeta_score

# Encode string labels as integer class ids.
# The encoder is fitted on the label set of BOTH splits: several classes are very
# rare, and a class that lands only in the validation split would otherwise make
# `transform` raise "y contains previously unseen labels". Ids follow the sorted
# label order, so they are unchanged whenever training already covers every class.
label_encoder = LabelEncoder()
label_encoder.fit(sorted(set(labels_train) | set(labels_val)))
y_train = label_encoder.transform(labels_train)
y_val = label_encoder.transform(labels_val)

# Train a Logistic Regression model (seeded; generous iteration budget for convergence)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

## Evaluation

In [None]:
# Collect the encoded class ids that actually occur in each split
unique_train_labels, unique_val_labels = set(y_train), set(y_val)

print("Unique labels in training set:", unique_train_labels)
print("Unique labels in validation set:", unique_val_labels)

# Report the size and the contents of the label vocabulary known to the encoder
label_names = label_encoder.classes_
print("Number of labels:", len(label_names))
print("Label names:", label_names)

Unique labels in training set: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
Unique labels in validation set: {0, 1, 2, 3, 5, 6, 8, 9, 10, 12}
Number of labels: 13
Label names: ['B-EMAIL' 'B-ID_NUM' 'B-NAME_STUDENT' 'B-PHONE_NUM' 'B-STREET_ADDRESS'
 'B-URL_PERSONAL' 'B-USERNAME' 'I-ID_NUM' 'I-NAME_STUDENT' 'I-PHONE_NUM'
 'I-STREET_ADDRESS' 'I-URL_PERSONAL' 'O']


In [None]:
from sklearn.metrics import classification_report

# Predict on validation set
y_pred = model.predict(X_val)

# All possible label IDs and their names. `classes_` is ordered by encoded id,
# so no per-id inverse_transform call is needed.
all_label_ids = list(range(len(label_encoder.classes_)))
target_names = list(label_encoder.classes_)

# Report every class, including those absent from the validation split.
# zero_division=0 keeps the reported 0.00 values but suppresses the repeated
# undefined-metric warnings for classes with no support / no predictions.
print(classification_report(
    y_val, y_pred, labels=all_label_ids, target_names=target_names, zero_division=0
))

# Micro-averaged F-beta over ALL classes is identical to accuracy (beta has no
# effect) and is dominated by 'O'. Restrict the score to the PII classes so that
# beta=5 actually weights recall on the labels of interest.
pii_label_ids = [
    label_id for label_id, name in zip(all_label_ids, target_names) if name != 'O'
]
fbeta = fbeta_score(
    y_val, y_pred, beta=5, average='micro', labels=pii_label_ids, zero_division=0
)
print(f"F-beta score with beta=5: {fbeta}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                  precision    recall  f1-score   support

         B-EMAIL       0.80      1.00      0.89         8
        B-ID_NUM       0.94      0.88      0.91        17
  B-NAME_STUDENT       0.00      0.00      0.00       267
     B-PHONE_NUM       0.00      0.00      0.00         1
B-STREET_ADDRESS       0.00      0.00      0.00         0
  B-URL_PERSONAL       0.59      0.56      0.57        18
      B-USERNAME       0.00      0.00      0.00         2
        I-ID_NUM       0.00      0.00      0.00         0
  I-NAME_STUDENT       0.00      0.00      0.00       218
     I-PHONE_NUM       0.00      0.00      0.00         2
I-STREET_ADDRESS       0.00      0.00      0.00         3
  I-URL_PERSONAL       0.00      0.00      0.00         0
               O       1.00      1.00      1.00    997971

       micro avg       1.00      1.00      1.00    998507
       macro avg       0.26      0.26      0.26    998507
    weighted avg       1.00      1.00      1.00    998507

F-beta scor

In [None]:
import numpy as np

# Positions at which the predicted class differs from the true class
errors = np.flatnonzero(y_val != y_pred)
print("Sample misclassifications:")
for idx in errors[:10]:  # limit the listing to the first 10 mistakes
    true_label = label_encoder.inverse_transform([y_val[idx]])
    pred_label = label_encoder.inverse_transform([y_pred[idx]])
    print(f"Token: {tokens_val[idx]}, True: {true_label}, Pred: {pred_label}")

Sample misclassifications:
Token: Narayn, True: ['I-NAME_STUDENT'], Pred: ['O']
Token: Sarah, True: ['B-NAME_STUDENT'], Pred: ['O']
Token: Willian, True: ['B-NAME_STUDENT'], Pred: ['O']
Token: Monica, True: ['B-NAME_STUDENT'], Pred: ['O']
Token: Ahmed, True: ['B-NAME_STUDENT'], Pred: ['O']
Token: https://www.changemakers.com/youthventure/resources/rootcause, True: ['O'], Pred: ['B-URL_PERSONAL']
Token: Luis, True: ['B-NAME_STUDENT'], Pred: ['O']
Token: IV-8322, True: ['B-ID_NUM'], Pred: ['O']
Token: Aakash, True: ['B-NAME_STUDENT'], Pred: ['O']
Token: Richter, True: ['I-NAME_STUDENT'], Pred: ['O']


### Evaluation Part 2
The previous evaluation is not very informative, because it includes the 'O' tokens, which make up almost all of the data and therefore dominate the averaged scores. Let's now check the evaluation considering only labels that are not 'O'.

In [None]:
from sklearn.metrics import accuracy_score

# Convert label IDs back to their string representations
labels_true = label_encoder.inverse_transform(y_val)
labels_pred = label_encoder.inverse_transform(y_pred)

# Select the tokens whose TRUE label is a PII class
non_o_mask = labels_true != 'O'
non_o_true = labels_true[non_o_mask]
non_o_pred = labels_pred[non_o_mask]

# Fraction of true PII tokens that received exactly the right label
non_o_accuracy = accuracy_score(non_o_true, non_o_pred)
print(f"Accuracy for non-'O' labels: {non_o_accuracy}")

# Per-class report restricted to the PII classes.
# It is computed on ALL validation tokens with `labels=` limited to non-'O'
# classes: filtering the rows by the true label would drop every false positive
# on a true-'O' token and overstate precision, and would also add a meaningless
# 'O' row with zero support. zero_division=0 suppresses undefined-metric warnings.
pii_label_names = [name for name in label_encoder.classes_ if name != 'O']
print(classification_report(
    labels_true, labels_pred, labels=pii_label_names, zero_division=0
))

Accuracy for non-'O' labels: 0.061567164179104475
                  precision    recall  f1-score   support

         B-EMAIL       1.00      1.00      1.00         8
        B-ID_NUM       0.94      0.88      0.91        17
  B-NAME_STUDENT       0.00      0.00      0.00       267
     B-PHONE_NUM       0.00      0.00      0.00         1
  B-URL_PERSONAL       1.00      0.56      0.71        18
      B-USERNAME       0.00      0.00      0.00         2
  I-NAME_STUDENT       0.00      0.00      0.00       218
     I-PHONE_NUM       0.00      0.00      0.00         2
I-STREET_ADDRESS       0.00      0.00      0.00         3
               O       0.00      0.00      0.00         0

        accuracy                           0.06       536
       macro avg       0.29      0.24      0.26       536
    weighted avg       0.08      0.06      0.07       536



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
