In [4]:
# Colab only: mount Google Drive at /content/drive so the CSV paths
# configured under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Parsed train/test CSVs (in the EDA_outputs folder on Drive) that the
# preprocessing cells load with pd.read_csv.
# NOTE(review): "uptaded" is spelled this way in the paths; presumably it matches
# the real file names on Drive, so do not "fix" it here without renaming the files.
UPDATED_TRAIN_CSV = '/content/drive/MyDrive/440_proj/EDA_outputs/uptaded_train_parsed.csv'
UPDATED_TEST_CSV = '/content/drive/MyDrive/440_proj/EDA_outputs/uptaded_test_parsed.csv'

In [None]:
# !pip install gensim

# Imports

In [1]:
import re
import os
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# NN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from tqdm import tqdm
import pickle
import kagglehub

## Download resources & Initialize

In [None]:
# Fetch the NLTK resources used by the preprocessing functions:
# the stopword list, the Punkt tokenizer data and WordNet (for lemmatization).
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("wordnet")

# Keep the stopwords in a set: preprocess_classical tests every token with
# `t not in stop_words`, which is a linear scan on a list but a constant-time
# hash lookup on a set.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# General Preprocessing (phase 1 and 2)

## Utility Functions



## Track A: Light Preprocessing (for embeddings: GloVe, Skip-gram)

In [None]:
def preprocess_light(text):
    """Lowercase `text` and collapse every whitespace run into one space.

    Non-string input (e.g. NaN from a missing cell) yields an empty string.
    Punctuation and digits are left untouched.
    """
    if isinstance(text, str):
        collapsed = re.sub(r"\s+", " ", text.lower())  # remove extra spaces/newlines
        return collapsed.strip()
    return ""

## Track B: Classical Preprocessing (for BoW, TF-IDF)


In [None]:
def preprocess_classical(text):
    """Return a space-joined string of lemmatized, stopword-free tokens.

    Steps: lowercase, delete every character that is not a-z or whitespace,
    tokenize with NLTK, drop tokens found in `stop_words`, lemmatize the rest.
    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())  # keep only letters
    lemmas = [
        lemmatizer.lemmatize(token)                 # lemmatization
        for token in nltk.word_tokenize(letters_only)
        if token not in stop_words                  # remove stopwords
    ]
    return " ".join(lemmas)

# Apply preprocessing to updated train/test sets


In [None]:
# Load the parsed splits, then add both cleaned-text columns to each frame in place.
updated_train_df = pd.read_csv(UPDATED_TRAIN_CSV)
updated_test_df = pd.read_csv(UPDATED_TEST_CSV)

for df in (updated_train_df, updated_test_df):
    raw_text = df["full_text"]
    df["clean_text_basic"] = raw_text.apply(preprocess_light)          # Track A
    df["clean_text_classical"] = raw_text.apply(preprocess_classical)  # Track B

# Quick peek
preview_columns = ["full_text", "clean_text_basic", "clean_text_classical"]
updated_train_df[preview_columns].head()


Unnamed: 0,full_text,clean_text_basic,clean_text_classical
0,i am good at web design amateurly. how can i e...,i am good at web design amateurly. how can i e...,good web design amateurly earn money job web d...
1,My daughter wants to be a Medical Illustrator....,my daughter wants to be a medical illustrator....,daughter want medical illustrator anyone know ...
2,Is brother Jesus our brother or our King? the...,is brother jesus our brother or our king? the ...,brother jesus brother king one id say he broth...
3,what is I-20? students need I 20 for being adm...,what is i-20? students need i 20 for being adm...,student need admitted university hell earth me...
4,what is a data disk? I heard that it is possib...,what is a data disk? i heard that it is possib...,data disk heard possible save file data disk e...


## Quick comparison of preprocessing tracks (not important)


In [None]:
# Print three reproducibly sampled rows with the raw text and both cleaned variants.
sample_rows = updated_train_df.sample(n=3, random_state=42)
separator = "=" * 80

for i, row in sample_rows.iterrows():
    print(
        separator,
        f"Original full_text:\n{row['full_text']}\n",
        f"Clean (Track A - light):\n{row['clean_text_basic']}\n",
        f"Clean (Track B - classical):\n{row['clean_text_classical']}\n",
        sep="\n",
    )


Original full_text:
I feel like there is always someone standing, watching me. how do I tap into my phychic ability to findout who I'm not paranoid or crazy.  I see things go by out of the corner of my eye and there's nothing there. Leave things where they are and belive in the good!!!

Clean (Track A - light):
i feel like there is always someone standing, watching me. how do i tap into my phychic ability to findout who i'm not paranoid or crazy. i see things go by out of the corner of my eye and there's nothing there. leave things where they are and belive in the good!!!

Clean (Track B - classical):
feel like always someone standing watching tap phychic ability findout im paranoid crazy see thing go corner eye there nothing leave thing belive good

Original full_text:
Could anyone pls invite me to gmail.com? forgot to get email-  email it to timpaulking@yahoo.com i invited u\naccept it

Clean (Track A - light):
could anyone pls invite me to gmail.com? forgot to get email- email it to

In [None]:
# Show the first rows with all columns, including the two new clean_text_* columns.
updated_train_df.head()

Unnamed: 0,QA Text,Class,question_title,question_content,best_answer,full_text,question_title_char_count,question_title_word_count,question_title_sent_count,question_content_char_count,...,question_content_lexical_diversity,best_answer_lexical_diversity,full_text_lexical_diversity,has_url,has_email,has_html,num_qmarks,num_exclaims,clean_text_basic,clean_text_classical
0,Question Title:\ni am good at web design amate...,Business & Finance,i am good at web design amateurly. how can i e...,web design,Hey! Re. Job. I tried to e-mail you using the...,i am good at web design amateurly. how can i e...,74,16,2,10,...,1.0,0.754386,0.773333,False,True,False,1,1,i am good at web design amateurly. how can i e...,good web design amateurly earn money job web d...
1,Question Title:\nMy daughter wants to be a Med...,Education & Reference,My daughter wants to be a Medical Illustrator....,What do they do for sure. I would like to be ...,Medical Illustrator\nMedical Illustrator. Medi...,My daughter wants to be a Medical Illustrator....,78,14,2,122,...,0.92,0.640244,0.660099,True,False,True,3,0,my daughter wants to be a medical illustrator....,daughter want medical illustrator anyone know ...
2,Question Title:\nIs brother Jesus our brother ...,Society & Culture,Is brother Jesus our brother or our King?,,"the all being one, i'd say he's our brother. y...",Is brother Jesus our brother or our King? the...,41,8,1,0,...,0.0,1.0,0.863636,False,False,False,1,0,is brother jesus our brother or our king? the ...,brother jesus brother king one id say he broth...
3,Question Title:\nwhat is I-20?\nQuestion Conte...,Education & Reference,what is I-20?,students need I 20 for being admitted in Unive...,an I 20 is a form that the university will giv...,what is I-20? students need I 20 for being adm...,13,3,1,94,...,0.842105,0.8,0.677419,False,False,False,2,0,what is i-20? students need i 20 for being adm...,student need admitted university hell earth me...
4,Question Title:\nwhat is a data disk?\nQuestio...,Computers & Internet,what is a data disk?,I heard that it is possible to save files on a...,"i think it's a usb flashdrive. a small, light...",what is a data disk? I heard that it is possib...,20,5,1,210,...,0.818182,0.837209,0.717391,False,False,False,2,0,what is a data disk? i heard that it is possib...,data disk heard possible save file data disk e...


# Uncomment the cell below to export the preprocessed CSVs

In [None]:
# Export both preprocessed frames as CSV (disabled; uncomment every line to run).
# NOTE(review): the tokenization cell reads these files from
# /content/drive/MyDrive/440_proj/Preprocess_outputs/ - presumably they are
# copied there by hand after export; confirm.
# OUT_DIR = "preprocess_outputs"
# os.makedirs(OUT_DIR, exist_ok=True)
# updated_train_df.to_csv(os.path.join(OUT_DIR, "uptaded_train_preprocessed.csv"), index=False)
# updated_test_df.to_csv(os.path.join(OUT_DIR, "uptaded_test_preprocessed.csv"), index=False)
# print("Saved preprocessed train to", os.path.join(OUT_DIR, "uptaded_train_preprocessed.csv"))
# print("Saved preprocessed test to", os.path.join(OUT_DIR, "uptaded_test_preprocessed.csv"))

Saved parsed train to preprocess_outputs/uptaded_train_parsed.csv
Saved parsed test to preprocess_outputs/uptaded_test_parsed.csv


In [None]:
# u = pd.read_csv(os.path.join(OUT_DIR, "uptaded_train_preprocessed.csv"))
# u.head()

Unnamed: 0,QA Text,Class,question_title,question_content,best_answer,full_text,question_title_char_count,question_title_word_count,question_title_sent_count,question_content_char_count,...,question_content_lexical_diversity,best_answer_lexical_diversity,full_text_lexical_diversity,has_url,has_email,has_html,num_qmarks,num_exclaims,clean_text_basic,clean_text_classical
0,Question Title:\ni am good at web design amate...,Business & Finance,i am good at web design amateurly. how can i e...,web design,Hey! Re. Job. I tried to e-mail you using the...,i am good at web design amateurly. how can i e...,74,16,2,10,...,1.0,0.754386,0.773333,False,True,False,1,1,i am good at web design amateurly. how can i e...,good web design amateurly earn money job web d...
1,Question Title:\nMy daughter wants to be a Med...,Education & Reference,My daughter wants to be a Medical Illustrator....,What do they do for sure. I would like to be ...,Medical Illustrator\nMedical Illustrator. Medi...,My daughter wants to be a Medical Illustrator....,78,14,2,122,...,0.92,0.640244,0.660099,True,False,True,3,0,my daughter wants to be a medical illustrator....,daughter want medical illustrator anyone know ...
2,Question Title:\nIs brother Jesus our brother ...,Society & Culture,Is brother Jesus our brother or our King?,,"the all being one, i'd say he's our brother. y...",Is brother Jesus our brother or our King? the...,41,8,1,0,...,0.0,1.0,0.863636,False,False,False,1,0,is brother jesus our brother or our king? the ...,brother jesus brother king one id say he broth...
3,Question Title:\nwhat is I-20?\nQuestion Conte...,Education & Reference,what is I-20?,students need I 20 for being admitted in Unive...,an I 20 is a form that the university will giv...,what is I-20? students need I 20 for being adm...,13,3,1,94,...,0.842105,0.8,0.677419,False,False,False,2,0,what is i-20? students need i 20 for being adm...,student need admitted university hell earth me...
4,Question Title:\nwhat is a data disk?\nQuestio...,Computers & Internet,what is a data disk?,I heard that it is possible to save files on a...,"i think it's a usb flashdrive. a small, light...",what is a data disk? I heard that it is possib...,20,5,1,210,...,0.818182,0.837209,0.717391,False,False,False,2,0,what is a data disk? i heard that it is possib...,data disk heard possible save file data disk e...


# Preprocessing for NN (Phase 2)

## Config

In [7]:
# Shared settings for tokenization and the embedding matrices.
MAX_NUM_WORDS = None   # vocab cap; placeholder only - reassigned after the vocabulary size is measured
MAX_SEQUENCE_LENGTH = 200  # length passed to pad_sequences (maxlen)
EMBEDDING_DIM = 200     # using 200d for GloVe; also used as the Word2Vec vector_size

# Folder that receives the tokenizer pickle, embedding matrices and Word2Vec model.
os.makedirs("artifacts", exist_ok=True)

## Tokenization

In [4]:
# Preprocessed train/test CSVs on Drive; the cells below read their
# "clean_text_basic" column.
# NOTE(review): presumably these are the files written by the export cell - confirm.
PREPROCESSED_TRAIN_CSV = '/content/drive/MyDrive/440_proj/Preprocess_outputs/uptaded_train_preprocessed.csv'
PREPROCESSED_TEST_CSV = '/content/drive/MyDrive/440_proj/Preprocess_outputs/uptaded_test_preprocessed.csv'

preprocessed_train_df = pd.read_csv(PREPROCESSED_TRAIN_CSV)
preprocessed_test_df = pd.read_csv(PREPROCESSED_TEST_CSV)

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit an uncapped tokenizer on clean_text_basic to measure the full vocabulary.
train_texts = preprocessed_train_df["clean_text_basic"].astype(str).tolist()
tokenizer = Tokenizer(num_words=None, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Inspect vocab size
vocab_size = len(tokenizer.word_index)
print(f"Full vocabulary size: {vocab_size:,}")

# Decide on MAX_NUM_WORDS: cap large vocabularies, otherwise keep every word.
if vocab_size > 100_000:
    MAX_NUM_WORDS = 100_000  # cap to keep things efficient
    print(f"Vocab capped at {MAX_NUM_WORDS} most frequent words.")
else:
    MAX_NUM_WORDS = vocab_size + 1  # +1 for OOV/padding


Full vocabulary size: 373,160
Vocab capped at 100000 most frequent words.


In [9]:
# Sanity check: show the vocabulary cap currently in effect.
print(MAX_NUM_WORDS)

100000


In [10]:
def prepare_tokenizer(texts, max_words=MAX_NUM_WORDS):
    """Fit a Keras ``Tokenizer`` on `texts` and return it.

    Args:
        texts: Iterable of strings to build the vocabulary from.
        max_words: Passed to ``Tokenizer(num_words=...)``. The default is the
            value ``MAX_NUM_WORDS`` had when this ``def`` was executed, not the
            value at call time.

    Returns:
        The fitted tokenizer, configured with ``"<OOV>"`` as its OOV token.
    """
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)
    return tokenizer


train_texts = preprocessed_train_df["clean_text_basic"].astype(str).tolist()
test_texts = preprocessed_test_df["clean_text_basic"].astype(str).tolist()

# Pass the cap explicitly. The function's default was frozen when the `def`
# ran, so if this cell was defined before MAX_NUM_WORDS was decided (or cells
# are re-run out of order) the default would silently be stale or None.
tokenizer = prepare_tokenizer(train_texts, max_words=MAX_NUM_WORDS)

# convert texts to padded sequences
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=MAX_SEQUENCE_LENGTH)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=MAX_SEQUENCE_LENGTH)

# Save tokenizer
with open("artifacts/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Tokenizer + sequences ready")
print("Train shape:", X_train_seq.shape, " Test shape:", X_test_seq.shape)

KeyboardInterrupt: 

## Load GloVe embeddings

In [None]:
# Colab only: open a file picker to upload a file into the working directory.
# WARNING: files.upload() returns the uploaded bytes and, as the last expression
# of the cell, that return value is displayed. When the file is a credential
# (e.g. kaggle.json) the secret ends up in the saved notebook output - clear
# this cell's output before sharing or committing the notebook.
from google.colab import files
files.upload()

Saving kaggle (1).json to kaggle (1).json


{'kaggle (1).json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Put the Kaggle API token in ~/.kaggle and make it readable by the owner only.
# NOTE(review): this copies "kaggle.json", but the recorded upload output shows
# the file saved as "kaggle (1).json" - confirm the intended file is the one copied.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
def load_glove_embeddings(glove_path, word_index, embedding_dim=EMBEDDING_DIM, max_words=MAX_NUM_WORDS):
    """Build an embedding matrix for `word_index` from a GloVe text file.

    Args:
        glove_path: Path to a UTF-8 GloVe file with one whitespace-separated
            ``<token> <v1> ... <vN>`` entry per line.
        word_index: Mapping from word to integer row index
            (e.g. ``tokenizer.word_index``).
        embedding_dim: Number of components per vector; must match the file.
        max_words: Number of rows in the matrix. Words whose index is
            ``>= max_words`` are ignored. ``None`` sizes the matrix so that
            every index in `word_index` fits.

    Returns:
        Array of shape ``(max_words, embedding_dim)``. Rows whose word has a
        GloVe vector hold that vector; all other rows keep their random
        normal initialisation.

    Raises:
        ValueError: If a non-blank line does not carry exactly
            `embedding_dim` values.
    """
    if max_words is None:
        # No cap configured: make room for the largest index that occurs.
        max_words = max(word_index.values(), default=0) + 1

    embeddings_index = {}
    with open(glove_path, encoding="utf8") as f:
        for line_no, line in enumerate(tqdm(f, desc="Loading GloVe"), start=1):
            values = line.split()
            if not values:
                continue  # tolerate blank / trailing lines
            if len(values) - 1 != embedding_dim:
                # Fail here with a clear message instead of a broadcast error
                # much later when the vector is written into the matrix.
                raise ValueError(
                    f"{glove_path}, line {line_no}: expected {embedding_dim} "
                    f"vector components, found {len(values) - 1}"
                )
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs

    print(f"Found {len(embeddings_index)} word vectors in GloVe.")

    # build embedding matrix
    embedding_matrix = np.random.normal(size=(max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            vec = embeddings_index.get(word)
            if vec is not None:
                embedding_matrix[i] = vec
    return embedding_matrix


# Download (or reuse the cached copy of) the 200d GloVe vectors via kagglehub.
glove_download = kagglehub.dataset_download("incorpes/glove6b200d")
glove_path = os.path.join(glove_download, "glove.6B.200d.txt")
print("GloVe file downloaded to:", glove_path)
# glove_path = "../artifacts/glove.6B.200d.txt"

# Pass the current config explicitly: the function's defaults were frozen when
# its `def` ran and can be stale if cells were executed out of order.
glove_embedding_matrix = load_glove_embeddings(
    glove_path,
    tokenizer.word_index,
    embedding_dim=EMBEDDING_DIM,
    max_words=MAX_NUM_WORDS,
)

# Save
np.save("artifacts/glove_embedding_matrix.npy", glove_embedding_matrix)

Using Colab cache for faster access to the 'glove6b200d' dataset.
GloVe file downloaded to: /kaggle/input/glove6b200d/glove.6B.200d.txt


Loading GloVe: 400000it [00:27, 14364.96it/s]


Found 400000 word vectors in GloVe.


## Train Skip-gram (Word2Vec)

In [None]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Tokenize with the same rules as the fitted Keras tokenizer (its filters,
# lowercasing and split character). A plain str.split() leaves punctuation
# attached ("design.", "good!!!"), so those tokens never match the
# punctuation-free keys of tokenizer.word_index looked up below.
sentences = [
    text_to_word_sequence(
        text,
        filters=tokenizer.filters,
        lower=tokenizer.lower,
        split=tokenizer.split,
    )
    for text in train_texts
]

# Skip-gram (sg=1) Word2Vec trained on the training texts only.
w2v_model = Word2Vec(
    sentences,
    vector_size=EMBEDDING_DIM,
    window=5,
    sg=1,
    min_count=2,
    workers=4
)

# Rows for words without a trained vector keep their random normal initialisation.
w2v_embedding_matrix = np.random.normal(size=(MAX_NUM_WORDS, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i < MAX_NUM_WORDS:
        if word in w2v_model.wv:
            w2v_embedding_matrix[i] = w2v_model.wv[word]

# Save
w2v_model.save("artifacts/skipgram_word2vec.model")
np.save("artifacts/skipgram_embedding_matrix.npy", w2v_embedding_matrix)

print("Embedding matrices ready (GloVe + Skip-gram)")


Embedding matrices ready (GloVe + Skip-gram)
