<a href="https://colab.research.google.com/github/sophia-moore/232-Final-Project/blob/main/nlp_processing_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Data Loading and Exploration

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Read the pickle file
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at a file obtained from a trusted source.
# NOTE(review): the path below is absolute and machine-specific; the notebook
# cannot be re-run on another machine without editing it.
file_path = r"C:\Users\gpapa\OneDrive\My Life\Education\2025 YALE MMS\Term_4S\T4_E_Adv_Lin_Algebra_p2\final_project\DJN_2017-01.pkl"
with open(file_path, 'rb') as f:
    data = pickle.load(f)

  data = pickle.load(f)


In [None]:
# Display basic information about the dataset
print(type(data))
if isinstance(data, pd.DataFrame):
    print(data.head())
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line afterwards.
    data.info()

<class 'pandas.core.frame.DataFrame'>
       Date   Company                                              Title  \
0  20170101  [ATC.AE]  [press, release, murphy, owner, kxly, abc, cha...   
1  20170101    [ARKR]  [ark, restaurant, apos, ceo, weinstein, result...   
2  20170101       [F]  [like, futurist, be, prepare, totally, unexpec...   
3  20170101    [NCOM]  [press, release, national, commerce, corporati...   
4  20170101     [EDE]  [press, release, district, electric, company, ...   

                                             Article  Compound_Return  \
0  [remove, it, programming, wire, despite, willi...              NaN   
1                             [from, seek, earnings]         0.070817   
2  [by, in, resident, futurist, lead, team, imagi...         0.076860   
3  [part, family, globe, ncc, parent, company, he...        -0.020161   
4  [wire, closing, today, merger, company, subsid...              NaN   

   Day1_Return  
0          NaN  
1     0.008468  
2     0.046068 

In [None]:
# Mean number of tokens per title: every 'Title' entry is a list, so its
# len() is the title's token count.
avg_title_length_list = data['Title'].map(len).mean()
print(f"Average length of titles: {avg_title_length_list:.2f} words")


Average length of titles: 9.08 words


In [None]:
# Mean number of tokens per article: every 'Article' entry is a list, so its
# len() is the article's token count.
avg_article_length_list = data['Article'].map(len).mean()
print(f"Average length of Articles: {avg_article_length_list:.2f} words")

Average length of Articles: 210.09 words


In [None]:
# Number of articles per date, ordered chronologically by the date value.
# Built as one chained expression (no inplace mutation) so re-running the cell
# always rebuilds the Series from `data`.
unique_dates = data['Date'].value_counts().sort_index(ascending=True)


In [None]:
unique_dates
# The dates run from 2017-01-01 through 2017-01-31, i.e. the dataset covers ONE MONTH.

Date
20170101       6
20170102      47
20170103    1340
20170104    2023
20170105    1795
20170106     988
20170107      29
20170108      42
20170109    1136
20170110    1260
20170111    1102
20170112    1185
20170113     790
20170114      20
20170115      35
20170116     348
20170117    1376
20170118    1402
20170119    1510
20170120     832
20170121      14
20170122      43
20170123    1124
20170124    1424
20170125    1562
20170126    1744
20170127    1058
20170128      21
20170129      40
20170130    1367
20170131    1750
Name: count, dtype: int64

#### Bag of Words (BoW) representation

In [None]:
# Each article is stored as a list of tokens; CountVectorizer wants one string
# per document, so glue the tokens back together with single spaces.
data['Article_text'] = data['Article'].map(' '.join)

# Step 1: Create document-term matrix
print("Creating document-term matrix...")
# min_df=2   -> drop terms that occur in fewer than 2 documents
# max_df=0.95 -> drop terms that occur in more than 95% of the documents
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
article_texts = data['Article_text']
doc_term_matrix = vectorizer.fit_transform(article_texts)


Creating document-term matrix...


In [None]:
# Report the (n_docs, n_unique_terms) shape of the sparse count matrix and
# remind the reader how to interpret its entries.
print("Document-term matrix created. Shape:", doc_term_matrix.shape)
print("Output – a SciPy sparse matrix shape = (n_docs, n_unique_terms) whose (i, j) entry is the raw count of term j in document i.")

Document-term matrix created. Shape: (27413, 26617)
Output – a SciPy sparse matrix shape = (n_docs, n_unique_terms) whose (i, j) entry is the raw count of term j in document i.


In [None]:
# Densify only the top-left 10x10 corner for display; the full matrix stays sparse.
print("First 10 rows and 10 columns of the document-term matrix:")
print(doc_term_matrix[:10, :10].toarray())

# Show some basic statistics
print("\nMatrix shape:", doc_term_matrix.shape)
# .nnz is the number of explicitly stored entries in the sparse matrix
print("Number of non-zero elements:", doc_term_matrix.nnz)
# Sparsity = 1 - (stored entries / total cells), printed as a percentage
print("Sparsity: {:.2%}".format(1 - doc_term_matrix.nnz / (doc_term_matrix.shape[0] * doc_term_matrix.shape[1])))

First 10 rows and 10 columns of the document-term matrix:
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]

Matrix shape: (27413, 26617)
Number of non-zero elements: 2710110
Sparsity: 99.63%


In [None]:
# Convert sparse matrix to DataFrame
# Keep the counts sparse-backed: calling .toarray() on the whole matrix
# allocates n_docs * n_terms cells at once, which for a matrix that is >99%
# zeros means several GB of RAM spent on zeros.
df_bag_of_words = pd.DataFrame.sparse.from_spmatrix(
    doc_term_matrix,
    columns=vectorizer.get_feature_names_out()
)


def _write_doc_term_csv(matrix, columns, path, chunk_rows=1000):
    """Write a sparse document-term matrix to CSV in dense row chunks.

    Produces the same layout as ``pd.DataFrame(matrix.toarray(),
    columns=columns).to_csv(path)`` (row index in the first column, header on
    the first line) while only ever densifying ``chunk_rows`` rows at a time.

    Parameters
    ----------
    matrix : SciPy sparse matrix supporting row slicing and ``.toarray()``.
    columns : sequence of column labels, one per matrix column.
    path : destination CSV path; an existing file is overwritten.
    chunk_rows : number of rows densified and written per step.
    """
    n_rows = matrix.shape[0]
    # max(n_rows, 1) guarantees one pass so an empty matrix still yields a
    # header-only file, as a plain to_csv call would.
    for start in range(0, max(n_rows, 1), chunk_rows):
        stop = min(start + chunk_rows, n_rows)
        chunk = pd.DataFrame(
            matrix[start:stop].toarray(),
            index=range(start, stop),   # keep the global row numbers in the index column
            columns=columns,
        )
        first = start == 0
        # First chunk creates/overwrites the file and writes the header;
        # later chunks append rows only.
        chunk.to_csv(path, mode='w' if first else 'a', header=first)


# Save to CSV
_write_doc_term_csv(
    doc_term_matrix,
    vectorizer.get_feature_names_out(),
    'bag_of_words_doc_termcount_matrix.csv',
)
print("Document-term matrix saved to bag_of_words_doc_termcount_matrix.csv")

Document-term matrix saved to bag_of_words_doc_termcount_matrix.csv


#### Simple Average GloVe representation
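A bag-of-words matrix captures term frequency but not semantics: “car” and “automobile” occupy separate columns, so their term vectors are orthogonal even though the words are synonyms. Word-vector models such as GloVe instead map each word to a dense real-valued vector in which geometric proximity encodes similarity of meaning; averaging the vectors of an article's words then gives a fixed-length embedding of the article.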

In [None]:
from collections import Counter, defaultdict

In [None]:

# --- 1. Document-Frequency counts -------------------------------------------
N = len(data['Article'])                                # number of articles
# set(tokens) collapses repeats inside one article, so each article adds at
# most 1 to a token's count -> the Counter holds document frequencies.
doc_freq_counts = Counter(
    tok for tokens in data['Article'] for tok in set(tokens)
)

# --- 2. Build the keep-set ---------------------------------------------------
min_df = 2                                              # lower bound: token must appear in at least 2 docs
max_df = 0.95 * N                                       # upper bound: token may appear in at most 95 % of docs
tokens_to_keep = {
    tok
    for tok, n_docs_with_tok in doc_freq_counts.items()
    if not (n_docs_with_tok < min_df or n_docs_with_tok > max_df)
}

# --- 3. Filter each article --------------------------------------------------
def filter_tokens(tokens, keep_set=tokens_to_keep):
    """Return the tokens that are members of keep_set, keeping order and repeats.

    keep_set defaults to the tokens_to_keep set that exists when this function
    is defined.
    """
    kept = []
    for t in tokens:
        if t in keep_set:
            kept.append(t)
    return kept

data['Article_winsorized'] = data['Article'].map(filter_tokens)

In [None]:
# helper function to load GloVe embeddings
def load_glove(path):
    """Load word vectors from a GloVe-style text file.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace.

    Parameters
    ----------
    path : path of the UTF-8 encoded text file.

    Returns
    -------
    dict mapping each word (str) to a 1-D ``np.float32`` array.

    Raises
    ------
    ValueError if a component cannot be parsed as a float.
    """
    embeddings = {}
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.split()
            # A blank line (e.g. a trailing newline at the end of the file)
            # splits to an empty list; skip it instead of failing on parts[0].
            if not parts:
                continue
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings

In [None]:
# 1. Load the pre-trained GloVe text file into a dict {word: vector}
glove = load_glove(r"C:\Users\gpapa\OneDrive\My Life\Education\2025 YALE MMS\Term_4S\T4_E_Adv_Lin_Algebra_p2\final_project\glove.6B.100d.txt")
embedding_dim = 100
UNK = np.zeros(embedding_dim)            # all-zero stand-in for out-of-vocabulary words

def embed_article_avg(tokens):
    """Return the unweighted mean of the GloVe vectors of `tokens`.

    Tokens missing from `glove` are represented by the zero vector UNK; they
    still count towards the mean's denominator, so they pull the result
    towards zero. An empty token list yields UNK itself.
    """
    vecs = [glove.get(tok, UNK) for tok in tokens]
    if vecs:
        return np.mean(vecs, axis=0)         # simple average; or try tf-idf weighting
    return UNK

# each Article_vec value is a semantic embedding of one article
data['Article_vec'] = data['Article_winsorized'].apply(embed_article_avg)
# stack the per-article vectors row-wise into one (n_docs, embedding_dim) array
glove_doc_embedding_simple_avg_matrix = np.vstack(data['Article_vec'].tolist())

print("GloVe document embedding matrix shape:", glove_doc_embedding_simple_avg_matrix.shape)
print("Output – a NumPy array shape = (n_docs, n_embedding_dim) whose (i,j) entry is the simple avg embedding of characteristic (j) in document (i).")

GloVe document embedding matrix shape: (27413, 100)
Output – a NumPy array shape = (n_docs, n_embedding_dim) whose (i,j) entry is the simple avg embedding of characteristic in the i-th document.


In [None]:
# Convert the simple-average GloVe embedding matrix to a DataFrame with one
# row per article and one column (dim_0 ... dim_{embedding_dim-1}) per
# embedding dimension.
df_glove_avg = pd.DataFrame(
    glove_doc_embedding_simple_avg_matrix,
    columns=[f'dim_{i}' for i in range(embedding_dim)]
)

# Save to CSV (written to the current working directory; overwrites an existing file)
df_glove_avg.to_csv('glove_doc_embedding_simple_avg_matrix.csv')
print("GloVe document embedding matrix saved to glove_doc_embedding_simple_avg_matrix.csv")

GloVe document embedding matrix saved to glove_doc_embedding_simple_avg_matrix.csv


In [None]:
# Preview only the top-left 5x10 corner of the simple-average embedding matrix.
print("First 5 rows and 10 columns of the glove_doc_embedding_simple_avg_matrix:")
display(glove_doc_embedding_simple_avg_matrix[:5, :10])

First 5 rows and 10 columns of the glove_doc_embedding_simple_avg_matrix:


array([[-0.06428995,  0.03066079,  0.1261007 , -0.1087409 ,  0.06986396,
        -0.2436581 , -0.14276559,  0.17012586, -0.0204917 ,  0.09696526],
       [ 0.23832965,  0.16189666,  0.35848665, -0.09775668,  0.31639433,
        -0.39629331, -0.3297292 , -0.15749334, -0.43084002, -0.07729667],
       [-0.07722157,  0.23889745,  0.24618903, -0.04055617, -0.10148753,
        -0.04471661, -0.12110174,  0.00650583,  0.02334156,  0.02951091],
       [-0.05719619,  0.00981926,  0.16390821, -0.01661636,  0.01605625,
        -0.17516939, -0.24395782,  0.00443692, -0.02972875,  0.01264667],
       [-0.02815588,  0.04617887,  0.13838558,  0.07190471,  0.09651767,
        -0.26731699, -0.14145903,  0.11651946, -0.139832  ,  0.03287467]])

#### Full TF-IDF–Weighted GloVe Implementation


In [None]:
# --- 5. Precompute IDF values -----------------------------------------------
# Smoothed inverse document frequency for every kept token:
#     idf(t) = log(1 + N / (1 + df_t))
# where df_t is the number of articles containing t and N the article count.
idf = {
    tok: np.log(1 + N / (1 + doc_freq_counts[tok]))
    for tok in tokens_to_keep
}

# --- 6. TF-IDF weighted document embedding ----------------------------------
def embed_article_tfidf(tokens):
    """Return the TF-IDF-weighted average of the GloVe vectors of `tokens`.

    For each distinct token t the weight is
        w_t = (count_t / len(tokens)) * idf[t]
    and the result is  sum_t(w_t * vec_t) / sum_t(w_t).

    Tokens missing from `glove` use the zero vector UNK (their weight still
    enters the denominator); tokens missing from `idf` get weight 0.
    Returns UNK when `tokens` is empty or every weight is 0.
    """
    if not tokens:
        return UNK

    tf_counts = Counter(tokens)
    total_tokens = len(tokens)

    weighted_sum = np.zeros_like(UNK)
    weight_total = 0.0

    # Iterate over DISTINCT tokens. The term frequency is already part of the
    # weight, so looping over every occurrence would count it twice and make a
    # token's influence grow with the square of its count.
    for tok, count in tf_counts.items():
        weight = (count / total_tokens) * idf.get(tok, 0.0)  # if not in IDF, weight is 0
        weighted_sum = weighted_sum + weight * glove.get(tok, UNK)
        weight_total += weight

    if weight_total == 0:
        return UNK

    return weighted_sum / weight_total  # normalize by the total weight

In [None]:
# --- 7. Apply to each article -----------------------------------------------
# One TF-IDF-weighted embedding vector per filtered article, stored as a column of arrays ...
data['Article_vec_tfidf'] = data['Article_winsorized'].apply(embed_article_tfidf)
# ... then stacked row-wise into a single 2-D array (one row per article).
glove_doc_embedding_tfidf_matrix = np.vstack(data['Article_vec_tfidf'].values)

# --- 8. Print output dimensions ---------------------------------------------
print("TF-IDF weighted GloVe embedding matrix shape:", glove_doc_embedding_tfidf_matrix.shape)
print("Output – a NumPy array shape = (n_docs, n_embedding_dim) whose (i,j) entry is the tfidf embedding of characteristic (j) in document (i)")

TF-IDF weighted GloVe embedding matrix shape: (27413, 100)
Output – a NumPy array shape = (n_docs, n_embedding_dim) whose (i,j) entry is the tfidf embedding of characteristic (j) in document (i)


In [None]:
# Convert the TF-IDF-weighted GloVe embedding matrix to a DataFrame with one
# row per article and one column (dim_0 ... dim_{embedding_dim-1}) per
# embedding dimension.
df_glove_tfidf = pd.DataFrame(
    glove_doc_embedding_tfidf_matrix,
    columns=[f'dim_{i}' for i in range(embedding_dim)]
)

# Save to CSV (written to the current working directory; overwrites an existing file)
df_glove_tfidf.to_csv('glove_doc_embedding_tfidf_matrix.csv')
print("GloVe document embedding matrix saved to glove_doc_embedding_tfidf_matrix.csv")

GloVe document embedding matrix saved to glove_doc_embedding_tfidf_matrix.csv


In [None]:
# Preview only the top-left 5x10 corner of the TF-IDF-weighted embedding matrix.
print("First 5 rows and 10 columns of the glove_doc_embedding_tfidf_matrix:")
display(glove_doc_embedding_tfidf_matrix[:5, :10])

First 5 rows and 10 columns of the glove_doc_embedding_tfidf_matrix:


array([[-0.01056913, -0.1645497 ,  0.08856896, -0.19300572,  0.11462332,
        -0.31052367, -0.06548257,  0.3515911 ,  0.02080597,  0.25400727],
       [ 0.25694836,  0.16810628,  0.35881328, -0.09819148,  0.34195528,
        -0.30055203, -0.34090286, -0.12114107, -0.3979063 , -0.04255791],
       [-0.02555832,  0.32649209,  0.19000133, -0.03878729, -0.09883531,
        -0.0310003 , -0.00478835, -0.19458799,  0.04868513,  0.1188786 ],
       [ 0.00699697, -0.13084813,  0.17774754, -0.08311283, -0.01034015,
        -0.20739683, -0.2062101 , -0.03980132, -0.04079613,  0.03588582],
       [ 0.00395626, -0.04470648,  0.11056613,  0.0294033 ,  0.16738058,
        -0.27139082, -0.09252032,  0.09953778, -0.19107341, -0.03167852]])