In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pickle



In [None]:
# Load the preprocessed corpus produced by an earlier step.
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
df = pd.read_pickle('preprocessed_data.pkl')

# Features are the cleaned documents; targets are their labels.
X, y = df['cleaned_text'].values, df['label'].values

# Hold out 20% of the rows for testing. The split is seeded so it can be
# reproduced, and stratified on y so both parts keep the label proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Turn the raw text into TF-IDF feature matrices and inspect the result.
vectorizer = TfidfVectorizer(
    max_features=5000,    # cap the vocabulary at 5000 terms
    min_df=5,             # ignore terms found in fewer than 5 documents
    max_df=0.7,           # ignore terms found in more than 70% of documents
    ngram_range=(1, 1)    # single words only (unigrams)
)

# Learn the vocabulary and IDF weights from the training split only, so no
# information from the test split leaks into the features. fit_transform
# produces the same result as fit() followed by transform(), but processes
# the training documents once instead of twice.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fraction of entries in the training matrix that are zero
# (nnz is the number of stored non-zero values).
n_train_docs, n_features = X_train_tfidf.shape
sparsity = 1.0 - (X_train_tfidf.nnz / (n_train_docs * n_features))

# Show the first 20 (term -> column index) pairs of the learned vocabulary.
vocabulary = vectorizer.vocabulary_
sample_words = list(vocabulary.items())[:20]
for word, idx in sample_words:
    print(f" '{word}' → index {idx}")


# Terms ordered by column index, i.e. feature_names[i] labels column i.
feature_names = vectorizer.get_feature_names_out()

# Average TF-IDF weight of every term across the training documents,
# flattened to a 1-D array with one entry per feature column.
mean_tfidf = np.asarray(X_train_tfidf.mean(axis=0)).ravel()
# Column indices of the 20 terms with the highest average weight, best first.
top_indices = mean_tfidf.argsort()[::-1][:20]

 'response' → index 3774
 'lot' → index 2634
 'email' → index 1450
 'ive' → index 2367
 'gotten' → index 1929
 'need' → index 2953
 'position' → index 3355
 'favor' → index 1670
 'easter' → index 1402
 'aspect' → index 287
 'presently' → index 3404
 'egg' → index 1430
 'way' → index 4855
 'ending' → index 1474
 'fast' → index 1663
 'point' → index 3330
 'distinguish' → index 1321
 'intentionally' → index 2301
 'pagan' → index 3160
 'deity' → index 1180

 Computing average TF-IDF scores...


In [7]:
# Print each top-ranked term next to its average TF-IDF weight, in the
# order given by top_indices.
top_words = feature_names[top_indices]
top_scores = mean_tfidf[top_indices]
for word, score in zip(top_words, top_scores):
    print(f"        {word:20s} : {score:.6f}")


        would                : 0.022839
        one                  : 0.021420
        know                 : 0.017333
        like                 : 0.016925
        get                  : 0.016751
        dont                 : 0.016497
        think                : 0.014360
        people               : 0.014208
        time                 : 0.012883
        thanks               : 0.012580
        use                  : 0.012555
        anyone               : 0.012526
        also                 : 0.012330
        could                : 0.012222
        good                 : 0.012151
        problem              : 0.011449
        window               : 0.011373
        year                 : 0.011308
        please               : 0.011295
        new                  : 0.010970


In [10]:
# Persist the fitted vectorizer so the same vocabulary and weights can be
# applied again later.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save the feature matrices as dense .npy arrays.
# NOTE(review): toarray() materialises the full dense matrix, which can be
# large for a sparse TF-IDF matrix — confirm this fits in memory.
for npy_path, tfidf_matrix in (
    ('X_train_tfidf.npy', X_train_tfidf),
    ('X_test_tfidf.npy', X_test_tfidf),
):
    np.save(npy_path, tfidf_matrix.toarray())

# Save the label arrays alongside the features.
for npy_path, labels in (
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(npy_path, labels)

# Bundle the (still sparse) matrices, labels and feature names in one pickle.
train_test_bundle = {
    'X_train': X_train_tfidf,
    'X_test': X_test_tfidf,
    'y_train': y_train,
    'y_test': y_test,
    'feature_names': feature_names
}
with open('train_test_data.pkl', 'wb') as f:
    pickle.dump(train_test_bundle, f)