In [9]:
import pandas as pd
#pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer

import re
#from numpy import triu
#from scipy.linalg import triu
from gensim.models import Word2Vec
import openai
from sklearn.svm import SVC
from secret_key import openai_key
from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dsta0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
df_tweets = pd.read_csv('training.1600000.processed.noemoticon.csv', 
                    encoding='latin-1',  names=['target','ids','date','flag','user','text'])

In [12]:
df_tweets.sample(10)

Unnamed: 0,target,ids,date,flag,user,text
301719,0,1998757599,Mon Jun 01 19:04:34 PDT 2009,NO_QUERY,LavishAve,Just Got Home Bad Day Today
1003073,4,1880277171,Fri May 22 00:40:52 PDT 2009,NO_QUERY,dabe83,@djshelton @mikeyballardo goodnight silly boys.
1457522,4,2063589095,Sun Jun 07 03:16:33 PDT 2009,NO_QUERY,MartinGBEdwards,"@HoptonHouseBnB #elevensestime Oh yes, very a..."
1418599,4,2057818142,Sat Jun 06 13:43:08 PDT 2009,NO_QUERY,laurabethhk,around burlington until the 17th im with out ...
301608,0,1998726943,Mon Jun 01 19:01:35 PDT 2009,NO_QUERY,babygurltl,"Ok so, the paramedics just came to my house, o..."
736866,0,2265104342,Sun Jun 21 05:48:00 PDT 2009,NO_QUERY,dieguitoLAMB,thereÂ´s exactly a 500 gap from following 869 ...
889712,4,1687797936,Sun May 03 09:36:52 PDT 2009,NO_QUERY,LStathis,@BrittanyRS 4days till margarita madness at th...
507354,0,2188879563,Mon Jun 15 22:45:26 PDT 2009,NO_QUERY,_kryshelle,my gastric is hurting like mad even though i j...
370147,0,2050063366,Fri Jun 05 18:00:07 PDT 2009,NO_QUERY,cwiiis,"oh man, my PS3 crashed while I was doing parti..."
308765,0,2000762134,Mon Jun 01 22:53:56 PDT 2009,NO_QUERY,BeccaB67,@DougEWhite Awwwwwww I'm sorry. Now I feel b...


In [13]:
df_tweets = df_tweets[['target', 'text']]

# Preprocessing

In [17]:
# Text preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove URLs, mentions, and special chars
    text = re.sub(r'http\S+|@\w+|[^a-zA-Z\s]', '', text)
    # Tokenize and remove stopwords/stem
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df_tweets['cleaned_text'] = df_tweets['text'].apply(preprocess)

In [22]:
df_tweets.to_csv('df_tweets_cleaned.csv', encoding='utf-8', index=False)

# Feature representation

In [2]:
df_tweets_cleaned = pd.read_csv("df_tweets_cleaned.csv")

In [3]:
df_tweets_cleaned = df_tweets_cleaned[['target', 'cleaned_text']]

df_tweets_cleaned.sample(5)

Unnamed: 0,target,cleaned_text
1504419,4,kathi sympath empath situat price pay quotcivi...
135381,0,found today word ranger anoth term person redo...
878724,4,chillin jay zed
1239032,4,love sunni gud food xelent cmpani gr forget li...
405027,0,nap bad migrain


In [4]:
df_tweets_cleaned['cleaned_text'] = df_tweets_cleaned['cleaned_text'].fillna('tree').astype(str)

In [5]:
X = df_tweets_cleaned['cleaned_text']
y = df_tweets_cleaned['target']

X_train_tf_idf, X_test_tf_idf, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

X_train_tf_idf.shape, X_test_tf_idf.shape, y_train.shape, y_test.shape

((1120000,), (480000,), (1120000,), (480000,))

In [None]:
#X_train = X_train.fillna("")
#X_test = X_test.fillna("")

## TF-IDF

In [30]:
featurizer = TfidfVectorizer()

X_train_tf_idf = featurizer.fit_transform(X_train_tf_idf)
X_test_tf_idf  = featurizer.transform(X_test_tf_idf)

In [31]:
df_tweets_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   target        1600000 non-null  int64 
 1   cleaned_text  1600000 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


## Word2Vec

In [36]:
# Tokenize for Word2Vec
sentences = [text.split() for text in df_tweets_cleaned['cleaned_text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Document embedding (average word vectors)
def document_vector(text):
    words = text.split()
    #return np.mean([w2v_model.wv[word] for word in words if word in w2v_model.wv], axis=0)
    word_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    
    if len(word_vecs) == 0:
        return np.zeros(w2v_model.vector_size)
    
    return np.mean(word_vecs, axis=0)

X_w2v = np.array([document_vector(text) for text in df_tweets_cleaned['cleaned_text']])

In [38]:
X_w2v

array([[ 0.29605904,  1.3709836 , -0.44887289, ..., -0.69944423,
        -0.06612733,  0.0147397 ],
       [-0.01756582,  0.29085168, -0.15293385, ..., -0.74397951,
         0.35941991,  0.20375983],
       [-0.195329  ,  0.65271652, -0.3514432 , ..., -0.73574954,
         0.28448611, -0.02574982],
       ...,
       [-0.68672436,  0.09740587, -0.36710018, ..., -0.48947191,
         0.2504757 ,  0.58563453],
       [-0.84369695,  0.37374285, -0.23528989, ..., -0.53407794,
        -0.35350621,  0.03815288],
       [-1.03638959, -1.27094924,  0.52229124, ..., -0.57438082,
        -0.85335666, -0.0181255 ]])

In [39]:
pd.Series(df_tweets_cleaned['cleaned_text']).isna().sum()

0

In [40]:
X_train_w2v, X_test_w2v, y_train, y_test = train_test_split(X_w2v, y, test_size=0.3, random_state=123)

X_train_w2v.shape, X_test_w2v.shape, y_train.shape, y_test.shape

((1120000, 100), (480000, 100), (1120000,), (480000,))

## GPT

In [6]:
openai.api_key = openai_key

In [7]:
# Use OpenAI's API (requires API key)
def get_gpt_embedding(text):
    response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
    return np.array(response['data'][0]['embedding'])

X_gpt = np.array([get_gpt_embedding(text) for text in df_tweets_cleaned['cleaned_text']])

RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.

In [8]:
!pip install transformers sentence-transformers

Collecting transformers
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence-transformers)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading transformers-4.51.2-py3-none-any.whl (10.4 MB)
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
   -- -------------------------

In [10]:
# Load a lightweight BERT-based model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and great for embeddings

# Ensure cleaned text is a list of strings
texts = df_tweets_cleaned['cleaned_text'].fillna('').astype(str).tolist()

# Generate embeddings
X_bert = model.encode(texts, show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/50000 [00:00<?, ?it/s]

In [11]:
X_bert

array([[ 0.00735882,  0.02461563, -0.0035693 , ..., -0.07428308,
        -0.05914465,  0.06149549],
       [-0.04193585,  0.05599326,  0.0443898 , ...,  0.00782006,
        -0.02131594,  0.02251407],
       [ 0.0264208 ,  0.12049867, -0.01940557, ..., -0.039838  ,
        -0.05383792, -0.03529075],
       ...,
       [-0.04804717,  0.09059407, -0.12849087, ...,  0.02023201,
        -0.00576324, -0.05803538],
       [ 0.03621277,  0.05019556, -0.0422086 , ...,  0.05115667,
         0.02680762, -0.05991103],
       [-0.02255192,  0.136173  , -0.03003125, ..., -0.01166313,
         0.00235243,  0.02435153]], dtype=float32)

In [12]:
X_train_bert, X_test_bert, y_train, y_test = train_test_split(X_bert, y, test_size=0.3, random_state=123)

X_train_bert.shape, X_test_bert.shape, y_train.shape, y_test.shape

((1120000, 384), (480000, 384), (1120000,), (480000,))

# Classification

In [13]:
# Initialize SVM
svm = SVC(kernel='rbf')

# Train/test for each feature:
# TF-IDF
'''svm.fit(X_train, y_train)
y_pred_tfidf = svm.predict(X_test)
print("TF-IDF Accuracy:", accuracy_score(y_test, y_pred_tfidf))

# Word2Vec
svm.fit(X_train_w2v, y_train)
y_pred_w2v = svm.predict(X_test_w2v)
print("Word2Vec Accuracy:", accuracy_score(y_test, y_pred_w2v))'''

# bert embedding
svm.fit(X_train_bert, y_train)
y_pred_gpt = svm.predict(X_test_bert)
print("GPT Accuracy:", accuracy_score(y_test, y_pred_gpt))

: 

: 