In [1]:

# Step 1: Import Libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import re

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\magic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\magic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:


# Step 2: Data Preparation
# Example data
train=pd.read_csv("data/dataset.csv",encoding='latin1')
train['text'] = train.OriginalTweet
train["text"] = train["text"].astype(str)
def classes_def(x):
    if x ==  "Extremely Positive":
        return "2"
    elif x == "Extremely Negative":
        return "0"
    elif x == "Negative":
        return "0"
    elif x ==  "Positive":
        return "2"
    else:
        return "1"


train['label']=train['Sentiment'].apply(lambda x:classes_def(x))
#Remove Urls and HTML links
def remove_urls(text):
    url_remove = re.compile(r'https?://\S+|www\.\S+')
    return url_remove.sub(r'', text)
train['text_new']=train['text'].apply(lambda x:remove_urls(x))

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
train['text']=train['text_new'].apply(lambda x:remove_html(x))

# Lower casing
def lower(text):
    low_text= text.lower()
    return low_text
train['text_new']=train['text'].apply(lambda x:lower(x))


# Number removal
def remove_num(text):
    remove= re.sub(r'\d+', '', text)
    return remove
train['text']=train['text_new'].apply(lambda x:remove_num(x))

#Remove stopwords & Punctuations
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))

def punct_remove(text):
    punct = re.sub(r"[^\w\s\d]","", text)
    return punct
train['text_new']=train['text'].apply(lambda x:punct_remove(x))


def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
train['text']=train['text_new'].apply(lambda x:remove_stopwords(x))

#Remove mentions and hashtags
def remove_mention(x):
    text=re.sub(r'@\w+','',x)
    return text
train['text_new']=train['text'].apply(lambda x:remove_mention(x))

def remove_hash(x):
    text=re.sub(r'#\w+','',x)
    return text
train['text']=train['text_new'].apply(lambda x:remove_hash(x))

#Remove extra white space left while removing stuff
def remove_space(text):
    space_remove = re.sub(r"\s+"," ",text).strip()
    return space_remove
train['text_new']=train['text'].apply(lambda x:remove_space(x))
train = train.drop(columns=['text_new'])

stop_words = ['a', 'an', 'the']

# Basic cleansing
def cleansing(text):
    # Tokenize
    tokens = text.split(' ')
    # Lower case
    tokens = [w.lower() for w in tokens]
    # Remove stop words
    tokens = [w for w in tokens if w not in stop_words]
    return ' '.join(tokens)

# All-in-one preproce
def preprocess_x(x):
    processed_x = [cleansing(text) for text in x]

    return processed_x

train['text_new']=train['text'].apply(lambda x:preprocess_x(x))
X = train["text"].tolist()
y = train["label"].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state = 177)


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Step 3: Feature Extraction

# 3.1: Bag of Words
count_vectorizer = CountVectorizer()
X_train_bow = count_vectorizer.fit_transform(X_train)
X_test_bow = count_vectorizer.transform(X_test)

# 3.2: TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# 3.3: Word2Vec
def get_word2vec_embeddings(texts, model, vector_size=100):
    embeddings = []
    for text in texts:
        tokens = word_tokenize(text)
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            embeddings.append(np.zeros(vector_size))
    return np.array(embeddings)

word2vec_model = Word2Vec(sentences=[word_tokenize(text) for text in X_train], vector_size=100, window=5, min_count=1, workers=8)
X_train_w2v = get_word2vec_embeddings(X_train, word2vec_model)
X_test_w2v = get_word2vec_embeddings(X_test, word2vec_model)

# 3.4: GloVe
def load_glove_embeddings(glove_file):
    glove_model = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_model[word] = vector
    return glove_model

def get_glove_embeddings(texts, glove_model, vector_size=300):
    embeddings = []
    for text in texts:
        tokens = word_tokenize(text)
        vectors = [glove_model[token] for token in tokens if token in glove_model]
        if vectors:
            # Compute the mean of all word vectors for the sentence
            embeddings.append(np.mean(vectors, axis=0))
        else:
            # In case no words in the text are found in the GloVe model, append a zero vector
            embeddings.append(np.zeros(vector_size))
    
    # Ensure all elements in the list have the same size
    embeddings = np.array(embeddings)
    
    return embeddings

glove_file = 'glove.6B.300d.txt'
glove_model = load_glove_embeddings(glove_file)
X_train_glove = get_glove_embeddings(X_train, glove_model)
X_test_glove = get_glove_embeddings(X_test, glove_model)

In [50]:
# len(X_train_w2v[0])
# len(X_train_glove)
# X_train_tfidf.data
# np.set_printoptions(suppress=True)
len(X_train_tfidf[0, :].toarray().flatten())
(X_train_tfidf[0, :].data)
# (X_train_tfidf[0, :].toarray().flatten()[0:1000])
# (X_train_glove)[0]
# (X_train_tfidf).min()



array([0.18990945, 0.20373031, 0.13510446, 0.13896817, 0.31317653,
       0.19711124, 0.36803654, 0.15185621, 0.31078854, 0.1900012 ,
       0.17566364, 0.21144662, 0.27247997, 0.22819204, 0.19109049,
       0.36803654, 0.28788653])

In [64]:

# Step 4: Model Training and Evaluation
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['l2'],
    'max_iter': [100, 200, 300]
}

def train_and_evaluate(X_train, y_train, X_test, y_test):
    lr = LogisticRegression(max_iter=1000, solver='lbfgs')
    
    gs = GridSearchCV(lr, param_grid, cv=5, verbose=3, n_jobs=-1)
    gs.fit(X_train, y_train)
    # Get the best parameters and best model
    best_params = gs.best_params_
    best_model = gs.best_estimator_

    # Print best parameters
    print(f"Best parameters: {best_params}")
    predictions = best_model.predict(X_test)
    return classification_report(y_test, predictions)

# Bag of Words
accuracy_bow = train_and_evaluate(X_train_bow, y_train, X_test_bow, y_test)
print(f'Bag of Words:\n {accuracy_bow}')

# TF-IDF
accuracy_tfidf = train_and_evaluate(X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f'TF-IDF:\n {accuracy_tfidf}')

# Word2Vec
accuracy_w2v = train_and_evaluate(X_train_w2v, y_train, X_test_w2v, y_test)
print(f'Word2Vec:\n {accuracy_w2v}')

# GloVe
accuracy_glove = train_and_evaluate(X_train_glove, y_train, X_test_glove, y_test)
print(f'GloVe:\n {accuracy_glove}')

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Bag of Words:
               precision    recall  f1-score   support

           0       0.83      0.81      0.82      3451
           1       0.71      0.72      0.71      1628
           2       0.84      0.86      0.85      3912

    accuracy                           0.81      8991
   macro avg       0.80      0.80      0.80      8991
weighted avg       0.81      0.81      0.81      8991

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
TF-IDF:
               precision    recall  f1-score   support

           0       0.81      0.82      0.82      3451
           1       0.73      0.65      0.69      1628
           2       0.83      0.86      0.85      3912

    accuracy                           0.81      8991
   macro avg       0.79      0.7

In [3]:
from sklearn.metrics import classification_report
# without fine tuning
def train_and_evaluate_no_ft(X_train, y_train, X_test, y_test):
    best_model = LogisticRegression(max_iter=1000, solver='lbfgs')
    best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    return classification_report(y_test, predictions)

In [7]:

# Bag of Words
accuracy_bow = train_and_evaluate_no_ft(X_train_bow, y_train, X_test_bow, y_test)
print(f'Bag of Words:\n {accuracy_bow}')

# TF-IDF
accuracy_tfidf = train_and_evaluate_no_ft(X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f'TF-IDF:\n {accuracy_tfidf}')

# Word2Vec
accuracy_w2v = train_and_evaluate_no_ft(X_train_w2v, y_train, X_test_w2v, y_test)
print(f'Word2Vec:\n {accuracy_w2v}')

# GloVe
accuracy_glove = train_and_evaluate_no_ft(X_train_glove, y_train, X_test_glove, y_test)
print(f'GloVe:\n {accuracy_glove}')


Bag of Words:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81      3451
           1       0.68      0.73      0.70      1628
           2       0.84      0.84      0.84      3912

    accuracy                           0.80      8991
   macro avg       0.78      0.79      0.79      8991
weighted avg       0.81      0.80      0.81      8991

TF-IDF:
               precision    recall  f1-score   support

           0       0.80      0.81      0.80      3451
           1       0.73      0.59      0.66      1628
           2       0.81      0.87      0.84      3912

    accuracy                           0.79      8991
   macro avg       0.78      0.75      0.76      8991
weighted avg       0.79      0.79      0.79      8991

Word2Vec:
               precision    recall  f1-score   support

           0       0.61      0.58      0.59      3451
           1       0.54      0.32      0.40      1628
           2       0.60      0.72      0

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
# 3.5 SFR-Embedding-Mistral
def get_ember_embeddings(texts, tokenizer, model, max_length=128):
    model.eval()
    with torch.no_grad():
        encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
        output = model(**encoded_input)
        embeddings = output.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

model_name = "Salesforce/SFR-Embedding-Mistral"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

X_train_ember = get_ember_embeddings(X_train.tolist(), tokenizer, model)
X_test_ember = get_ember_embeddings(X_test.tolist(), tokenizer, model)

accuracy_mistral = train_and_evaluate_no_ft(X_train_ember, y_train, X_test_ember, y_test)
print(f'SFR-Embedding-Mistral:\n {accuracy_mistral}')


In [52]:
from transformers import AutoTokenizer, AutoModel
import torch
# 3.5 SFR-Embedding-Mistral

# from transformers import AutoTokenizer, AutoModel
# import torch
# def get_ember_embeddings(texts, tokenizer, model, max_length=128):
#     model.eval()
#     with torch.no_grad():
#         encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
#         output = model(**encoded_input)
#         embeddings = output.last_hidden_state.mean(dim=1).cpu().numpy()
#     return embeddings

model_name = "llmrails/ember-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

X_train_ember = get_ember_embeddings(X_train, tokenizer, model)
# X_test_ember = get_ember_embeddings(X_test, tokenizer, model)
# print(X_test)
# print(X_train_ember)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-08-26 20:26:29,763 - INFO - Processed batch 1/72 in 2.03 seconds
2024-08-26 20:26:30,885 - INFO - Processed batch 2/72 in 1.12 seconds
2024-08-26 20:26:31,849 - INFO - Processed batch 3/72 in 0.96 seconds
2024-08-26 20:26:32,837 - INFO - Processed batch 4/72 in 0.99 seconds
2024-08-26 20:26:33,911 - INFO - Processed batch 5/72 in 1.07 seconds
2024-08-26 20:26:35,005 - INFO - Processed batch 6/72 in 1.09 seconds
2024-08-26 20:26:35,869 - INFO - Processed batch 7/72 in 0.86 seconds
2024-08-26 20:26:36,792 - INFO - Processed batch 8/72 in 0.92 seconds
2024-08-26 20:26:37,739 - INFO - Processed batch 9/72 in 0.95 seconds
2024-08-26 20:26:38,944 - INFO - Processed batch 10/72 in 1.21 seconds
2024-08-26 20:26:39,976 - INFO - Processed batch 11/72 in 1.03 seconds
2024-08-26 20:26:40,918 - INFO - Processed batch 12/72 in 0.94 seconds
2024-08-26 20:26:41,987 - INFO - Processed batch 13/72 in 1.07 seconds
2024-08-26 20:26:42,

In [63]:
X_train_ember[0][0:900]
# X_train_ember.max()

array([-0.15651827,  0.04394898,  0.20617932,  0.23923256,  0.10326955,
       -0.29402095,  0.39480647,  1.0215219 , -0.15108112,  0.8181932 ,
        0.12444254, -0.5822182 ,  0.14245874, -0.03479619, -0.82616717,
        0.12609835,  0.22270216, -0.45853424, -0.11734218,  0.33428392,
        0.31397095, -0.28627127, -0.94127864, -0.0318335 , -0.7235342 ,
        0.1517778 ,  0.44797742, -0.26657775,  1.0514002 ,  0.5815896 ,
       -0.23330483,  0.2007327 ,  0.64061654, -1.1065009 ,  0.55198014,
       -0.0770203 ,  1.0238037 , -0.56184906, -0.4664912 , -0.5129471 ,
       -0.1803089 , -0.16969097,  0.93656695,  0.07370358, -0.21573305,
       -0.21362841, -0.11025713, -0.22677784,  0.6886989 , -1.3100997 ,
       -0.20212597,  0.45244542,  0.25122148, -0.20160519,  0.36776152,
       -0.49760398, -0.5243846 , -0.06662889,  0.02018975,  0.66073227,
        1.070978  , -0.21025403,  0.85591733, -1.2036958 ,  0.14897268,
        0.20605506, -0.04492993, -0.12535998, -0.12822339, -0.19

In [14]:

# SFR-Embedding-Mistral
accuracy_mistral = train_and_evaluate_no_ft(X_train_ember, y_train, X_test_ember, y_test)
print(f'SFR-Embedding-Mistral:\n {accuracy_mistral}')


SFR-Embedding-Mistral:
               precision    recall  f1-score   support

           0       0.70      0.72      0.71      3451
           1       0.60      0.49      0.54      1628
           2       0.71      0.74      0.72      3912

    accuracy                           0.69      8991
   macro avg       0.67      0.65      0.66      8991
weighted avg       0.69      0.69      0.69      8991



In [5]:
import gc

gc.collect()

torch.cuda.empty_cache()
X_train_ember = get_ember_embeddings(['random text', 'very nice'], tokenizer, model)
print(X_train_ember)


  attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-08-14 16:35:59,220 - INFO - Processed batch 1/1 in 0.84 seconds


[[-0.30708066 -0.38886583  0.5703759  ...  0.58131295  0.02450421
  -0.905235  ]
 [ 0.07205359  0.41388914  0.47324252 ... -0.55121666 -0.04188719
   0.7166905 ]]


In [8]:
X_train_ember = get_ember_embeddings(X_train, tokenizer, model)
# print(X_train_mistral)


2024-08-14 16:37:21,132 - INFO - Processed batch 1/72 in 1.42 seconds
2024-08-14 16:37:22,441 - INFO - Processed batch 2/72 in 1.31 seconds
2024-08-14 16:37:23,547 - INFO - Processed batch 3/72 in 1.11 seconds
2024-08-14 16:37:24,696 - INFO - Processed batch 4/72 in 1.15 seconds
2024-08-14 16:37:25,950 - INFO - Processed batch 5/72 in 1.25 seconds
2024-08-14 16:37:27,214 - INFO - Processed batch 6/72 in 1.26 seconds
2024-08-14 16:37:28,211 - INFO - Processed batch 7/72 in 1.00 seconds
2024-08-14 16:37:29,273 - INFO - Processed batch 8/72 in 1.06 seconds
2024-08-14 16:37:30,364 - INFO - Processed batch 9/72 in 1.09 seconds
2024-08-14 16:37:31,773 - INFO - Processed batch 10/72 in 1.41 seconds
2024-08-14 16:37:32,961 - INFO - Processed batch 11/72 in 1.19 seconds
2024-08-14 16:37:34,060 - INFO - Processed batch 12/72 in 1.10 seconds
2024-08-14 16:37:35,305 - INFO - Processed batch 13/72 in 1.24 seconds
2024-08-14 16:37:36,438 - INFO - Processed batch 14/72 in 1.13 seconds
2024-08-14 16:3

In [51]:
from torch.cuda.amp import autocast
import time
import logging
import numpy as np
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def get_ember_embeddings(texts, tokenizer, model, max_length=128, batch_size=500):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    embeddings = []
    total_batches = len(texts) // batch_size + (1 if len(texts) % batch_size != 0 else 0)
    
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            start_time = time.time() 
            
            batch_texts = texts[i:i + batch_size]
            encoded_input = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
            encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
            with autocast():
                output = model(**encoded_input)
            batch_embeddings = output.last_hidden_state.mean(dim=1).cpu().numpy().astype(np.float32)
            embeddings.append(batch_embeddings)
            
            end_time = time.time() 
            elapsed_time = end_time - start_time
            
            logging.info(f'Processed batch {i // batch_size + 1}/{total_batches} in {elapsed_time:.2f} seconds')
    
    return np.vstack(embeddings)


In [6]:
X_train_ember = get_ember_embeddings(['random text', 'very nice'], tokenizer, model)
# X_train_mistral = get_mistral_embeddings(X_train[:15], tokenizer, model)
print(X_train_ember)


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 

In [8]:
import time
start_time = time.time()
len(X_train[:100])
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.0 seconds ---
