In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download required NLTK resources
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

# Load the dataset
data = pd.read_csv(r"C:\Users\Acer\Labs\DSMM_Term_02\BhavikG_App1034\Project\dev-br\dev\cleaned_data.csv")

# Fill any missing values in 'cleaned_text' with an empty string
data['cleaned_text'] = data['cleaned_text'].fillna('')

# Initialize the lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function for tokenization and lemmatization
def tokenize_and_lemmatize(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    # Lemmatize each token
    lemmatized_text = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return ' '.join(lemmatized_text)
    

# Apply the function to the 'cleaned_text' column
data['lemmatized_text'] = data['cleaned_text'].apply(tokenize_and_lemmatize)

# Save the processed data to a new CSV file to avoid reprocessing next time
data.to_csv(r"C:\Users\Acer\Labs\DSMM_Term_02\BhavikG_App1034\Project\dev-br\dev\lemmatized_data.csv", index=False)

# Display the results
print(data[['cleaned_text', 'lemmatized_text']].head())



In [7]:
# Check if the columns "cleaned_text" and "lemmatized_text" are the same
data['is_same'] = data['cleaned_text'] == data['lemmatized_text']

# Display rows where the columns are not the same
different_rows = data[data['is_same'] == False]

# Display results
print(f"Number of rows where the columns are the same: {data['is_same'].sum()}")
print(f"Number of rows where the columns are different: {len(different_rows)}")

# Optionally display the rows where they are different
print(different_rows[['cleaned_text', 'lemmatized_text']])

Number of rows where the columns are the same: 964328
Number of rows where the columns are different: 635672
                                              cleaned_text  \
2        dived many times ball managed save rest go bounds   
3                         whole body feels itchy like fire   
7        hey long time see yes rains bit bit lol im fin...   
11                                          repierced ears   
13                    counts idk either never talk anymore   
...                                                    ...   
1599986                                      much ads blog   
1599988  ha good job thats right gotta throw bigrun tag...   
1599991  mmmm sounds absolutely perfect schedule full w...   
1599996            thewdbcom cool hear old walt interviews   
1599997                    ready mojo makeover ask details   

                                           lemmatized_text  
2          dived many time ball managed save rest go bound  
3                       

## TF-IDF

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# Load the lemmatized dataset
data = pd.read_csv(r"C:\Users\Acer\Labs\DSMM_Term_02\BhavikG_App1034\Project\dev-br\dev\lemmatized_data.csv")

# Ensure there are no NaN values in the 'lemmatized_text' column
data['lemmatized_text'] = data['lemmatized_text'].fillna('')

# Prepare features and target
X = data['lemmatized_text']
y = data['target']  # 'target' column has sentiment labels (1 for positive, 0 for negative)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert text data to numerical vectors using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train a Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, log_reg.predict_proba(X_test_tfidf)[:, 1]))

# Hyperparameter Tuning
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model
y_pred_best = best_model.predict(X_test_tfidf)
print("Best Model Accuracy:", accuracy_score(y_test, y_pred_best))


Accuracy: 0.77440625
Confusion Matrix:
 [[120342  39658]
 [ 32532 127468]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77    160000
           1       0.76      0.80      0.78    160000

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000

AUC Score: 0.8542522505664063
Best Parameters: {'C': 10}
Best Model Accuracy: 0.774175


In [13]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl.metadata (8.2 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     --------------------------- ------------ 41.0/60.6 kB 2.0 MB/s eta 0:00:01
     ---------------------------------------- 60.6/60.6 kB 1.6 MB/s eta 0:00:00
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.2/24.0 MB 6.4 MB/s eta 0:00:04
    --------------------------------------- 0.5/24.0 MB 5.8 MB/s eta 0:00:05
   - -------------------------------------- 0.8/24.0 MB 6.1 MB/s eta 0:00:04
   - -------------------------------------- 1.1/24.0 MB 6.1 MB/s eta 0:00:04
   -- ----------

  You can safely remove it manually.
  You can safely remove it manually.

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split

# Example corpus
corpus = data['lemmatized_text'].tolist()  # Assuming `lemmatized_text` is preprocessed

# Target labels
y = data['target']  # Sentiment labels

# Train Word2Vec embeddings
w2v_model = Word2Vec(sentences=[doc.split() for doc in corpus], vector_size=100, window=5, min_count=1, workers=4)

# Generate TF-IDF scores
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(corpus)
tfidf_vocab = tfidf.vocabulary_  # Maps words to their index in TF-IDF

# Create a function to compute TF-IDF weighted Word2Vec vectors for each document
def tfidf_weighted_avg(doc):
    words = doc.split()
    word_vecs = []
    for word in words:
        if word in w2v_model.wv.key_to_index:  # Check if word exists in Word2Vec model
            tfidf_weight = tfidf.idf_[tfidf_vocab.get(word, 0)]  # Get TF-IDF weight
            word_vec = w2v_model.wv[word] * tfidf_weight  # Weight Word2Vec vector by TF-IDF
            word_vecs.append(word_vec)
    return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(w2v_model.vector_size)

# Apply the function to generate TF-IDF weighted Word2Vec vectors for each document
X = np.array([tfidf_weighted_avg(doc) for doc in corpus])

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train a Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1]))

# Hyperparameter Tuning
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model
y_pred_best = best_model.predict(X_test_tfidf)
print("Best Model Accuracy:", accuracy_score(y_test, y_pred_best))

Accuracy: 0.71345625
Confusion Matrix:
 [[113095  46905]
 [ 44789 115211]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.71      0.71    160000
           1       0.71      0.72      0.72    160000

    accuracy                           0.71    320000
   macro avg       0.71      0.71      0.71    320000
weighted avg       0.71      0.71      0.71    320000

AUC Score: 0.786049748359375
Best Parameters: {'C': 10}
Best Model Accuracy: 0.774175
