In [1]:
# Mount Google Drive inside the Colab runtime so later cells can read files by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api

In [3]:
# Read the dataset CSV from the mounted Drive folder into a DataFrame.
DATASET_CSV = "/content/drive/MyDrive/MINI PROJECT/DATASET/augmented_dataset1.csv"
df = pd.read_csv(DATASET_CSV)

In [4]:
# Shared NLP resources used by the preprocessing cells below:
#   punctuations - the ASCII punctuation characters, held as one string
#   nlp          - spaCy's "en_core_web_sm" pipeline
#   stop_words   - the default stop-word collection of that pipeline
punctuations = string.punctuation
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words


In [5]:
# Pre-trained Word2Vec embeddings, fetched by name through gensim's downloader API.
EMBEDDING_NAME = 'word2vec-google-news-300'
wv = api.load(EMBEDDING_NAME)



In [6]:
# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    """Tokenize, lemmatize and filter one piece of text.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Each
    token's lemma is lower-cased and stripped, and the lemma is dropped when
    it is empty, is a stop word, or consists solely of punctuation characters.

    Parameters
    ----------
    sentence : str
        Raw text to preprocess. Any non-string value (e.g. ``None`` or the
        float NaN that pandas uses for missing cells) and any blank string
        is treated as an empty document.

    Returns
    -------
    list of str
        The surviving lemmas, in document order.
    """
    # Missing cells arrive as None/NaN; nlp() rejects non-string input, so
    # treat them (and blank strings) as documents with no tokens.
    if not isinstance(sentence, str) or not sentence.strip():
        return []

    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # `punctuations` is a plain string, so `lemma in punctuations` would be a
    # substring test: multi-character tokens such as "..." or "--" would slip
    # through. Compare character by character against a set instead.
    punct_chars = set(punctuations)

    mytokens = []
    for word in doc:
        # Lemmatizing each token and converting each token into lowercase
        lemma = word.lemma_.lower().strip()

        # Whitespace-only tokens collapse to "" after strip(); drop them explicitly.
        if not lemma:
            continue

        # Removing stop words
        if lemma in stop_words:
            continue

        # Removing tokens made up entirely of punctuation characters
        if all(ch in punct_chars for ch in lemma):
            continue

        mytokens.append(lemma)

    # Return preprocessed list of tokens
    return mytokens

In [7]:
# Run the tokenizer over every entry of the 'Text data' column and keep the
# resulting token lists in a new 'data' column.
raw_text = df['Text data']
df['data'] = raw_text.apply(spacy_tokenizer)

In [8]:
# Represent each document by the average of its tokens' Word2Vec vectors.
def _document_vector(tokens):
    """Return the element-wise mean of the embeddings of the tokens found in `wv`.

    When none of the tokens is in `wv`, the mean is taken over a single
    all-zero vector of length `wv.vector_size` instead.
    """
    known_vectors = [wv[token] for token in tokens if token in wv]
    if not known_vectors:
        known_vectors = [np.zeros(wv.vector_size)]
    return np.mean(known_vectors, axis=0)


df['vec'] = df['data'].apply(_document_vector)


In [9]:
# Target: the 'Label' column of the DataFrame.
y = df['Label']

# Features: stack the per-document vectors row-wise into one 2-D matrix.
X = np.vstack(df['vec'])

In [10]:
# Split features and labels into train/test parts: 80% for training, with a
# fixed random_state so the partition is the same on every run.
# NOTE(review): the CSV is named "augmented_dataset1" - if it contains augmented
# variants of the same source text, a random row-level split may place near-duplicates
# on both sides and inflate the test scores. Confirm against the dataset.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

In [11]:
# One-step pipeline around a logistic-regression classifier; max_iter is set
# to 1000 explicitly.
lr_classifier = LogisticRegression(max_iter=1000)
model_pipeline_lr = Pipeline(steps=[('lr', lr_classifier)])

# Fit on the training split, then label the held-out test split.
y_pred = model_pipeline_lr.fit(X_train, y_train).predict(X_test)

In [12]:
# Score the test-set predictions. Precision, recall and F1 use the
# 'weighted' averaging mode.
averaging = {'average': 'weighted'}
accuracy_lr = accuracy_score(y_test, y_pred)
precision_lr = precision_score(y_test, y_pred, **averaging)
recall_lr = recall_score(y_test, y_pred, **averaging)
f1_score_lr = f1_score(y_test, y_pred, **averaging)
classification_report_lr = classification_report(y_test, y_pred)

# Report the headline numbers first, then the per-class breakdown.
headline_metrics = (
    ("Accuracy:", accuracy_lr),
    ("Precision:", precision_lr),
    ("Recall:", recall_lr),
    ("F1 Score:", f1_score_lr),
)
for metric_label, metric_value in headline_metrics:
    print(metric_label, metric_value)

print("Evaluation Metrics for Logistic Regression Model")
print("------------------------------------------------")
print(classification_report_lr)

Accuracy: 0.7724252491694352
Precision: 0.772735981108846
Recall: 0.7724252491694352
F1 Score: 0.771191234492083
Evaluation Metrics for Logistic Regression Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.73      0.76      1194
           1       0.74      0.72      0.73      1236
           2       0.78      0.87      0.82      1182

    accuracy                           0.77      3612
   macro avg       0.77      0.77      0.77      3612
weighted avg       0.77      0.77      0.77      3612

