In [5]:
# Mount Google Drive at /content/drive (Colab runtime only); the dataset CSV
# loaded later in this notebook is read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api

In [2]:
# Shared text-processing resources: the small English spaCy pipeline, the
# stop-word collection shipped with that pipeline's language defaults, and the
# ASCII punctuation characters from the standard library.
nlp = spacy.load("en_core_web_sm")
stop_words, punctuations = nlp.Defaults.stop_words, string.punctuation

In [11]:
# Load pre-trained Word2Vec embeddings through the gensim downloader.
# The resulting `wv` object is used below for membership tests (`token in wv`),
# vector lookup (`wv[token]`) and its `vector_size` attribute.
wv = api.load('word2vec-google-news-300')



In [3]:
# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    """Lemmatize, lowercase and filter a piece of text into a list of tokens.

    Parameters
    ----------
    sentence : str
        Raw text to process. ``None`` and NaN (for example a missing cell in a
        pandas column) produce an empty list instead of raising; any other
        non-string value is converted with ``str()`` before processing.

    Returns
    -------
    list of str
        Lowercased, whitespace-stripped lemmas, excluding stop words, empty
        strings and tokens made up solely of punctuation characters.
    """
    if not isinstance(sentence, str):
        # NaN is the only float that is not equal to itself.
        if sentence is None or (isinstance(sentence, float) and sentence != sentence):
            return []
        sentence = str(sentence)

    # Run the spaCy pipeline to obtain tokens with lemma annotations.
    doc = nlp(sentence)

    # `punctuations` is a plain string, so `word in punctuations` would be a
    # substring test: it drops accidental runs such as "-." but keeps "..." or
    # "--". Comparing character by character against a set removes every
    # punctuation-only token consistently.
    punct_chars = set(punctuations)

    mytokens = []
    for word in doc:
        lemma = word.lemma_.lower().strip()
        # Whitespace tokens become empty after strip(); skip them explicitly.
        if not lemma or lemma in stop_words:
            continue
        if all(ch in punct_chars for ch in lemma):
            continue
        mytokens.append(lemma)

    # Return preprocessed list of tokens
    return mytokens

In [18]:
# Load your dataset (replace DATASET_PATH with your own dataset path)
DATASET_PATH = "/content/drive/MyDrive/MINI PROJECT/DATASET/augmented_dataset1.csv"
df = pd.read_csv(DATASET_PATH)


In [20]:
# Tokenize every entry of the 'Text data' column and store the resulting
# token lists in a new 'data' column.
df['data'] = df['Text data'].map(spacy_tokenizer)

In [21]:
# Generate word vectors for each document
def _mean_word_vector(tokens):
    """Return the element-wise mean of the Word2Vec vectors of ``tokens``.

    Tokens missing from ``wv`` are ignored. When none of the tokens is present,
    the mean is taken over a single all-zero vector of length
    ``wv.vector_size``.
    """
    known_vectors = [wv[token] for token in tokens if token in wv]
    if not known_vectors:
        known_vectors = [np.zeros(wv.vector_size)]
    return np.mean(known_vectors, axis=0)


df['vec'] = df['data'].apply(_mean_word_vector)


In [22]:
# Labels come straight from the 'Label' column; the per-document vectors are
# stacked row-wise into a single 2-D feature matrix.
y = df['Label']
X = np.vstack(df['vec'].tolist())

In [23]:
# Train-test split: 80% of the rows go to training, with a fixed random seed
# so the split is repeatable.
split_options = {"train_size": 0.8, "random_state": 1}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [24]:
# Initialize and train a Gaussian Naive Bayes model on the training split
# (features are the averaged word vectors, targets are the 'Label' values).
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

In [25]:
# Predict labels for the held-out test split with the fitted model
y_pred_nb = naive_bayes.predict(X_test)


In [26]:
# Evaluate the model: plain accuracy, plus precision / recall / F1 computed
# with average='weighted', and the per-class classification report.
weighted_avg = {"average": "weighted"}
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb, **weighted_avg)
recall_nb = recall_score(y_test, y_pred_nb, **weighted_avg)
f1_score_nb = f1_score(y_test, y_pred_nb, **weighted_avg)
classification_report_nb = classification_report(y_test, y_pred_nb)

# Print evaluation metrics
for metric_label, metric_value in (
    ("Accuracy:", accuracy_nb),
    ("Precision:", precision_nb),
    ("Recall:", recall_nb),
    ("F1 Score:", f1_score_nb),
):
    print(metric_label, metric_value)
print("Evaluation Metrics for Naive Bayes Model")
print("------------------------------------------------")
print(classification_report_nb)

Accuracy: 0.47646733111849393
Precision: 0.48937859073229817
Recall: 0.47646733111849393
F1 Score: 0.4523513475667516
Evaluation Metrics for Naive Bayes Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.56      0.39      0.46      1194
           1       0.46      0.26      0.33      1236
           2       0.45      0.79      0.57      1182

    accuracy                           0.48      3612
   macro avg       0.49      0.48      0.45      3612
weighted avg       0.49      0.48      0.45      3612

