In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api



In [3]:
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/MINI PROJECT/DATASET/augmented_dataset1.csv")


In [4]:
# Load SpaCy model and define stop words and punctuations
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation


In [6]:
# Load pre-trained GloVe embeddings
wv = api.load('glove-twitter-100')



In [7]:
# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [word.lemma_.lower().strip() for word in doc]

    # Removing stop words and punctuations
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]

    # Return preprocessed list of tokens
    return mytokens

In [8]:
# Apply tokenization to each row in 'Text data' column
df['data'] = df['Text data'].apply(spacy_tokenizer)


In [9]:
# Generate word vectors for each document
df['vec'] = df['data'].apply(lambda x: np.mean([wv[token] for token in x if token in wv] or [np.zeros(wv.vector_size)], axis=0))


In [10]:

X = np.vstack(df['vec'])  # Convert list of arrays to a matrix
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)


In [11]:
# Initialize and train Naive Bayes model
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

In [12]:
# Predict on the test set
y_pred_nb = naive_bayes.predict(X_test)

In [13]:
# Evaluate the model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_score_nb = f1_score(y_test, y_pred_nb, average='weighted')
classification_report_nb = classification_report(y_test, y_pred_nb)


In [14]:
# Print evaluation metrics
print("Accuracy:", accuracy_nb)
print("Precision:", precision_nb)
print("Recall:", recall_nb)
print("F1 Score:", f1_score_nb)
print("Evaluation Metrics for Naive Bayes Model")
print("------------------------------------------------")
print(classification_report_nb)

Accuracy: 0.5121816168327796
Precision: 0.5259164223537308
Recall: 0.5121816168327796
F1 Score: 0.4949669790200455
Evaluation Metrics for Naive Bayes Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.58      0.44      0.50      1194
           1       0.52      0.32      0.40      1236
           2       0.48      0.79      0.60      1182

    accuracy                           0.51      3612
   macro avg       0.53      0.52      0.50      3612
weighted avg       0.53      0.51      0.49      3612



In [19]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/MINI PROJECT/DATASET/augmented_dataset1.csv")

# Load SpaCy model and define stop words and punctuations
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation

# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    doc = nlp(sentence)
    mytokens = [word.lemma_.lower().strip() for word in doc]
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]
    return mytokens

# Apply tokenization to each row in 'Text data' column
df['tokens'] = df['Text data'].apply(spacy_tokenizer)

# Generate document vectors for each document
def document_vector(tokens, embeddings, dim):
    token_vectors = [embeddings[token] for token in tokens if token in embeddings]
    if not token_vectors:
        return np.zeros(dim)
    return np.mean(token_vectors, axis=0)

df['vec'] = df['tokens'].apply(lambda x: document_vector(x, wv, wv.vector_size))

# Prepare features (X) and labels (y)
X = np.vstack(df['vec'])
y = df['Label']

# Normalize features for Naive Bayes
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, train_size=0.8, random_state=1)

# Initialize and train Naive Bayes model
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = naive_bayes.predict(X_test)

# Evaluate the model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_score_nb = f1_score(y_test, y_pred_nb, average='weighted')
classification_report_nb = classification_report(y_test, y_pred_nb)

# Print evaluation metrics
print("Evaluation Metrics for Naive Bayes Model")
print("------------------------------------------------")
print(f"Accuracy: {accuracy_nb:.4f}")
print(f"Precision: {precision_nb:.4f}")
print(f"Recall: {recall_nb:.4f}")
print(f"F1-score: {f1_score_nb:.4f}")
print("Classification Report:")
print(classification_report_nb)


Evaluation Metrics for Naive Bayes Model
------------------------------------------------
Accuracy: 0.5648
Precision: 0.5774
Recall: 0.5648
F1-score: 0.5610
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.49      0.56      1194
           1       0.57      0.49      0.52      1236
           2       0.52      0.72      0.60      1182

    accuracy                           0.56      3612
   macro avg       0.58      0.57      0.56      3612
weighted avg       0.58      0.56      0.56      3612

