In [1]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api


In [4]:
# Load dataset
# The CSV is stored on Google Drive, so the drive has to be mounted before the
# file can be read. Mounting here keeps this cell runnable on a fresh runtime
# (Restart & Run All) regardless of where any other mounting cell is placed.
# NOTE(review): drive.mount is expected to just report "already mounted" when
# the mount point is live - confirm against the Colab docs.
from google.colab import drive

drive.mount('/content/drive')

DATASET_PATH = "/content/drive/MyDrive/MINI PROJECT/DATASET/augmented_dataset1.csv"
df = pd.read_csv(DATASET_PATH)

In [5]:
# Resources shared by the text-preprocessing step:
#   punctuations - the ASCII punctuation characters from the standard library
#   nlp          - the spaCy "en_core_web_sm" pipeline
#   stop_words   - the stop-word collection attached to that pipeline's defaults
punctuations = string.punctuation
nlp = spacy.load(name="en_core_web_sm")
stop_words = nlp.Defaults.stop_words

In [6]:
# Fetch the pre-trained Word2Vec embeddings through gensim's downloader API
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'
wv = api.load(WORD2VEC_MODEL_NAME)



In [3]:
# Mount Google Drive inside the Colab runtime; the dataset CSV is read from a
# path under this mount point.
# NOTE(review): this cell carries execution count 3 but sits below the cells
# numbered 4-6; on a fresh runtime the drive has to be mounted before anything
# reads from it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [7]:
# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    """Turn raw text into lowercase lemmas without stop words or punctuation.

    Args:
        sentence: Text to process. Anything that is not a string (for example
            ``None`` or the ``NaN`` pandas uses for a missing cell) and any
            blank string is treated as an empty document.

    Returns:
        list[str]: Lemmatized, lowercased, whitespace-stripped tokens with
        stop words, empty strings and punctuation-only tokens removed.
    """
    # Missing or blank text has no tokens; return before invoking the spaCy
    # pipeline so a NaN cell cannot crash the whole `.apply`.
    if not isinstance(sentence, str) or not sentence.strip():
        return []

    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    lemmas = (token.lemma_.lower().strip() for token in doc)

    # Removing stop words and punctuations.
    # `punctuations` is one long string, so `lemma in punctuations` would be a
    # substring test: it only matches runs that happen to appear in that exact
    # order and lets tokens such as "..." or "--" through. Checking character
    # by character drops every punctuation-only token, and the explicit
    # emptiness check replaces the accidental '"" in some_string' match.
    return [
        lemma
        for lemma in lemmas
        if lemma
        and lemma not in stop_words
        and not all(char in punctuations for char in lemma)
    ]

In [8]:
# Run the tokenizer over every row of the 'Text data' column and keep the
# resulting token lists in a new 'data' column
raw_text = df['Text data']
df['data'] = raw_text.apply(spacy_tokenizer)

In [9]:
# Generate word vectors for each document
def _mean_word_vector(tokens):
    """Average the `wv` vectors of the tokens that are present in `wv`.

    When none of the tokens is found, the result is an all-zero vector of
    length ``wv.vector_size`` so every document still gets a vector.
    """
    known_vectors = [wv[token] for token in tokens if token in wv]
    if not known_vectors:
        known_vectors = [np.zeros(wv.vector_size)]
    return np.mean(known_vectors, axis=0)


df['vec'] = df['data'].apply(_mean_word_vector)


In [10]:
# Stack the per-document vectors into one 2-D feature matrix (one row per
# document) and take the class labels from the 'Label' column
X = np.vstack(df['vec'].tolist())
y = df['Label']

# 80% of the rows go to training, the remainder to testing; fixed seed for a
# repeatable split.
# NOTE(review): the dataset file name suggests augmented data - if augmented
# variants of one original sample can land on both sides of a random split,
# the test scores will be optimistic. Confirm how the CSV was built.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

# Single-step pipeline wrapping a 100-tree random forest with a fixed seed
model_pipeline_rf = Pipeline(steps=[
    ('rf', RandomForestClassifier(n_estimators=100, random_state=1)),
])


In [11]:
# Fit the random-forest pipeline on the training split
model_pipeline_rf.fit(X=X_train, y=y_train)


In [12]:
# Use the fitted pipeline to label the held-out test split
y_pred_rf = model_pipeline_rf.predict(X=X_test)


In [13]:

# Score the test-set predictions: overall accuracy first, then precision,
# recall and F1 averaged across classes with average='weighted', and finally
# the per-class text report
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
precision_rf = precision_score(y_true=y_test, y_pred=y_pred_rf, average='weighted')
recall_rf = recall_score(y_true=y_test, y_pred=y_pred_rf, average='weighted')
f1_score_rf = f1_score(y_true=y_test, y_pred=y_pred_rf, average='weighted')
classification_report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)


In [14]:
# Print the scalar metrics one per line, followed by the titled per-class report
scalar_metrics = (
    ("Accuracy:", accuracy_rf),
    ("Precision:", precision_rf),
    ("Recall:", recall_rf),
    ("F1 Score:", f1_score_rf),
)
for metric_label, metric_value in scalar_metrics:
    print(metric_label, metric_value)

print("Evaluation Metrics for Random Forest Model")
print("------------------------------------------------")
print(classification_report_rf)

Accuracy: 0.9507198228128461
Precision: 0.9516779397246176
Recall: 0.9507198228128461
F1 Score: 0.9509009727737082
Evaluation Metrics for Random Forest Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      1194
           1       0.95      0.93      0.94      1236
           2       0.99      0.97      0.98      1182

    accuracy                           0.95      3612
   macro avg       0.95      0.95      0.95      3612
weighted avg       0.95      0.95      0.95      3612

