In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api


In [4]:
# Load the `en_core_web_sm` spaCy pipeline once at module level; `nlp` is the
# object spacy_tokenizer calls on every sentence.
nlp = spacy.load("en_core_web_sm")
# Stop-word collection taken from the loaded pipeline's language defaults.
stop_words = nlp.Defaults.stop_words
# NOTE: string.punctuation is a single str of ASCII punctuation characters,
# so `x in punctuations` is a substring test rather than a set-membership test.
punctuations = string.punctuation

In [5]:
# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with spaCy and return its cleaned lemmas.

    Each token's lemma is lower-cased and stripped of surrounding whitespace.
    A lemma is then dropped when it is empty, is a stop word, or consists
    solely of punctuation characters.

    Relies on the module-level ``nlp`` (spaCy pipeline), ``stop_words`` and
    ``punctuations`` objects.

    Args:
        sentence: Text to process. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as empty text.

    Returns:
        list[str]: The surviving lemmas, in document order.
    """
    # Missing values would otherwise be handed to nlp(), which only accepts
    # text; `x != x` is true only for NaN, so no extra import is needed.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return []

    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    lemmas = (token.lemma_.lower().strip() for token in doc)

    # Removing stop words and punctuations.
    # `punctuations` is a str, so `lemma in punctuations` would be a substring
    # test: "..." or "?!" would slip through and "" would match only by
    # accident. Checking character by character drops every all-punctuation
    # lemma, and the explicit truthiness test drops whitespace-only tokens.
    return [
        lemma
        for lemma in lemmas
        if lemma
        and lemma not in stop_words
        and not all(ch in punctuations for ch in lemma)
    ]

In [6]:
# Load the dataset CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv("/content/drive/MyDrive/MINI PROJECT/DATASET/augmented_dataset1.csv")  # Replace with your dataset path
# Raw text and label columns.
# NOTE(review): X and y are both reassigned in a later cell (X becomes the
# stacked document-vector matrix), so this raw-text X is not what gets split
# and fed to the model.
X = df['Text data']  # Replace with your text column name
y = df['Label']      # Replace with your label column name


In [7]:
# Apply tokenization to each row in 'Text data' column.
# The resulting token lists are stored in a new 'data' column on the same
# DataFrame (df is modified in place); spacy_tokenizer is called once per row.
df['data'] = df['Text data'].apply(spacy_tokenizer)

In [11]:
# Load the pre-trained 'word2vec-google-news-300' embeddings through gensim's
# downloader API; `wv` is used below for per-token vector lookup
# (`token in wv`, `wv[token]`, `wv.vector_size`).
wv = api.load('word2vec-google-news-300')




In [14]:
# Generate one vector per document by averaging its tokens' embeddings
def _mean_embedding(tokens):
    """Return the element-wise mean of the embeddings of ``tokens`` found in ``wv``.

    Tokens missing from ``wv`` are skipped. When none of the tokens is
    present (including an empty token list) a zero vector of length
    ``wv.vector_size`` is averaged instead, so every document still gets a
    vector of that length.
    """
    known_vectors = [wv[tok] for tok in tokens if tok in wv]
    if not known_vectors:
        known_vectors = [np.zeros(wv.vector_size)]
    return np.mean(known_vectors, axis=0)


df['vec'] = df['data'].apply(_mean_embedding)


In [15]:
# Stack the per-document vectors in df['vec'] row-wise into a single 2-D
# feature matrix (one row per document). This replaces any earlier value of X.
X = np.vstack(df['vec'])
# Labels are taken unchanged from the 'Label' column.
y = df['Label']


In [16]:
# Train-test split: 80% of the rows go to training, the rest to testing.
# random_state=1 fixes the shuffle so the split is repeatable.
# No `stratify` argument is passed, so class proportions are not forced to
# match between the two parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)


In [17]:
# Build (but do not yet fit) the SVM model, wrapped in a Pipeline.
# The pipeline has a single step named 'svm'; no scaler or other transformer
# precedes the classifier.
model_pipeline_svm = Pipeline([
    ('svm', SVC(kernel='linear', random_state=1))  # Using a linear kernel
])


In [18]:
# Train the model: fit the pipeline (and therefore the SVC inside it) on the
# training portion of the split.
model_pipeline_svm.fit(X_train, y_train)

In [19]:
# Predict on the test set; the predicted labels are kept in y_pred_svm for
# the evaluation cell below.
y_pred_svm = model_pipeline_svm.predict(X_test)


In [20]:

# Score the SVM predictions against the held-out labels.
# Precision, recall and F1 all use the same 'weighted' averaging mode, so the
# shared keyword argument is defined once.
_weighted_avg = {'average': 'weighted'}
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, **_weighted_avg)
recall_svm = recall_score(y_test, y_pred_svm, **_weighted_avg)
f1_score_svm = f1_score(y_test, y_pred_svm, **_weighted_avg)
classification_report_svm = classification_report(y_test, y_pred_svm)

# Print the four headline metrics, one "<label> <value>" line each ...
_headline_metrics = (
    ("Accuracy:", accuracy_svm),
    ("Precision:", precision_svm),
    ("Recall:", recall_svm),
    ("F1 Score:", f1_score_svm),
)
for _label, _value in _headline_metrics:
    print(_label, _value)

# ... followed by a banner and the per-class classification report.
for _line in (
    "Evaluation Metrics for SVM Model",
    "------------------------------------------------",
    classification_report_svm,
):
    print(_line)

Accuracy: 0.7757475083056479
Precision: 0.7772146253772279
Recall: 0.7757475083056479
F1 Score: 0.7740640092939256
Evaluation Metrics for SVM Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.73      0.77      1194
           1       0.75      0.71      0.73      1236
           2       0.76      0.89      0.82      1182

    accuracy                           0.78      3612
   macro avg       0.78      0.78      0.77      3612
weighted avg       0.78      0.78      0.77      3612

