In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from gensim.models import Word2Vec

In [None]:
# Read the augmented dataset and keep only the two columns used downstream:
# the raw text ('Text data') and its class label ('Label').
df = pd.read_csv('/content/augmented_dataset1.csv')[['Text data', 'Label']]
df.head()

Unnamed: 0,Text data,Label
0,waiting for my mind to have breakdown once the...,1
1,for my new anymore little bit not of just and ...,1
2,new year feeling there else depressed last eve...,1
3,for my to have the new feeling know about anyo...,1
4,to the new year in start and into great myself...,1


In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text data'],
    df['Label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# TF-IDF features: the vectorizer is fitted on the training text only, and the
# test text is projected with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),  # evaluated first: fits, then transforms
    tfidf_vectorizer.transform(X_test),       # evaluated second: reuses the fitted state
)

In [None]:
# Raw term-count features: fitted on the training text only, then applied
# unchanged to the test text.
count_vectorizer = CountVectorizer()
X_train_count, X_test_count = (
    count_vectorizer.fit_transform(X_train),  # evaluated first: fits, then transforms
    count_vectorizer.transform(X_test),       # evaluated second: reuses the fitted state
)

In [33]:
# Build a {token: float32 vector} lookup from the pre-trained GloVe text file.
# Each line is split on whitespace: the first field is the token and the
# remaining fields are parsed as its float32 coefficients.
glove_file = '/content/drive/MyDrive/glove.6B.100d.txt'
word_vectors = {}
with open(glove_file, 'r', encoding='utf-8') as glove_handle:
    for fields in map(str.split, glove_handle):
        word_vectors[fields[0]] = np.array(fields[1:], dtype='float32')

In [34]:
def map_word_to_glove(word):
    """Return the GloVe vector stored for ``word``, or zeros if it is unknown.

    Args:
        word: Token to look up in the module-level ``word_vectors`` mapping.

    Returns:
        The stored vector when ``word`` is a key of ``word_vectors``; otherwise
        a fresh ``np.zeros(100)`` array.
    """
    if word in word_vectors:
        return word_vectors[word]
    # Out-of-vocabulary token: fall back to an all-zero vector of length 100.
    return np.zeros(100)

In [35]:
# Create document embeddings
def create_document_embedding(doc, dim=100):
    """Average the GloVe vectors of the whitespace-separated tokens in ``doc``.

    Args:
        doc: Document text; it is split on whitespace and each piece is looked
            up with ``map_word_to_glove``.
        dim: Length of the zero vector returned for a document that contains
            no tokens. Defaults to 100, the length of the zero vector
            ``map_word_to_glove`` returns for unknown words.

    Returns:
        A 1-D numpy array: the element-wise mean of the per-token vectors, or
        ``np.zeros(dim)`` when ``doc`` is empty or whitespace-only.
    """
    words = doc.split()
    if not words:
        # Averaging an empty list yields a NaN scalar (plus a RuntimeWarning),
        # which would make the stacked feature matrix ragged; use zeros instead.
        return np.zeros(dim)
    word_embeddings = [map_word_to_glove(word) for word in words]
    return np.mean(word_embeddings, axis=0)  # Average word embeddings

In [36]:
# Embed every document of each split and stack the per-document vectors into
# one array per split (training first, then test).
X_train_glove, X_test_glove = (
    np.array(list(map(create_document_embedding, split)))
    for split in (X_train, X_test)
)

In [43]:
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api
# Load the `en_core_web_sm` spaCy pipeline; spacy_tokenizer calls it to parse each sentence.
nlp = spacy.load("en_core_web_sm")
# Stop-word collection shipped with the pipeline's language defaults.
stop_words = nlp.Defaults.stop_words
# NOTE: string.punctuation is a single str, so `token in punctuations` is a
# substring test, not a set lookup. The empty string therefore always matches,
# while multi-character tokens such as "..." do not.
punctuations = string.punctuation


In [44]:
# Load the pre-trained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably a large download on first use that is cached afterwards — confirm.
wv = api.load('word2vec-google-news-300')



In [45]:
# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    """Turn ``sentence`` into a list of cleaned, lower-cased lemmas.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``. Each
    token's lemma is lower-cased and stripped of surrounding whitespace, then
    discarded if it is found in ``stop_words`` or in ``punctuations``.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        The surviving lemmas, in their original order.
    """
    kept_lemmas = []
    for token in nlp(sentence):
        lemma = token.lemma_.lower().strip()
        # `punctuations` is a str, so this is a substring check: single
        # punctuation marks and the empty lemma are both filtered out here.
        if lemma in stop_words or lemma in punctuations:
            continue
        kept_lemmas.append(lemma)
    return kept_lemmas

In [46]:
# Tokenize every document once and keep the lemma lists alongside the raw text.
df['data'] = df['Text data'].map(spacy_tokenizer)

In [47]:
def _average_word2vec(tokens):
    """Mean Word2Vec vector of the tokens present in ``wv``.

    Tokens missing from ``wv`` are skipped; if none of them is present, a zero
    vector of length ``wv.vector_size`` is averaged instead.
    """
    known_vectors = [wv[token] for token in tokens if token in wv]
    if not known_vectors:
        known_vectors = [np.zeros(wv.vector_size)]
    return np.mean(known_vectors, axis=0)


df['vec'] = df['data'].apply(_average_word2vec)

In [48]:
X = np.vstack(df['vec'])  # Convert list of arrays to a matrix
y = df['Label']

# The SVM cells below expect X_train_w2v / X_test_w2v. Build them from the row
# labels of the existing text split so every Word2Vec row lines up with the
# same document's entry in y_train / y_test.
X_train_w2v = np.vstack(df.loc[X_train.index, 'vec'].tolist())
X_test_w2v = np.vstack(df.loc[X_test.index, 'vec'].tolist())

# Guard against a silent feature/label mismatch.
assert X_train_w2v.shape[0] == len(y_train)
assert X_test_w2v.shape[0] == len(y_test)

In [49]:

# One linear-kernel SVM per text representation, each built with the same seed
# and trained against the shared training labels. `fit` returns the estimator
# itself, so construction and training are chained.
svm_tfidf = SVC(kernel='linear', random_state=42).fit(X_train_tfidf, y_train)
svm_count = SVC(kernel='linear', random_state=42).fit(X_train_count, y_train)
svm_w2v = SVC(kernel='linear', random_state=42).fit(X_train_w2v, y_train)
svm_glove = SVC(kernel='linear', random_state=42).fit(X_train_glove, y_train)

# Echo the last fitted estimator as the cell output.
svm_glove


In [50]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the held-out features that match each classifier's representation.
y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)
y_pred_count = svm_count.predict(X_test_count)
y_pred_w2v = svm_w2v.predict(X_test_w2v)
y_pred_glove = svm_glove.predict(X_test_glove)

# Overall accuracy and per-class report, paired up per representation.
accuracy_tfidf, report_tfidf = (
    accuracy_score(y_test, y_pred_tfidf),
    classification_report(y_test, y_pred_tfidf),
)
accuracy_count, report_count = (
    accuracy_score(y_test, y_pred_count),
    classification_report(y_test, y_pred_count),
)
accuracy_w2v, report_w2v = (
    accuracy_score(y_test, y_pred_w2v),
    classification_report(y_test, y_pred_w2v),
)
accuracy_glove, report_glove = (
    accuracy_score(y_test, y_pred_glove),
    classification_report(y_test, y_pred_glove),
)

# Print the results in the fixed order: TF-IDF, counts, Word2Vec, GloVe.
for accuracy_line, report_heading, report_text in (
    (f'TF-IDF Accuracy: {accuracy_tfidf}', 'TF-IDF Classification Report:', report_tfidf),
    (f'Count Vectorization Accuracy: {accuracy_count}', 'Count Vectorization Classification Report:', report_count),
    (f'Word2Vec Accuracy: {accuracy_w2v}', 'Word2Vec Classification Report:', report_w2v),
    (f'GloVe Accuracy: {accuracy_glove}', 'GloVe Classification Report:', report_glove),
):
    print(accuracy_line)
    print(report_heading)
    print(report_text)


TF-IDF Accuracy: 0.95874861572536
TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1245
           1       0.93      0.95      0.94      1196
           2       0.99      0.98      0.98      1171

    accuracy                           0.96      3612
   macro avg       0.96      0.96      0.96      3612
weighted avg       0.96      0.96      0.96      3612

Count Vectorization Accuracy: 0.9493355481727574
Count Vectorization Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.94      1245
           1       0.94      0.93      0.93      1196
           2       0.97      0.97      0.97      1171

    accuracy                           0.95      3612
   macro avg       0.95      0.95      0.95      3612
weighted avg       0.95      0.95      0.95      3612

Word2Vec Accuracy: 0.3754152823920266
Word2Vec Classification Report:
            