In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from google.colab import drive
from gensim.models import KeyedVectors



In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset and embedding paths used by
# the following cells all live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the augmented CSV from Drive and keep only the two columns the
# experiments use: the raw text and its class label.
df = pd.read_csv(
    '/content/drive/MyDrive/mini sharanya/dataset/augmented_dataset1 (1).csv'
)[['Text data', 'Label']]

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Text data,Label
0,waiting for my mind to have breakdown once the...,1
1,for my new anymore little bit not of just and ...,1
2,new year feeling there else depressed last eve...,1
3,for my to have the new feeling know about anyo...,1
4,to the new year in start and into great myself...,1


In [None]:
# Hold out 20% of the rows for testing; the seed keeps the partition fixed
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text data'],
    df['Label'],
    test_size=0.2,
    random_state=42,
)

# The vectorizers and the whitespace tokenizer below expect str inputs, so
# coerce both text splits to plain strings.
X_train, X_test = (split.astype(str) for split in (X_train, X_test))


In [None]:
# TF-IDF features: fit the vectorizer on the training texts only, then reuse
# the fitted instance to encode the held-out texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)


In [None]:
# Bag-of-words counts: fit the vectorizer on the training texts only, then
# reuse the fitted instance to encode the held-out texts.
count_vectorizer = CountVectorizer()
X_train_count, X_test_count = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)


In [None]:
# Parse the GloVe text file into an in-memory lookup table. Every line is split
# on whitespace: the first field is the token, the remaining fields are its
# vector components (stored as float32).
glove_file = '/content/drive/MyDrive/mini sharanya/dataset/glove.6B-20240619T062559Z-002/glove.6B/glove.6B.200d.txt'
word_vectors = {}
with open(glove_file, encoding='utf-8') as glove_handle:
    for row in glove_handle:
        fields = row.split()
        word_vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Define a function to map words to GloVe embeddings
def map_word_to_glove(word, vectors=None, dim=200):
    """Return the embedding for ``word``, or a zero vector if it is unknown.

    Args:
        word: Token to look up. It is matched verbatim (no case folding or
            punctuation stripping is applied here).
        vectors: Optional mapping of token -> 1-D array. When omitted, the
            module-level ``word_vectors`` table is used.
        dim: Length of the zero vector returned for out-of-vocabulary tokens.
            Defaults to 200, the size this notebook uses for its zero vectors.

    Returns:
        The stored vector for ``word`` (the same object, not a copy), or
        ``np.zeros(dim)`` when the token is not in the table.
    """
    table = word_vectors if vectors is None else vectors
    vector = table.get(word)
    # Build the fallback only on a miss: ``table.get(word, np.zeros(dim))``
    # would allocate a throwaway array on every call, hits included.
    return np.zeros(dim) if vector is None else vector

# Create document embeddings
def create_document_embedding(doc, vectors=None, dim=200):
    """Embed a document as the mean of its tokens' vectors.

    The document is split on whitespace. Unknown tokens contribute a zero
    vector to the average, so they pull the mean towards zero rather than
    being skipped.

    Args:
        doc: Text to embed.
        vectors: Optional mapping of token -> 1-D array, forwarded to
            ``map_word_to_glove``. Defaults to the module-level table.
        dim: Embedding length, used for unknown tokens and for documents that
            contain no tokens at all.

    Returns:
        1-D array of length ``dim``; all zeros for an empty/blank document.
    """
    words = doc.split()
    if not words:  # empty or whitespace-only document
        return np.zeros(dim)
    word_embeddings = [map_word_to_glove(word, vectors, dim) for word in words]
    return np.mean(word_embeddings, axis=0)  # average over tokens

# Embed every training and test document; each result is a list holding one
# averaged vector per document.
X_train_glove = list(map(create_document_embedding, X_train))
X_test_glove = list(map(create_document_embedding, X_test))



In [None]:
# Train one identically seeded decision tree per text representation so the
# three feature sets can be compared on equal terms. ``fit`` returns the
# estimator itself, so construction and training are chained.
dt_tfidf = DecisionTreeClassifier(random_state=42).fit(X_train_tfidf, y_train)
dt_count = DecisionTreeClassifier(random_state=42).fit(X_train_count, y_train)
dt_glove = DecisionTreeClassifier(random_state=42).fit(X_train_glove, y_train)

# Keep the fitted GloVe model as the cell's displayed value.
dt_glove

In [None]:
# Make Predictions and Evaluate
# Each fitted tree is scored on the held-out split of its own representation;
# results are printed in the order TF-IDF, count vectors, GloVe.

# --- TF-IDF features ---
y_pred_tfidf = dt_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
report_tfidf = classification_report(y_test, y_pred_tfidf)
print(
    f'TF-IDF Accuracy: {accuracy_tfidf}',
    'TF-IDF Classification Report:',
    report_tfidf,
    sep='\n',
)

# --- Count-vector features ---
y_pred_count = dt_count.predict(X_test_count)
accuracy_count = accuracy_score(y_test, y_pred_count)
report_count = classification_report(y_test, y_pred_count)
print(
    f'Count Vectorization Accuracy: {accuracy_count}',
    'Count Vectorization Classification Report:',
    report_count,
    sep='\n',
)

# --- Averaged GloVe features ---
y_pred_glove = dt_glove.predict(X_test_glove)
accuracy_glove = accuracy_score(y_test, y_pred_glove)
report_glove = classification_report(y_test, y_pred_glove)
print(
    f"GloVe Accuracy: {accuracy_glove}",
    'GloVe Classification Report:',
    report_glove,
    sep='\n',
)

TF-IDF Accuracy: 0.9343853820598007
TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      1245
           1       0.92      0.93      0.92      1196
           2       0.95      0.96      0.96      1171

    accuracy                           0.93      3612
   macro avg       0.93      0.93      0.93      3612
weighted avg       0.93      0.93      0.93      3612

Count Vectorization Accuracy: 0.9396456256921373
Count Vectorization Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      1245
           1       0.92      0.94      0.93      1196
           2       0.95      0.97      0.96      1171

    accuracy                           0.94      3612
   macro avg       0.94      0.94      0.94      3612
weighted avg       0.94      0.94      0.94      3612

GloVe Accuracy: 0.9180509413067552
GloVe Classification Report:
              pr

In [81]:
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier  # Import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api

# Make the Drive-hosted files reachable from the runtime.
drive.mount('/content/drive')

# Re-read the full dataset (all columns) for the Word2Vec experiment.
dataset_csv = "/content/drive/MyDrive/mini sharanya/dataset/augmented_dataset1 (1).csv"
df = pd.read_csv(dataset_csv)

# spaCy pipeline used for lemmatization, plus the stop-word collection and the
# punctuation characters that the tokenizer filters out.
nlp = spacy.load("en_core_web_sm")
stop_words, punctuations = nlp.Defaults.stop_words, string.punctuation

# Pre-trained Word2Vec vectors fetched through gensim's downloader.
wv = api.load('word2vec-google-news-300')

# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    """Lemmatize ``sentence`` with spaCy and drop stop words and punctuation.

    Args:
        sentence: Raw text passed to the module-level ``nlp`` pipeline.

    Returns:
        List of lower-cased, whitespace-stripped lemmas, excluding empty
        strings, entries of ``stop_words`` and tokens made up solely of
        characters found in ``punctuations``.
    """
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatize each token and normalise it to lower case without outer whitespace.
    lemmas = (token.lemma_.lower().strip() for token in doc)

    # ``punctuations`` is a plain string, so ``lemma not in punctuations`` would be
    # a substring test: "()" would be dropped but "..." or "--" would slip through,
    # and empty tokens would be removed only by accident. Test character by
    # character instead and reject empty tokens explicitly.
    return [
        lemma
        for lemma in lemmas
        if lemma
        and lemma not in stop_words
        and not all(char in punctuations for char in lemma)
    ]

def _mean_word2vec(tokens):
    """Average the Word2Vec vectors of the tokens present in ``wv``.

    Falls back to a single zero vector of ``wv.vector_size`` when none of the
    tokens is in the vocabulary, so every document gets a vector.
    """
    known_vectors = [wv[token] for token in tokens if token in wv]
    if not known_vectors:
        known_vectors = [np.zeros(wv.vector_size)]
    return np.mean(known_vectors, axis=0)


# Tokenize every document in 'Text data'; the token lists go into a new 'data' column.
df['data'] = df['Text data'].apply(spacy_tokenizer)

# One averaged word vector per document, stored in the 'vec' column.
df['vec'] = df['data'].apply(_mean_word2vec)

# Stack the per-document vectors into a 2-D feature matrix and pair it with the labels.
X, y = np.vstack(df['vec']), df['Label']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

# Define the pipeline with DecisionTreeClassifier
# The tree is seeded so the fitted model, and therefore the metrics reported
# below, are repeatable across runs; an unseeded tree can differ from run to
# run. The seed matches the one used for the TF-IDF / count / GloVe trees.
model_pipeline_dt = Pipeline([
    ('dt', DecisionTreeClassifier(random_state=42))  ])

# Train the model
model_pipeline_dt.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred_dt = model_pipeline_dt.predict(X_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
# Support-weighted averages across the classes.
precision_dt, recall_dt, f1_score_dt = (
    scorer(y_test, y_pred_dt, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
classification_report_dt = classification_report(y_test, y_pred_dt)

# Report the accuracy and the per-class breakdown.
print(
    "Evaluation Metrics for Decision Tree Model",
    "------------------------------------------------",
    f'word 2 vectorizer Accuracy: {accuracy_dt}',
    'word 2 vectorizer Report:',
    classification_report_dt,
    sep="\n",
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Evaluation Metrics for Decision Tree Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.92      0.89      0.91      1194
           1       0.90      0.93      0.91      1236
           2       0.95      0.95      0.95      1182

    accuracy                           0.92      3612
   macro avg       0.92      0.92      0.92      3612
weighted avg       0.92      0.92      0.92      3612

word 2 vectorizer Accuracy: 0.9227574750830565
word 2 vectorizer Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91      1194
           1       0.90      0.93      0.91      1236
           2       0.95      0.95      0.95      1182

    accuracy                           0.92      3612
   macro avg       0.92      0.92      0.92      3612