In [None]:
!pip install indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-2.0.0-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=5.1.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-8.0.2-py3-none-any.whl.metadata (6.2 kB)
Collecting docutils>=0.19 (from sphinx-argparse->indic-nlp-library)
  Downloading docutils-0.21.2-py3-none-any.whl.metadata (2.8 kB)
Collecting sphinx>=5.1.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-7.4.7-py3-none-any.whl.metadata (6.1 kB)
Collecting docutils>=0.19 (from sphinx-argparse->indic-nlp-library)
  Downloading docutils-0.20.1-py3-none-any.whl.metadata (2.8 kB)
Collectin

# Import Libraries and Load Dataset

In [None]:
import pandas as pd
import re
import numpy as np
from gensim.models import FastText, Word2Vec
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.ensemble import RandomForestClassifier
from indicnlp.tokenize import indic_tokenize



In [None]:
# Read the emotion dataset; later cells use its 'Sentence' and 'Emotion' columns.
# NOTE(review): the file lives under /content/drive, so presumably Google Drive has to be
# mounted before this cell runs — the recorded run failed with FileNotFoundError.
dataset_path = '/content/drive/MyDrive/emotion_filtered.csv'
df = pd.read_csv(dataset_path)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/emotion_filtered.csv'

In [None]:
# Target labels shared by the FastText and TF-IDF experiments below.
y = df['Emotion']

# The preview must be the cell's last expression, otherwise Jupyter discards its value
# and nothing is displayed.
df.head()

In [None]:
# Number of rows per emotion label (shows how balanced the classes are).
df['Emotion'].value_counts()

# **Text Preprocessing**

In [None]:
def pre_pro(text):
    """Reduce a sentence to Telugu characters separated by single spaces.

    Parameters
    ----------
    text : str
        Raw sentence. Any non-``str`` value (e.g. a NaN standing in for a
        missing value) is treated as an empty sentence instead of raising.

    Returns
    -------
    str
        ``text`` with every character outside U+0C01..U+0C7F (other than
        whitespace) removed, whitespace runs collapsed to one space, and no
        leading or trailing space.
    """
    if not isinstance(text, str):
        return ''
    # Normalise tabs/newlines/etc. to plain spaces *before* filtering: the
    # character class below keeps only ' ', so any other whitespace would be
    # deleted and the words on either side glued together.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^ఁ-౿ ]', '', text)
    # Dropping symbols that stood between spaces can leave doubled spaces.
    return re.sub(r' +', ' ', text).strip()

# Clean every sentence in place, then show the first rows as a sanity check.
df['Sentence'] = df['Sentence'].map(pre_pro)
print("\nPreprocessed Text:", df.head(), sep="\n")

# FastText Model Training and Embedding Extraction

In [None]:
# Whitespace-tokenise each sentence; values that are not strings are passed through unchanged.
df['Sentence'] = df['Sentence'].apply(
    lambda raw: raw.split() if isinstance(raw, str) else raw
)

# FastText hyper-parameters
embedding_size = 100   # length of each word vector
window_size = 5        # context window, in words
min_word = 6           # minimum frequency a word needs to be kept
down_sampling = 1e-2   # threshold for down-sampling the most frequent words

ft_model = FastText(
    sentences=df['Sentence'],
    vector_size=embedding_size,
    window=window_size,
    min_count=min_word,
    sample=down_sampling,
    sg=1,  # 1 = skip-gram, 0 = CBOW
    epochs=10,
)

print(ft_model.wv)

# Sentence embedding = mean of its word vectors; a sentence that yields no
# vectors falls back to an all-zero vector of the same length.
per_sentence_vectors = (
    [ft_model.wv[token] for token in tokens if token in ft_model.wv]
    for tokens in df['Sentence']
)
X = np.array([
    np.mean(vectors, axis=0) if vectors else np.zeros(embedding_size)
    for vectors in per_sentence_vectors
])


# TF-IDF Embedding Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join the token lists into space-separated strings for the vectorizer
# (values that are already strings are used as they are).
df['text_joined'] = df['Sentence'].apply(
    lambda x: x if isinstance(x, str) else ' '.join(x)
)

# The default token_pattern r"(?u)\b\w\w+\b" does not work for Telugu: Python's \w
# does not match dependent vowel signs or the virama (they are combining marks), so
# words are cut at every such sign and fragments shorter than two characters are
# dropped. The text is already cleaned and space-delimited, so every run of
# non-whitespace is taken as one token instead.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r'\S+')

# NOTE(review): the vectorizer is fitted on the full dataset before the train/test
# split, so IDF weights also see the test rows — confirm this is acceptable.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_joined'])

# Sparse (documents x vocabulary) TF-IDF matrix
print(tfidf_matrix)

In [None]:
# Alias for the TF-IDF matrix under the name the train/test split cell expects.
# Despite the "Y", this holds features, not labels.
TY = tfidf_matrix

# Word2Vec Model Training and Embedding Extraction

In [None]:

from indicnlp.tokenize import indic_tokenize

# Tokenize the cleaned Telugu sentences for Word2Vec.
# df['Sentence'] holds token lists by this point (split in the FastText cell), so the
# tokens must be joined with a space. Joining with "" fuses the whole sentence into a
# single giant "word": the model never sees the real words, and get_document_vector
# then finds almost none of them in the vocabulary.
sentences = [
    indic_tokenize.trivial_tokenize(
        (sentence if isinstance(sentence, str) else " ".join(sentence)).lower()
    )
    for sentence in df['Sentence']
]

# CBOW model (sg=0)
cbow_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, sg=0)

# Skip-gram model (sg=1)
sg_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Save both models to the working directory
cbow_model.save("telugu_word2vec_cbow.model")
sg_model.save("telugu_word2vec_sg.model")

# The document vectors below are built from the CBOW model, reloaded from disk.
w2v_model = Word2Vec.load("telugu_word2vec_cbow.model")

def get_document_vector(doc):
    """Return the mean Word2Vec vector of the words in ``doc``.

    ``doc`` may be a string or a list of tokens; a list is joined with spaces
    first, and the text is then split on whitespace. Words missing from
    ``w2v_model.wv`` are ignored. If no word is found, a zero vector of
    length ``w2v_model.vector_size`` is returned.
    """
    text = " ".join(doc) if isinstance(doc, list) else doc
    known_vectors = [
        w2v_model.wv[token] for token in text.split() if token in w2v_model.wv
    ]
    if not known_vectors:
        return np.zeros(w2v_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Create document vectors
# Labels paired with the Word2Vec features.
yw = df['Emotion']

# One averaged Word2Vec vector per row of df['Sentence'].
X_w2v = np.array(list(map(get_document_vector, df['Sentence'])))

# Train-Test Split

In [None]:
# 80/20 split with one fixed seed, applied to each feature representation:
# Word2Vec (W*), TF-IDF (T*) and FastText (unprefixed).
split_options = {'test_size': 0.2, 'random_state': 42}
WX_train, WX_test, Wy_train, Wy_test = train_test_split(X_w2v, yw, **split_options)
TX_train, TX_test, Ty_train, Ty_test = train_test_split(TY, y, **split_options)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# SVM Model Training and Evaluation

In [None]:
# Linear-kernel SVM evaluated on each feature representation in turn.

# SVM with TF-IDF
clf = SVC(kernel='linear')  # You can explore other kernels as well
clf.fit(TX_train, Ty_train)
Ty_pred = clf.predict(TX_test)
print(f'\nSVM with TF-IDF Accuracy: {accuracy_score(Ty_test, Ty_pred)}')
print(classification_report(Ty_test, Ty_pred))

# SVM with FastText (calling fit again retrains the same estimator from scratch)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'\nSVM with FastText Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# SVM with Word2Vec
svm_model = SVC(kernel='linear')
svm_model.fit(WX_train, Wy_train)
yw_pred = svm_model.predict(WX_test)
# Score the Word2Vec predictions (yw_pred). The previous code passed y_pred here,
# which silently reported the FastText model's accuracy under the Word2Vec label.
print(f'\nSVM with Word2Vec Accuracy: {accuracy_score(Wy_test, yw_pred)}')
print(classification_report(Wy_test, yw_pred))


# KNN Model Training and Evaluation

In [None]:
# A single 5-nearest-neighbour classifier, refitted on each feature representation.
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# FastText features
y_pred = knn_classifier.fit(X_train, y_train).predict(X_test)
fasttext_accuracy = accuracy_score(y_test, y_pred)
print(f'\nKNN with FastText Accuracy: {fasttext_accuracy}')
print(classification_report(y_test, y_pred))

# TF-IDF features
Ty_pred = knn_classifier.fit(TX_train, Ty_train).predict(TX_test)
tfidf_accuracy = accuracy_score(Ty_test, Ty_pred)
print(f'\nKNN with TF-IDF Accuracy: {tfidf_accuracy}')
print(classification_report(Ty_test, Ty_pred))

# Word2Vec features (predictions are stored in y_pred, replacing the FastText ones)
y_pred = knn_classifier.fit(WX_train, Wy_train).predict(WX_test)
word2vec_accuracy = accuracy_score(Wy_test, y_pred)
print(f'\nKNN with Word2Vec Accuracy: {word2vec_accuracy}')
print(classification_report(Wy_test, y_pred))


# Logistic Regression Model Training and Evaluation

In [None]:
# One LogisticRegression estimator (default settings), refitted per feature representation.
logreg_classifier = LogisticRegression()

# FastText features
y_pred = logreg_classifier.fit(X_train, y_train).predict(X_test)
fasttext_accuracy = accuracy_score(y_test, y_pred)
print(f'\nLogistic Regression with FastText Accuracy: {fasttext_accuracy}')
print(classification_report(y_test, y_pred))

# TF-IDF features
Ty_pred = logreg_classifier.fit(TX_train, Ty_train).predict(TX_test)
tfidf_accuracy = accuracy_score(Ty_test, Ty_pred)
print(f'\nLogistic Regression with TF-IDF Accuracy: {tfidf_accuracy}')
print(classification_report(Ty_test, Ty_pred))

# Word2Vec features
Wy_pred = logreg_classifier.fit(WX_train, Wy_train).predict(WX_test)
word2vec_accuracy = accuracy_score(Wy_test, Wy_pred)
print(f'\nLogistic Regression with Word2Vec Accuracy: {word2vec_accuracy}')
print(classification_report(Wy_test, Wy_pred))


# Decision Tree Model Training and Evaluation

In [None]:
# Decision tree evaluated on each feature representation in turn.
# random_state is fixed (same seed as the train/test split) so the fitted trees, and
# therefore the reported scores, are the same on every run.

# Decision Tree with FastText
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)
print(f'\nDecision Tree with FastText Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# Decision Tree with TF-IDF (fit retrains the same estimator from scratch)
dt_classifier.fit(TX_train, Ty_train)
Ty_pred = dt_classifier.predict(TX_test)
print(f'\nDecision Tree with TF-IDF Accuracy: {accuracy_score(Ty_test, Ty_pred)}')
print(classification_report(Ty_test, Ty_pred))

# Decision Tree with Word2Vec
dt_classifier.fit(WX_train, Wy_train)
Wy_pred = dt_classifier.predict(WX_test)
print(f'\nDecision Tree with Word2Vec Accuracy: {accuracy_score(Wy_test, Wy_pred)}')
print(classification_report(Wy_test, Wy_pred))


# Naive Bayes Model Training and Evaluation

In [None]:
# Multinomial Naive Bayes on each feature representation. Features are rescaled first
# (MinMaxScaler for the dense embeddings, MaxAbsScaler for the TF-IDF matrix) —
# presumably because MultinomialNB does not accept negative feature values.
# Each scaler is fitted on the training split only and then applied to the test split.

# FastText features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
nb_classifier = MultinomialNB()
y_pred = nb_classifier.fit(X_train_scaled, y_train).predict(X_test_scaled)
fasttext_accuracy = accuracy_score(y_test, y_pred)
print(f'\nNaive Bayes with FastText Accuracy: {fasttext_accuracy}')
print(classification_report(y_test, y_pred))

# TF-IDF features (X_train_scaled / X_test_scaled are reused for the scaled matrix)
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(TX_train)
X_test_scaled = scaler.transform(TX_test)
Ty_pred = nb_classifier.fit(X_train_scaled, Ty_train).predict(X_test_scaled)
tfidf_accuracy = accuracy_score(Ty_test, Ty_pred)
print(f'\nNaive Bayes with TF-IDF Accuracy: {tfidf_accuracy}')
print(classification_report(Ty_test, Ty_pred))

# Word2Vec features
scaler = MinMaxScaler()
WX_train_scaled = scaler.fit_transform(WX_train)
WX_test_scaled = scaler.transform(WX_test)
Wy_pred = nb_classifier.fit(WX_train_scaled, Wy_train).predict(WX_test_scaled)
word2vec_accuracy = accuracy_score(Wy_test, Wy_pred)
print(f'\nNaive Bayes with Word2Vec Accuracy: {word2vec_accuracy}')
print(classification_report(Wy_test, Wy_pred))


# Random Forest Model Training and Evaluation

In [None]:
# Random forest (100 trees) evaluated on each feature representation in turn.
# random_state is fixed (same seed as the train/test split) so bootstrap sampling and
# feature selection, and therefore the reported scores, are the same on every run.

# Random Forest with FastText
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
print(f'\nRandom Forest with FastText Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# Random Forest with TF-IDF (fit retrains the same estimator from scratch)
rf_classifier.fit(TX_train, Ty_train)
Ty_pred = rf_classifier.predict(TX_test)
print(f'\nRandom Forest with TF-IDF Accuracy: {accuracy_score(Ty_test, Ty_pred)}')
print(classification_report(Ty_test, Ty_pred))

# Random Forest with Word2Vec
rf_classifier.fit(WX_train, Wy_train)
Wy_pred = rf_classifier.predict(WX_test)
print(f'\nRandom Forest with Word2Vec Accuracy: {accuracy_score(Wy_test, Wy_pred)}')
print(classification_report(Wy_test, Wy_pred))
