In [None]:
%pip install mahaNLP==0.6 -q

# MARATHI SENTIMENT ANALYSIS

In [None]:
# Helper libraries
import pandas as pd
import numpy as np
import re

# Machine Learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# NLP libraries
import nltk
from nltk.stem.snowball import SnowballStemmer

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reading Data
df = pd.read_csv('/content/final_marathi_data.csv')
df.head()

Unnamed: 0,Sentence,Score
0,चा फक्त नारा देऊन उपयोग नाही महिला अत्याचाराच्या आरोपींना वेळीच कठोर शासनही झालं पाहिजे पण गहुंजे खटल्यात अक्षम्य दिरंगाई झाली आहे महिला सुरक्षेबाबत तत्परतेचे दावे फोल ठरले आहेत गहुंजेच्या आरोपींना फाशी होणेबाबत सरकारने तातडीने कायदेशीर पावले उचलली पाहिजे,1
1,पेट्रोल आणि डिझेलवर एक रूपया अधिभार लावल्याने शेतकर्‍यांना ट्रॅक्टर शेती मालवाहतूक यासाठी आता आणखी भार पडणार आहे शेतकर्‍यांसाठी एकही नवीन घोषणा योजना या अर्थसंकल्पात नाही शेतकरी महिला युवा अशा सर्व वर्गांना निराश करणारा हा अर्थसंकल्प आहे,-1
2,लूट झूट का राज भागावो रोजगार और विकास लावो हा नारा बुलंद करत निघालेल्या युवक काँग्रेसच्या युवा क्रांती यात्रेचे देवगाव फाटा जि वर्धा येथे युवक काँग्रेसच्या कार्यकर्त्यांनी दिमाखात स्वागत केले,1
3,महाराष्ट्र विकास आघाडी सरकारच्या मंत्रिमंडळ विस्तारासाठी विधानमंडळात संपन्न झालेल्या सोहळ्यात मंत्रिपदाची शपथ घेतलेल्या ३६ नवीन मंत्र्यांचे हार्दिक अभिनंदन महाराष्ट्राच्या उत्कर्षासाठी व सर्वांगीण विकासासाठी हे मंत्रिमंडळ कार्यरत राहोयासाठी शुभेच्छा,1
4,पिंपरीत अ‍ॅथलेटिक्स आणि विविध क्षेत्रांतल्या खेळाडूंसोबत बैठक घेतली गुणवान खेळाडूंना सरकारी नोकऱ्यांमध्ये ५ टक्के जागा देण्यासारखे महत्त्वाचे निर्णय आम्ही घेतले तसेच खेळाडूंच्या हिताचे निर्णय घेण्यासाठी नेहमीच कटिबद्ध आहेअसा विश्वास त्यांना दिला,1


In [None]:
# Checking for null values and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15164 entries, 0 to 15163
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  15164 non-null  object
 1   Score     15164 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 237.1+ KB


## Data Preprocess

In [None]:
import unicodedata
import regex as re

def is_marathi(text):
    marathi_script_range = re.compile(r'[\p{Devanagari}]+')
    return bool(re.match(marathi_script_range, text))

def filter_non_marathi(text):
    marathi_text = ""
    for word in text.split():
      for char in word:
        if is_marathi(char):
          marathi_text += char
      marathi_text += " "
    return marathi_text[:-1].strip()

def normalize_marathi_text(text):
    normalized_text = unicodedata.normalize('NFKC', text)
    return normalized_text

def remove_punctuation(text):
    punctuation_pattern = re.compile(r'[^\w\s]')
    text_without_punctuation = re.sub(punctuation_pattern, '', text)
    return text_without_punctuation

def remove_marathi_numbers(text):
    pattern = r'[०-९]'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

marathi_stopwords = [
    "काही", "ती", "असे", "म्हणून", "याच्य", "मी", "पण", "आण", "आहे", "होत", "आपल्या", "करून", "ते",
    "आहे", "हे", "होत", "आणि", "आम्ही", "आहोत", "त्यानी", "त्याना", "त्यामुळे", "की",
    "ते", "तो", "ती", "याना", "यानी", "असून", "आले", "आली", "आला", "का",
    "करणयात", "किव्हा", "किंव्हा", "करणार", "करत", "करताना", "करते", "करतो",
    "करून", "काम", "काय", "का", "कसा", "कसे", "कुठला", "कुठली", "कुठले",
    "खरच", "गेल्या", "जाऊ", "जात", "जातात", "जाते", "जाणार", "जे", "झाला",
    "झाली", "झाले", "झाल्या", "झालेला", "टा", "टी", "डॉ", "डॉ॰", "तर", "तरी",
    "तरीही", "तसेच", "तसेच", "तसेची", "तसेचे", "तसेचा", "त्या", "त्याचा", "त्याची",
    "त्याच्या", "त्यानी", "त्यामुळे", "त्यास", "त्यांना", "त्यांनी", "त्यांना", "त्यांनी",
    "नाही", "पण", "मात्र", "या", "याचा", "याची", "याच्या", "याना", "यानी", "यास",
    "यांचा", "यांची", "यांच्या", "यांना", "यांनी", "यांच्या", "येणार", "येत",
    "राहिल्या", "व", "व्हा", "व्हाय", "वर", "सुरू", "होत्या", "होती", "होते", "होता"
]

def remove_stopwords(text, stopwords):
    words = text.split()
    filtered_words = [word for word in words if word not in stopwords]
    text_without_stopwords = ' '.join(filtered_words)
    return text_without_stopwords

suffixes = {
    6: ["समाविष्ट","योग्यता", "प्रमाणे"],
    5: ["विषयी", "समोरी", "असून", "तमाना", "यामुळे", "स्वरूप", "तयार", "तयारी", "सारखा",
        "संपूर्ण", "देणगी", "करत",  "सारखी", "समज", "कारण",  "आवड",  "सारखं", "म्हणून"],
    4: ["याच्या", "यांच्या", "योग्य",],
    3: ["च्या", "च्या", "च्या", "लाही", "णार", "णाय", "णारा",
        "णारी", "णारे", "यानी", "याला", "याचा", "याची", "यांचा",
        "यांची", "पणी", "सुरू", "अशा", "च्या", "पण", "शील",
        "पण", "कृत", "कधी", "येणे", "येत", "येते", "पण", "आधी", "मुळे",
        "मात्र", "साठी", "साठी"],
    2: ["ने", "च(्" "से", "त" "नी","चे", "ला", "ची" "ला", "ता", "ती", "ले", "ली", "या", "ये", "णे", "णी", "चा", "ची"],
    1: ["ा", "ी", "ु", "े", "ो"]
}

def stem_marathi_word(word):
    for length, suff_list in suffixes.items():
            for suffix in suff_list:
                if word.endswith(suffix):
                  new_word = word[:-length]
                  if(len(new_word)>1 and new_word[-1] in ["ा", "ी", "ु", "े", "ो"]):
                    return new_word[:-1]
    return word

def stem_marathi_text(text):
    stemmed_words = [stem_marathi_word(word) for word in text.split()]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

def preprocess(df):
  corpus = []
  for mixed_text in df:

    # Normalize Marathi text
    normalized_text = normalize_marathi_text(mixed_text)

    # Fitlter non Marathi words
    newText = filter_non_marathi(normalized_text)

    # Remove Punctuations
    newText = remove_punctuation(newText)

    # Remove numbers
    newText = cleaned_marathi_text = remove_marathi_numbers(newText)

    # Removing Stopwords
    newText = remove_stopwords(newText, marathi_stopwords)

    # Lematization
    newText = stem_marathi_text(newText)

    corpus.append(newText)
  return corpus



In [None]:
# Helper Function to convert probabilities to POSITIVE, NEGATIVE and NEUTRAL class
def get_score(predictions):
    positive_threshold = 0.6
    negative_threshold = 0.5
    sentiment_scores = []
    for pred in predictions:
        if pred >= positive_threshold:
            sentiment_scores.append('Positive')
        elif pred <= negative_threshold:
            sentiment_scores.append('Negative')
        else:
            sentiment_scores.append('Neutral')
    return sentiment_scores

In [None]:
X = preprocess(df["Sentence"])
y = df['Score'].values

## Bow

The bag-of-words model is a model of text which uses a representation of text that is based on an unordered collection (or "bag") of words.

Bag of Words model is used to preprocess the text by converting it into a bag of words, which keeps a count of the total occurrences of most frequently used words.

In [None]:
# Applying the BOW model
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500)

In [None]:
# Fitting the countVectorizer, defining the input and class variables
X = vectorizer.fit_transform(X)
y = df["Score"].to_numpy()

In [None]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fitting a simple ML model, Random Forest
rfc1 = RandomForestClassifier()
rfc1.fit(X_train, y_train)

In [None]:
# Testing on Test data, and finding Accuracy
y_pred1 = rfc1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred1)
print(f"Accuracy on the marathi test set: {accuracy}")

Accuracy on the marathi test set: 0.6851302340916584


In [None]:
# Classification Report
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
report = classification_report(y_test, y_pred1)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

          -1       0.67      0.72      0.70       963
           0       0.67      0.68      0.68      1066
           1       0.71      0.65      0.68      1004

    accuracy                           0.69      3033
   macro avg       0.69      0.69      0.69      3033
weighted avg       0.69      0.69      0.68      3033



In [None]:
# Testing on unseen data 1
sent = "आजचा संघर्ष उद्याचे सामर्थ्य निर्माण करतो, विचार बदला आयुष्य बदलेल."
preprocessed_text = preprocess([sent])
final_text = vectorizer.transform(preprocessed_text)

In [None]:
y_pred = rfc1.predict_proba(final_text)[:, 1]
sentiment_scores = get_score(y_pred)
print(sentiment_scores)

['Positive']


In [None]:
# Testing on unseen data 2
sent = "त्याच्या स्नेहातून आम्हाला विश्वासघात झाला."
preprocessed_text = preprocess([sent])
final_text = vectorizer.transform(preprocessed_text)

In [None]:
y_pred = rfc1.predict_proba(final_text)[:, 1]
sentiment_scores = get_score(y_pred)
print(sentiment_scores)

['Negative']


## Word Embeddings using MahaNLP libraries

Word Embeddings are numeric representations of words in a lower-dimensional space, capturing semantic and syntactic information.

In [None]:
import mahaNLP
from mahaNLP.preprocess import Preprocess
from mahaNLP.tokenizer import Tokenize
from mahaNLP.similarity import SimilarityAnalyzer

# Creating Objects
tokenizer = Tokenize()
preprocessor = Preprocess()
sa = SimilarityAnalyzer()

In [None]:
# Simple Representation of Embeddings using MahaNLP
text = 'चा फक्त नारा देऊन उपयोग नाही महिला अत्याचाराच्या आरोपींना वेळीच कठोर शासनही झालं पाहिजे पण गहुंजे खटल्यात अक्षम्य दिरंगाई झाली आहे महिला सुरक्षेबाबत तत्परतेचे दावे फोल ठरले आहेत गहुंजेच्या आरोपींना फाशी होणेबाबत सरकारने तातडीने कायदेशीर पावले उचलली पाहिजे'
vector = sa.embed_sentences(text)
print(vector)
print("Length of a Vector:", len(vector))

[-2.26324443e-02 -4.97051165e-04 -2.56957728e-02  2.28839251e-03
 -1.42963519e-02 -1.10252956e-02 -1.11475708e-02 -7.22294673e-03
 -4.45532240e-03  8.30940064e-03 -5.96412923e-03  4.56204219e-03
  1.65175125e-02 -9.67977941e-03 -1.29048387e-02 -1.10543622e-02
  2.63414346e-03 -3.42494645e-03  8.76469817e-03  7.57106580e-04
 -8.43753759e-03 -4.14035609e-03 -3.93050024e-03  1.75644252e-02
 -4.34277393e-03  2.12560017e-02  4.18456877e-03  3.05200787e-03
  7.12956628e-03  8.12358782e-03  1.94603682e-03  1.49035882e-02
 -1.34547299e-03  2.91491044e-03 -5.45054208e-03  1.33419060e-03
 -2.41460782e-02 -4.34481865e-03  5.59834670e-03 -1.34870922e-02
 -1.45083638e-02  4.35144058e-04 -9.54359304e-03 -3.48413596e-04
  2.06707953e-03 -4.82641160e-04 -1.01466794e-02 -1.52115747e-02
 -4.46726847e-03  3.88901425e-03  1.00916158e-02 -3.27809416e-02
  3.50684747e-02  9.42701567e-03  6.42330502e-04 -1.29294852e-02
  4.22950659e-04 -1.09158922e-02 -4.55000252e-03 -4.28421702e-03
  1.15404241e-02  3.78591

In [None]:
# Helper function to preprocess the data
# 1.) Remove Stopwords
# 2.) Vectorize data

def preprocess_using_mahaNLP(df):
  corpus = []
  for sent in df:
      text = preprocessor.remove_stopwords(sent)
      text = " ".join(text)
      vector = sa.embed_sentences(text)
      corpus.append(vector)
  return corpus

In [None]:
# Defining X and y
X = preprocess_using_mahaNLP(df['Sentence'].iloc[:5000])
y = df['Score'].iloc[:5000]

In [None]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fitting a simple ML model, Random Forest
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [None]:
# Accuracy
y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the marathi test set: {accuracy}")

Accuracy on the marathi test set: 0.774


In [None]:
# Classification Report
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

          -1       0.75      0.86      0.80       322
           0       0.76      0.72      0.74       345
           1       0.81      0.74      0.78       333

    accuracy                           0.77      1000
   macro avg       0.78      0.78      0.77      1000
weighted avg       0.78      0.77      0.77      1000



## TF-IDF

TF-IDF stands for Term Frequency Inverse Document Frequency of records. It can be defined as the calculation of how relevant a word in a series or corpus is to a text. The meaning increases proportionally to the number of times in the text a word appears but is compensated by the word frequency in the corpus (data-set).

In [None]:
# Create object
tfidf_vectorizer = TfidfVectorizer(max_features=1500)

In [None]:
X = df["Sentence"].values
y = df["Score"].values

In [None]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Get tf-df values
X_train_idf = tfidf_vectorizer.fit_transform(X_train)
X_test_idf = tfidf_vectorizer.transform(X_test)

In [None]:
# Fit ML model
rfc2 = RandomForestClassifier()
rfc2.fit(X_train_idf, y_train)

In [None]:
# Accuracy
y_pred2 = rfc2.predict(X_test_idf)
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy on the marathi test set: {accuracy}")

Accuracy on the marathi test set: 0.6907352456313881


In [None]:
# Classification Report
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
report = classification_report(y_test, y_pred2)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

          -1       0.67      0.78      0.72       963
           0       0.68      0.69      0.69      1066
           1       0.74      0.61      0.67      1004

    accuracy                           0.69      3033
   macro avg       0.69      0.69      0.69      3033
weighted avg       0.69      0.69      0.69      3033



In [None]:
# Testing on Unseen data 1
sent = "तुमच्यावर मला अप्रतिम विश्वास आहे."
preprocessed_text = preprocess([sent])
final_text = tfidf_vectorizer.transform(preprocessed_text)

In [None]:
y_pred = rfc1.predict_proba(final_text)[:, 1]
sentiment_scores = get_score(y_pred)
print(sentiment_scores)

['Positive']


In [None]:
# Testing on Unseen data 2
sent = "माझ्या स्वप्नांचं निराशा होतं, कारण त्यांनी माझ्यावर विश्वास घालायला असं नको."
preprocessed_text = preprocess([sent])
final_text = tfidf_vectorizer.transform(preprocessed_text)

In [None]:
y_pred = rfc1.predict_proba(final_text)[:, 1]
sentiment_scores = get_score(y_pred)
print(sentiment_scores)

['Neutral']


### Other models

In [None]:
!pip install lazypredict -q

In [None]:
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train_df, X_test_df, y_train, y_test)
print(models)

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix")
print(conf_matrix)