In [2]:
import pandas as pd
import math
from collections import Counter
import numpy as np
import pickle
from sklearn import metrics
import re

In [3]:
df = pd.read_csv('./data/dataset.csv')
df = df.loc[:15000]
df.shape

(15001, 2)

In [4]:
first_col = df.iloc[1:, 0]
second_col = df.iloc[1:, 1]
second_col = second_col.fillna(0)

In [5]:
stop_words_file = open("./data/stopwords.txt","r",encoding="utf-8")
stop_words = stop_words_file.read()
stop_words = stop_words.split("\n")

In [6]:
#data cleaning method
def data_cleaning(string):
    text = re.sub('\,|\@|\-|\"|\'| \)|\(|\)| \{| \}| \[| \]|!|‘|’|“|”| \:-|\?|।|/|\—|\०|\१|\२|\३|\४|\५|\६|\७|\८|\९|[0-9]', '', string)
    return text

def stop_word_remove(array_element):
    array_element_set = set(array_element)
    final_list = list(array_element_set.difference(stop_words))
    return final_list

In [7]:

def tokenize():
    data_with_split = []
    for data in first_col:
        return_string = data_cleaning(data)
        each_docs = return_string.split(" ")
        string_after_remove_word=stop_word_remove(each_docs)
        
        data_with_split.append(each_docs)
    return data_with_split  # it returns arr of each docs with spleted words



corpus = tokenize()

In [8]:
class TFIDFVectorizer:
    def __init__(self):
        self.vocabulary = None
        self.idf = None
        self.vocabulary = set()
        
        # Build vocabulary
        for document in corpus:
            self.vocabulary.update(document)
        self.vocabulary = list(self.vocabulary)

    def get_tif_idf_info(self,words,sentence_feature):
        tf_idf_info = {}
        for word in words:
            index = self.vocabulary.index(word)
            tf_idf_info[word]=sentence_feature[0][index]
        return tf_idf_info
    
    def fit_transform(self, corpus):
        # Calculate IDF
        idf = {}
        N = len(corpus)
        for term in self.vocabulary:
            df = sum(1 for document in corpus if term in document)
            idf[term] = math.log(N / (1 + df))

        # Transform documents to TF-IDF representation
        tfidf_matrix = np.zeros((len(corpus), len(self.vocabulary)))
        for i, document in enumerate(corpus):
            tf = Counter(document)
            total_terms = len(document)
            for j, term in enumerate(self.vocabulary):
                if total_terms != 0:
                    tfidf_matrix[i, j] = (tf.get(term, 0) / total_terms) * idf[term]
                else:
                    tfidf_matrix[i, j] = 0  # Set TF-IDF to 0 if total_terms is 0

        self.idf = idf
        return tfidf_matrix

    def transform(self, corpus):
        tfidf_matrix = np.zeros((len(corpus), len(self.vocabulary)))
        for i, document in enumerate(corpus):
            tf = Counter(document)
            total_terms = len(document)
            for j, term in enumerate(self.vocabulary):
                if total_terms != 0:
                    tfidf_matrix[i, j] = (tf.get(term, 0) / total_terms) * self.idf.get(term, 0)
                else:
                    tfidf_matrix[i, j] = 0  # Set TF-IDF to 0 if total_terms is 0
        return tfidf_matrix

# Create TFIDFVectorizer instance
tfidf_vectorizer = TFIDFVectorizer()

# Fit and transform corpus
features = tfidf_vectorizer.fit_transform(corpus)

vectorizer_data = open("./webapp/model/vectorizer.pkl", "wb")
pickle.dump(tfidf_vectorizer, vectorizer_data)
vectorizer_data.close()

with open('./webapp/model/vectorizer.pkl', 'rb') as tfidf:
    vectorizer = pickle.load(tfidf)

In [9]:
from sklearn.model_selection import train_test_split
x=features
y=second_col
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.2,random_state=100)


In [10]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()  
TrainData = naive_bayes.fit(train_x, train_y)

classifier_data = open("./webapp/model/classifier.pkl", "wb")
pickle.dump(naive_bayes, classifier_data)
classifier_data.close()

In [11]:
with open('./webapp/model/classifier.pkl', 'rb') as pickle_saved_data:
    unpickled_data = pickle.load(pickle_saved_data)



prediction = unpickled_data.predict(test_x)

In [12]:
def calculate_performance_metrics(true_labels, predicted_labels):
    precision = metrics.precision_score(true_labels, predicted_labels, average='weighted')
    recall = metrics.recall_score(true_labels, predicted_labels, average='weighted')
    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    f1_score = metrics.f1_score(true_labels, predicted_labels, average='weighted')
    
    return precision, recall, accuracy, f1_score

# Example usage:
precision, recall, accuracy, f1_score = calculate_performance_metrics(test_y, prediction)

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1_score)


Precision: 0.6925421940637064
Recall: 0.696
Accuracy: 0.696
F1 Score: 0.672075479230363


In [15]:
def predict_sentiment(sentence):
    # Preprocess the input sentence
    cleaned_sentence = data_cleaning(sentence)
    tokenized_sentence = cleaned_sentence.split()
    stop_word_removed_sentence = stop_word_remove(tokenized_sentence)
    
    # Transform the preprocessed sentence using TF-IDF vectorizer
    sentence_features = vectorizer.transform([stop_word_removed_sentence])
    print(vectorizer.get_tif_idf_info(stop_word_removed_sentence,sentence_features))

    # Use the trained classifier to predict the sentiment label
    predicted_label = unpickled_data.predict(sentence_features)

    return cleaned_sentence,tokenized_sentence,stop_word_removed_sentence,predicted_label[0]  # Return the predicted sentiment label

# Example usage:
sentence = "म  दुखी छु"
cleaned,tokenized,sw_removed,predicted_sentiment = predict_sentiment(sentence)
print("Predicted Sentiment Label:", predicted_sentiment)
print("cleaned sentence: ", cleaned)
print("Tokenized sentence: ",tokenized)
print("Stop Word  removed sentence: ", sw_removed)


{'दुखी': 8.517193191416238}
Predicted Sentiment Label: -1
cleaned sentence:  म  दुखी छु
Tokenized sentence:  ['म', 'दुखी', 'छु']
Stop Word  removed sentence:  ['दुखी']
