In [None]:
!pip install contractions
!pip install opendatasets

In [None]:
import opendatasets as od
import pandas

od.download(
    "https://www.kaggle.com/datasets/kazanova/sentiment140?resource=download")

### preprocessing

In [3]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from collections import Counter
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
df = pd.read_csv('/content/sentiment140/training.1600000.processed.noemoticon.csv',header=None ,encoding='latin-1')
df.columns = ['target','ids','date','flag','user','text']

df_type1 = df[df['target'] == 0]
df_type2 = df[df['target'] == 4]

df_type1_sample = df_type1.sample(n=5000, random_state=42)
df_type2_sample = df_type2.sample(n=5000, random_state=42)

final_df = pd.concat([df_type1_sample, df_type2_sample])
final_df = final_df.sample(frac=1, random_state=42)

final_df['target'] = final_df['target'].replace(4, 1)

df = final_df.copy()

nltk.download('stopwords')
nltk.download('punkt')

# Lowercasing
df['text'] = df['text'].str.lower()

# Tokenization
df['text'] = df['text'].apply(word_tokenize)

# Stemming (Using Porter Stemmer from NLTK)
stemmer = nltk.PorterStemmer()
df['text'] = df['text'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])

# Removing Numbers
df['text'] = df['text'].apply(lambda tokens: [token for token in tokens if not token.isdigit()])

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_df['text'], test_df['text'],train_df['target'], test_df['target']

### Term Frequency

In [None]:
class CustomTF:
    def __init__(self, documents):
        self.documents = documents
        self.vocab = set()
        self.term_freq_matrix = None
        self.epsilon = 1e-8  # Small constant for unseen words
        self.smoothing = 1

    def _calculate_tf(self, document):
        term_freqs = Counter(document)
        total_words = len(document)
        unique_words = len(set(document))
        tf = {term: (freq + self.smoothing) / (total_words + unique_words) for term, freq in term_freqs.items()}
        return tf

    def fit_transform(self):
        for doc in self.documents:
            self.vocab.update(doc)

        self.vocab = list(self.vocab)
        self.term_freq_matrix = []

        for doc in self.documents:
            tf = self._calculate_tf(doc)
            doc_vector = [tf[word] if word in tf else 0 for word in self.vocab]
            self.term_freq_matrix.append(doc_vector)

        return np.array(self.term_freq_matrix)

    def transform(self, documents):
        term_freq_matrix = []
        for doc in documents:
            tf = self._calculate_tf(doc)
            doc_vector = [tf.get(word, self.epsilon) for word in self.vocab]
            term_freq_matrix.append(doc_vector)
        return np.array(term_freq_matrix)


In [None]:
custom_tf = CustomTF(X_train)
X_train_tf = custom_tf.fit_transform()


X_test_tf = custom_tf.transform(X_test)


naive_bayes_tf = MultinomialNB()
naive_bayes_tf.fit(X_train_tf, y_train)


y_pred_tf = naive_bayes_tf.predict(X_test_tf)


accuracy_tf = accuracy_score(y_test, y_pred_tf)
print("Accuracy using term frequencies for MultinomialNB:", accuracy_tf)


precision_tf = precision_score(y_test, y_pred_tf)
recall_tf = recall_score(y_test, y_pred_tf)
f1_tf = f1_score(y_test, y_pred_tf)

print("Precision using term frequencies for MultinomialNB:", precision_tf)
print("Recall using term frequencies for MultinomialNB:", recall_tf)
print("F1-score using term frequencies for MultinomialNB:", f1_tf)


naive_bayes_gaussian = GaussianNB()
naive_bayes_gaussian.fit(X_train_tf, y_train)


y_pred_gaussian = naive_bayes_gaussian.predict(X_test_tf)


accuracy_gaussian = accuracy_score(y_test, y_pred_gaussian)
print("Accuracy using term frequencies for GaussianNB:", accuracy_gaussian)


precision_gaussian = precision_score(y_test, y_pred_gaussian)
recall_gaussian = recall_score(y_test, y_pred_gaussian)
f1_gaussian = f1_score(y_test, y_pred_gaussian)

print("Precision using term frequencies for GaussianNB:", precision_gaussian)
print("Recall using term frequencies for GaussianNB:", recall_gaussian)
print("F1-score using term frequencies for GaussianNB:", f1_gaussian)


Accuracy using term frequencies for MultinomialNB: 0.7585
Precision using term frequencies for MultinomialNB: 0.7680851063829788
Recall using term frequencies for MultinomialNB: 0.7315096251266464
F1-score using term frequencies for MultinomialNB: 0.7493513233004672
Accuracy using term frequencies for GaussianNB: 0.5585
Precision using term frequencies for GaussianNB: 0.6256038647342995
Recall using term frequencies for GaussianNB: 0.2624113475177305
F1-score using term frequencies for GaussianNB: 0.3697359029264811


### TF-IDF


In [None]:
class CustomTFIDF:
    def __init__(self, documents, smoothing=1):
        self.documents = documents
        self.vocab = set()
        self.idf = None
        self.smoothing = smoothing
        self.epsilon = 1e-3  # Small constant for unseen words

    def _calculate_tf(self, document):
        term_freqs = Counter(document)
        total_words = len(document)
        tf = {term: freq/total_words for term, freq in term_freqs.items()}
        return tf

    def _calculate_idf(self):
        num_documents = len(self.documents)
        idf = {}
        for word in self.vocab:
            doc_count = sum([1 for doc in self.documents if word in doc])
            idf[word] = np.log((num_documents + self.smoothing) / (doc_count + self.smoothing))  # Additive smoothing
        return idf

    def fit(self):
        for doc in self.documents:
            self.vocab.update(doc)
        self.vocab = list(self.vocab)
        self.idf = self._calculate_idf()

    def transform(self, documents):
        doc_term_matrix = []
        for doc in documents:
            tf = self._calculate_tf(doc)
            doc_vector = [(tf.get(word, self.epsilon) * self.idf.get(word, np.finfo(float).eps)) for word in self.vocab]
            doc_term_matrix.append(doc_vector)
        return np.array(doc_term_matrix)


In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


custom_tfidf = CustomTFIDF(X_train, smoothing=1)
custom_tfidf.fit()


X_train_tfidf = custom_tfidf.transform(X_train)
X_test_tfidf = custom_tfidf.transform(X_test)


naive_bayes_multinomial = MultinomialNB()
naive_bayes_multinomial.fit(X_train_tfidf, y_train)


y_pred_multinomial = naive_bayes_multinomial.predict(X_test_tfidf)


accuracy_multinomial = accuracy_score(y_test, y_pred_multinomial)
print("Accuracy (MultinomialNB):", accuracy_multinomial)


precision_multinomial = precision_score(y_test, y_pred_multinomial)
recall_multinomial = recall_score(y_test, y_pred_multinomial)
f1_multinomial = f1_score(y_test, y_pred_multinomial)

print("Precision (MultinomialNB):", precision_multinomial)
print("Recall (MultinomialNB):", recall_multinomial)
print("F1-score (MultinomialNB):", f1_multinomial)


naive_bayes_gaussian = GaussianNB()
naive_bayes_gaussian.fit(X_train_tfidf, y_train)


y_pred_gaussian = naive_bayes_gaussian.predict(X_test_tfidf)


accuracy_gaussian = accuracy_score(y_test, y_pred_gaussian)
print("Accuracy (GaussianNB):", accuracy_gaussian)


precision_gaussian = precision_score(y_test, y_pred_gaussian)
recall_gaussian = recall_score(y_test, y_pred_gaussian)
f1_gaussian = f1_score(y_test, y_pred_gaussian)

print("Precision (GaussianNB):", precision_gaussian)
print("Recall (GaussianNB):", recall_gaussian)
print("F1-score (GaussianNB):", f1_gaussian)


Accuracy (MultinomialNB): 0.751
Precision (MultinomialNB): 0.7472194135490394
Recall (MultinomialNB): 0.7487335359675785
F1-score (MultinomialNB): 0.7479757085020242
Accuracy (GaussianNB): 0.555
Precision (GaussianNB): 0.6125290023201856
Recall (GaussianNB): 0.2674772036474164
F1-score (GaussianNB): 0.3723554301833569


### PPMI

In [5]:
class CustomPPMI:
    def __init__(self):
        self.vocab = set()
        self.word_to_index = {}
        self.index_to_word = {}
        self.co_occurrence_matrix = None
        self.ppmi_matrix = None

    def _build_co_occurrence_matrix(self, documents, window_size=2):
        word_count = len(self.vocab)
        self.co_occurrence_matrix = np.zeros((word_count, word_count))

        for doc in documents:
            for i, word in enumerate(doc):
                if word in self.vocab:
                    current_index = self.word_to_index[word]
                    start = max(0, i - window_size)
                    end = min(len(doc), i + window_size + 1)
                    context = doc[start:end]
                    context.remove(word)
                    for context_word in context:
                        if context_word in self.vocab:
                            context_index = self.word_to_index[context_word]
                            self.co_occurrence_matrix[current_index][context_index] += 1

    def _calculate_ppmi(self):
        total_occurrences = np.sum(self.co_occurrence_matrix)
        word_count = len(self.vocab)
        self.ppmi_matrix = np.zeros((word_count, word_count))

        for i in range(word_count):
            for j in range(word_count):
                if self.co_occurrence_matrix[i][j] == 0:
                    self.ppmi_matrix[i][j] = 0
                else:
                    word_i_freq = np.sum(self.co_occurrence_matrix[i])
                    word_j_freq = np.sum(self.co_occurrence_matrix[j])
                    joint_freq = self.co_occurrence_matrix[i][j]
                    pmi = np.log2((joint_freq * total_occurrences) / (word_i_freq * word_j_freq))
                    self.ppmi_matrix[i][j] = max(pmi, 0)

    def fit(self, documents, window_size=2):
        for doc in documents:
            self.vocab.update(doc)

        self.vocab = list(self.vocab)

        for i, word in enumerate(self.vocab):
            self.word_to_index[word] = i
            self.index_to_word[i] = word

        self._build_co_occurrence_matrix(documents, window_size)
        self._calculate_ppmi()

    def transform(self, documents):
        ppmi_vectors = []
        for doc in documents:
            ppmi_vector = np.zeros((len(self.vocab),))  # Initialize PPMI vector
            for word in doc:
                if word in self.vocab:
                    word_index = self.word_to_index[word]
                    ppmi_vector += self.ppmi_matrix[word_index]  # Accumulate PPMI values
            ppmi_vectors.append(ppmi_vector)
        return np.array(ppmi_vectors)



In [6]:
custom_ppmi = CustomPPMI()
custom_ppmi.fit(X_train)


X_train_ppmi = custom_ppmi.transform(X_train)
X_test_ppmi = custom_ppmi.transform(X_test)


naive_bayes_multinomial = MultinomialNB()
naive_bayes_multinomial.fit(X_train_ppmi, y_train)


y_pred_multinomial = naive_bayes_multinomial.predict(X_test_ppmi)


accuracy_multinomial = accuracy_score(y_test, y_pred_multinomial)
print("Accuracy (MultinomialNB):", accuracy_multinomial)


precision_multinomial = precision_score(y_test, y_pred_multinomial)
recall_multinomial = recall_score(y_test, y_pred_multinomial)
f1_multinomial = f1_score(y_test, y_pred_multinomial)

print("Precision (MultinomialNB):", precision_multinomial)
print("Recall (MultinomialNB):", recall_multinomial)
print("F1-score (MultinomialNB):", f1_multinomial)


naive_bayes_gaussian = GaussianNB()
naive_bayes_gaussian.fit(X_train_ppmi, y_train)


y_pred_gaussian = naive_bayes_gaussian.predict(X_test_ppmi)


accuracy_gaussian = accuracy_score(y_test, y_pred_gaussian)
print("Accuracy (GaussianNB):", accuracy_gaussian)


precision_gaussian = precision_score(y_test, y_pred_gaussian)
recall_gaussian = recall_score(y_test, y_pred_gaussian)
f1_gaussian = f1_score(y_test, y_pred_gaussian)

print("Precision (GaussianNB):", precision_gaussian)
print("Recall (GaussianNB):", recall_gaussian)
print("F1-score (GaussianNB):", f1_gaussian)


Accuracy (MultinomialNB): 0.639
Precision (MultinomialNB): 0.6417112299465241
Recall (MultinomialNB): 0.60790273556231
F1-score (MultinomialNB): 0.6243496357960457
Accuracy (GaussianNB): 0.653
Precision (GaussianNB): 0.6890322580645162
Recall (GaussianNB): 0.541033434650456
F1-score (GaussianNB): 0.6061293984108966
