# Naive Bayes

# Import Libraries

In [1]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp38-cp38-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.11.6-cp38-cp38-win_amd64.whl (274 kB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.1.8 nltk-3.9.1 regex-2024.11.6 tqdm-4.67.1


In [2]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\silwa\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\silwa\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\silwa\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\silwa\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\silwa\AppData\Roaming\nltk_data...
[

True

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from collections import Counter

# Import File

In [4]:
df = pd.read_csv("data/TwitterDataset.csv", encoding='latin')

In [5]:
df.head()

Unnamed: 0,File Name,Caption,LABEL
0,1.txt,How I feel today #legday #jelly #aching #gym,negative
1,10.txt,@ArrivaTW absolute disgrace two carriages from...,negative
2,100.txt,This is my Valentine's from 1 of my nephews. I...,positive
3,1000.txt,betterfeelingfilms: RT via Instagram: First da...,neutral
4,1001.txt,Zoe's first love #Rattled @JohnnyHarper15,positive


In [6]:
df.drop(columns=["File Name"], inplace=True)
df.dropna(inplace=True)

In [7]:
df["LABEL"].value_counts()

LABEL
neutral     1771
positive    1646
negative    1452
Name: count, dtype: int64

In [8]:
df = df[~df['LABEL'].isin(['neutral'])]
df.reset_index(drop=True, inplace=True)

In [9]:
df.sample(100)

Unnamed: 0,Caption,LABEL
2396,Such an energetic group of dedicated teachers ...,positive
2549,happy birthday to my favorite diva!thank you f...,positive
1950,RT @Google_Facts_1: Gloucestershire airport in...,negative
746,RT @priyaguptatimes: Arpita is caring & giving...,positive
2472,"Consecration service of two women bishops, inc...",positive
...,...,...
874,@RenaeCollects @TheJWittz #Wittzparty Renae ha...,negative
613,Unbelievable training with this man #speechles...,negative
1787,RT @wmcdonald404: @thecatreviewer. Blinky cat'...,negative
1907,RT @johnmurphy1967: Some turnout today there w...,positive


In [10]:
df["LABEL"].value_counts()

LABEL
positive    1646
negative    1452
Name: count, dtype: int64

In [11]:
df['LABEL'] = df['LABEL'].map({'negative':0, 'positive':1})

# Preprocess

In [12]:
def remove_usernames(text):
    return re.sub(r'@\w+', '', text)

def remove_hashtags(text):
    return re.sub(r'#', '', text)

def clean_text(text):
    # Removing URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Removing special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)
    # Removing extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_text(text):
    return word_tokenize(text)

def case_folding(text):
    return text.lower()

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

def stem_tokens(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]

def preprocess_text(text):
    text = remove_usernames(text)
    text = remove_hashtags(text)
    text = clean_text(text)
    text = case_folding(text)
    tokens = tokenize_text(text)
    tokens = remove_stopwords(tokens)
    tokens = stem_tokens(tokens)
    return ' '.join(tokens)

In [13]:
text = df['Caption'][0]
print(text)
print(preprocess_text(text))

How I feel today #legday #jelly #aching #gym 
feel today legday jelli ach gym


# Preprocess

In [14]:
df.head()

Unnamed: 0,Caption,LABEL
0,How I feel today #legday #jelly #aching #gym,0
1,@ArrivaTW absolute disgrace two carriages from...,0
2,This is my Valentine's from 1 of my nephews. I...,1
3,Zoe's first love #Rattled @JohnnyHarper15,1
4,Chaotic Love - giclee print ?65 at #art #love ...,1


In [15]:
X = df['Caption'].apply(preprocess_text)
y = df['LABEL'].values

# Split

In [16]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=11)

# Helper Functions

In [17]:
def count_tweets(result, tweets, ys):

    for y, tweet in zip(ys, tweets):
        for word in tweet.split():
            # define the key, which is the word and label tuple
            pair = (word,y)

            # if the key exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1

            # else, if the key is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1

    return result

In [18]:
freqs = count_tweets({}, x_train, y_train)
freqs

{('reason', 0): 4,
 ('petrifi', 0): 18,
 ('ocean', 0): 3,
 ('confess', 0): 1,
 ('rt', 0): 505,
 ('one', 0): 24,
 ('await', 0): 1,
 ('trial', 0): 2,
 ('jail', 0): 2,
 ('simpli', 0): 1,
 ('could', 0): 4,
 ('afford', 0): 1,
 ('bail', 0): 1,
 ('rt', 1): 597,
 ('passion', 1): 62,
 ('post', 1): 8,
 ('top', 1): 23,
 ('market', 1): 5,
 ('love', 1): 165,
 ('content', 1): 1,
 ('ufc', 0): 5,
 ('loui', 0): 3,
 ('smolka', 0): 1,
 ('irish', 0): 1,
 ('fan', 0): 8,
 ('belliger', 0): 4,
 ('go', 0): 30,
 ('australia', 0): 3,
 ('open', 0): 1,
 ('chri', 0): 4,
 ('roger', 0): 2,
 ('miss', 0): 15,
 ('train', 0): 7,
 ('today', 0): 25,
 ('check', 0): 3,
 ('dizzi', 0): 9,
 ('spell', 0): 3,
 ('second', 0): 8,
 ('test', 0): 3,
 ('lord', 0): 4,
 ('ssnhq', 0): 1,
 ('dog', 0): 19,
 ('watch', 0): 9,
 ('other', 0): 3,
 ('tortur', 0): 17,
 ('daylight', 0): 1,
 ('samsung', 0): 1,
 ('sponsor', 0): 2,
 ('guid', 0): 3,
 ('hear', 0): 6,
 ('stopbokn', 0): 6,
 ('thank', 1): 53,
 ('u', 1): 20,
 ('understand', 1): 2,
 ('heart'

In [19]:
def lookup(freqs, word, label):
    # Return frequency of word and corresponding label
    return freqs.get((word, label), 0)

In [20]:
lookup(freqs,'bad',0) 

9

# Priors and LogLikehood

Here’s how you can represent the mathematical formulas for the log prior probability and log likelihood in Python Markdown using LaTeX:

Log Prior Probability
The log prior probability can be expressed as:

markdown
Copy code
$$
\text{logprior} = \log \left( \frac{D_{pos}}{D_{neg}} \right)
$$
Where:

𝐷
𝑝
𝑜
𝑠
D 
pos
​
  is the number of positive documents.
𝐷
𝑛
𝑒
𝑔
D 
neg
​
  is the number of negative documents.
Log Likelihood
The log likelihood for a word 
𝑤
w can be expressed as:

markdown
Copy code
$$
\text{loglikelihood}(w) = \log \left( \frac{P(w \mid \text{Positive})}{P(w \mid \text{Negative})} \right)
$$
Where:

𝑃
(
𝑤
∣
Positive
)
P(w∣Positive) is the probability of word 
𝑤
w given a positive document.
𝑃
(
𝑤
∣
Negative
)
P(w∣Negative) is the probability of word 
𝑤
w given a negative document.
With additive smoothing, these probabilities are calculated as:

markdown
Copy code
$$
P(w \mid \text{Positive}) = \frac{\text{freq}_{pos}(w) + 1}{N_{pos} + V}
$$
$$
P(w \mid \text{Negative}) = \frac{\text{freq}_{neg}(w) + 1}{N_{neg} + V}
$$
Thus, the log likelihood is:

markdown
Copy code
$$
\text{loglikelihood}(w) = \log \left( \frac{\frac{\text{freq}_{pos}(w) + 1}{N_{pos} + V}}{\frac{\text{freq}_{neg}(w) + 1}{N_{neg} + V}} \right)
$$
$$
\text{loglikelihood}(w) = \log (\text{freq}_{pos}(w) + 1) - \log (\text{freq}_{neg}(w) + 1) + \log \left( \frac{N_{neg} + V}{N_{pos} + V} \right)
$$

In [21]:
lookup(freqs,'love',1)

165

In [22]:
len(freqs)

8282

In [23]:
def get_logprior(y_train):

    # Calculate D, D_pos, and D_neg
    D_pos = sum(1 for label in y_train if label == 1)
    D_neg = len(y_train) - D_pos  # Total documents - D_pos

    # Calculate logprior
    logprior = np.log(D_pos / D_neg)

    return logprior

def get_loglikelihood(freqs):
    loglikelihood = {}
    
    # Extract vocabulary and counts directly
    vocab = set(word for word, _ in freqs.keys())
    
    # Calculate N_pos, N_neg, V_pos, V_neg
    pos_counts = Counter()
    neg_counts = Counter()
    
    for (word, label), count in freqs.items():
        if label == 1:
            pos_counts[word] += count
        else:
            neg_counts[word] += count
    
    N_pos = sum(pos_counts.values())
    N_neg = sum(neg_counts.values())
    V = len(vocab)

    # Calculate loglikelihood
    for word in vocab:
        freq_pos = pos_counts[word]
        freq_neg = neg_counts[word]

        # Calculate probabilities with additive smoothing
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # Calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)
    
    return loglikelihood

In [24]:
logprior = get_logprior(y_train)
loglikelihood = get_loglikelihood(freqs)

In [25]:
print(logprior)
print(loglikelihood)

0.14391210867908333
{'check': 1.0391064669748202, 'due': -0.8326957099267712, 'timessquar': 0.9590637593012837, 'drizzl': 0.5535986511931195, 'kl': 0.5535986511931195, 'destroy': -1.0950599743942624, 'could': -0.13954852936682602, 'etsysoci': -0.8326957099267712, 'g': -0.5450136374749904, 'oscar': -1.2381608180349357, 'nike': 0.5535986511931195, 'est': 0.5535986511931195, 'nr': 0.5535986511931195, 'grung': -0.8326957099267712, 'louco': 0.5535986511931195, 'dorobabi': -0.8326957099267712, 'trick': 0.5535986511931195, 'sinner': -0.8326957099267712, 'nca': 0.5535986511931195, 'viewer': -1.2381608180349357, 'chairman': -0.8326957099267712, 'rinehart': 0.5535986511931195, 'soror': 0.5535986511931195, 'sore': -0.42723060181860684, 'loss': -1.5258428904867165, 'jarrett': 0.9590637593012837, 'vancouverisland': 0.5535986511931195, 'schizophren': -0.8326957099267712, 'biggest': 0.5535986511931195, 'paidsocialmediajob': 0.5535986511931195, 'ag': 0.5535986511931195, 'actual': -0.13954852936682588,

In [26]:
loglikelihood['sad']

-1.7489864418009262

# Now predict


### Prediction Example

Let’s classify the tweet: "I am very happy".

1. **Preprocess**: Extract words from the tweet:

   $$ \text{words} = \text{['i', 'am', 'very', 'happy']} $$

2. **Compute Log Probability**:

   Assuming the log prior and log likelihoods are:

   - **Log Prior**: \( \text{logprior} = \log (1.5) \)
   - **Log Likelihood for "happy"**: \( \text{loglikelihood}(\text{"happy"}) = \log (4) \)

   The log probability of the tweet being positive is:

   $$ \text{log\_prob} = \text{logprior} + \text{loglikelihood}(\text{"happy"}) $$

   Substituting the values:

   $$ \text{log\_prob} = \log (1.5) + \log (4) = \log (1.5 \times 4) = \log (6) $$

### Interpretation

- **Positive Log Probability**: If \( \text{log\_prob} \) is positive, it indicates the evidence is stronger for the tweet being positive.

- **Negative Log Probability**: If \( \text{log\_prob} \) were negative, it would suggest the evidence is stronger for the tweet being negative.

### Summary

- **Positive Log Probability**:

  $$ \text{log\_prob} > 0 \implies \text{Stronger evidence for the Positive Class} $$

- **Negative Log Probability**:

  $$ \text{log\_prob} < 0 \implies \text{Stronger evidence for the Negative Class} $$

By comparing the log probabilities for both classes, you can determine the final classification of the tweet.


In [27]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    
    # Preprocess the tweet to get a list of words
    words = preprocess_text(tweet)
    print(words)
    
    # Initialize the log probability with the log prior
    log_prob = logprior
    
    # Add the log likelihood of each word if it exists in the dictionary
    for word in words.split():
        print(log_prob)
        log_prob += loglikelihood.get(word,0)
        print(word,loglikelihood.get(word,0))
    return log_prob

# Test

In [28]:
# my_tweet = '@ArrivaTW absolute disgrace two carriages from Bangor half way there standing room only #disgraced'
my_tweet = "I was feeling happy but now I am sad"
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The output is', p)

feel happi sad
0.14391210867908333
feel -1.0245867177368766
-0.8806746090577933
happi 3.580102583413864
2.6994279743560705
sad -1.7489864418009262
The output is 0.9504415325551443


In [29]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    accuracy = 0  # return this properly

    y_hats = []
    for tweet in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)


    error = np.mean(np.absolute(y_hats-test_y))

    # Accuracy is 1 minus the error
    accuracy = 1-error

    return accuracy


In [30]:
accuracy = round(test_naive_bayes(x_test, y_test, logprior, loglikelihood),2)
print(f"Naive Bayes accuracy = {accuracy}" )

busi studi turn everytim hopeless
0.14391210867908333
busi -0.13954852936682588
0.0043635793122574484
studi -1.2381608180349357
-1.2337972387226783
turn 1.652210939861229
0.4184137011385507
everytim 0
0.4184137011385507
hopeless -1.1203777823785521
fill love goodtim happi nightout partytim qualiti instastyl instafashion summer sister
0.14391210867908333
fill 0.3712770943991647
0.515189203078248
love 2.7752146816534977
3.2904038847317456
goodtim 0.5535986511931195
3.844002535924865
happi 3.580102583413864
7.424105119338729
nightout 0.5535986511931195
7.977703770531848
partytim 0
7.977703770531848
qualiti 0.5535986511931195
8.531302421724968
instastyl 0
8.531302421724968
instafashion 0
8.531302421724968
summer 0.6489088309974442
9.180211252722412
sister 0.5535986511931195
dvf dian von furstenberg napl ankl soft canva blossom black tuxedo pant
0.14391210867908333
dvf 0
0.14391210867908333
dian 0
0.14391210867908333
von 0
0.14391210867908333
furstenberg 0
0.14391210867908333
napl 0
0.14391