In [12]:
import re
import nltk
import utils
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from os import getcwd
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from utils import process_tweet, build_freqs, sigmoid, gradientDescent, test_logistic_regression, extract_features, predict_tweet

In [2]:
# nltk.download('stopwords')
# nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shenchingfeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/shenchingfeng/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [13]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

### Train-Test Split

In [14]:
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [15]:
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis = 0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis = 0)

# Data Preprocessing

```python
build_freqs(tweets, ys):
    return freqs
```

```python
tweets: a list of tweets
ys: an m x 1 array with the sentiment label of each tweet (Positive: 1 ; Negative: 0)
freqs: a dictionary mapping each (word, sentiment) pair to its frequency
```

Calculate the frequency of specific word in Positive Tweet & Negative Tweet respectively

In [16]:
freqs = build_freqs(train_x, train_y)
freqs

{('followfriday', 1.0): 23,
 ('france_int', 1.0): 1,
 ('pkuchli', 1.0): 1,
 ('57', 1.0): 2,
 ('milipol_pari', 1.0): 1,
 ('top', 1.0): 30,
 ('engag', 1.0): 7,
 ('member', 1.0): 14,
 ('commun', 1.0): 27,
 ('week', 1.0): 72,
 (':)', 1.0): 2960,
 ('lamb', 1.0): 1,
 ('2ja', 1.0): 1,
 ('hey', 1.0): 60,
 ('jame', 1.0): 7,
 ('odd', 1.0): 2,
 (':/', 1.0): 5,
 ('pleas', 1.0): 81,
 ('call', 1.0): 27,
 ('contact', 1.0): 4,
 ('centr', 1.0): 1,
 ('02392441234', 1.0): 1,
 ('abl', 1.0): 6,
 ('assist', 1.0): 1,
 ('mani', 1.0): 28,
 ('thank', 1.0): 522,
 ('despiteoffici', 1.0): 1,
 ('listen', 1.0): 15,
 ('last', 1.0): 39,
 ('night', 1.0): 55,
 ('bleed', 1.0): 2,
 ('amaz', 1.0): 41,
 ('track', 1.0): 5,
 ('scotland', 1.0): 2,
 ('97side', 1.0): 1,
 ('congrat', 1.0): 15,
 ('yeaaah', 1.0): 1,
 ('yipppi', 1.0): 1,
 ('accnt', 1.0): 2,
 ('verifi', 1.0): 2,
 ('rqst', 1.0): 1,
 ('succeed', 1.0): 1,
 ('got', 1.0): 57,
 ('blue', 1.0): 8,
 ('tick', 1.0): 1,
 ('mark', 1.0): 2,
 ('fb', 1.0): 4,
 ('profil', 1.0): 2,
 (

# Model Training

In [17]:
X = np.zeros((len(train_x), 3))

for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

Y = train_y

In [20]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, int(1e4))

print(f"The cost after training is {J:.4f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.1013.
The resulting vector of weights is [3e-07, 0.00127474, -0.0011083]


In [22]:
import numpy as np
import matplotlib.pyplot as plt

# Define a range of alpha and num_iters values to explore
alpha_values = [1e-7, 1e-8, 1e-9, 1e-10]
num_iters_values = [2000, 20000, 2000000]

# Create a grid of alpha and num_iters values
alpha, num_iters = np.meshgrid(alpha_values, num_iters_values)

# Initialize an empty array to store the corresponding cost values
cost_values = np.zeros_like(alpha)

# Perform gradient descent for each combination of alpha and num_iters
for i in range(len(alpha_values)):
    for j in range(len(num_iters_values)):
        J, _ = gradientDescent(X, Y, np.zeros((3, 1)), alpha[i, j], int(num_iters[i, j]))
        cost_values[i, j] = J

# Create a contour plot
plt.figure(figsize=(10, 6))
contour = plt.contourf(alpha, num_iters, cost_values, levels=20, cmap='viridis')
plt.colorbar(contour, label='Cost (J)')
plt.xlabel('Learning Rate (alpha)')
plt.ylabel('Number of Iterations (num_iters)')
plt.title('Cost vs. Learning Rate and Number of Iterations')
plt.show()

  J = -1/m * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))


KeyboardInterrupt: 

# Model Testing

In [9]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print('%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

I am happy -> 0.799225
I am bad -> 0.422531
this movie should have been great. -> 0.763367
great -> 0.759815
great great -> 0.908929
great great great -> 0.969219
great great great great -> 0.990034


In [10]:
vali_tweet = [

    "Another day, another opportunity.",

    "Do the right things, do things right.",

    "Celebrate the journey, not just the destination.",

    "Every sunset is an opportunity to reset.",

    "Stars can not shine without darkness.",

    "Inhale courage, exhale fear.",

    "Radiate kindness like sunshine.",

    "Find beauty in the ordinary.",

    "Chase your wildest dreams with the heart of a lion.",

    "Life is a canvas; make it a masterpiece.",

    "Let your soul sparkle.",

    "Create your own sunshine.", 

    "This summer would not be perfect without you." ]


for tweet in vali_tweet:
    print('Stem:', process_tweet(tweet))
    print('%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))
    print('\n')

Stem: ['anoth', 'day', 'anoth', 'opportun']
Another day, another opportunity. -> 0.725077


Stem: ['right', 'thing', 'thing', 'right']
Do the right things, do things right. -> 0.558169


Stem: ['celebr', 'journey', 'destin']
Celebrate the journey, not just the destination. -> 0.504761


Stem: ['everi', 'sunset', 'opportun', 'reset']
Every sunset is an opportunity to reset. -> 0.567657


Stem: ['star', 'shine', 'without', 'dark']
Stars can not shine without darkness. -> 0.499589


Stem: ['inhal', 'courag', 'exhal', 'fear']
Inhale courage, exhale fear. -> 0.501253


Stem: ['radiat', 'kind', 'like', 'sunshin']
Radiate kindness like sunshine. -> 0.606686


Stem: ['find', 'beauti', 'ordinari']
Find beauty in the ordinary. -> 0.547009


Stem: ['chase', 'wildest', 'dream', 'heart', 'lion']
Chase your wildest dreams with the heart of a lion. -> 0.472705


Stem: ['life', 'canva', 'make', 'masterpiec']
Life is a canvas; make it a masterpiece. -> 0.508972


Stem: ['let', 'soul', 'sparkl']
Let you

In [11]:
accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {accuracy:.4f}")

Logistic regression model's accuracy = 0.9945
