In [2]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples

In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_tweets(tweet):
    #remove the old style retweet text "RT"
    tweet2 = re.sub( r'^RT[\s]+' , '' ,tweet)

    #remove hyperlinks
    tweet2 = re.sub(r'https?:\/\/.*[\r\n]*' , '' ,      tweet2)

    #remove hashtag (only the # sign)
    tweet2 = re.sub(r'#' , '' ,tweet2)

    #instantiate the tokenizer class
    tokenizer = TweetTokenizer  (preserve_case=False,
                          strip_handles=True,
                          reduce_len = True)

    #tokenize tweets
    tweet_tokens = tokenizer.tokenize(tweet2)

    #importing the english stop words from nltk
    stopwords_english = stopwords.words('english')

    tweets_clean = []

    for word in tweet_tokens:
        if(word not in stopwords_english 
        and 
        word not in string.punctuation):
            tweets_clean.append(word)
    
    stemmer = PorterStemmer()

    #create an empty list to store the stems
    tweets_stem = []

    for word in tweets_clean:
        stem_word = stemmer.stem(word)  
        #stemming word
        tweets_stem.append(stem_word)
    
    return tweets_stem

In [4]:
def build_freqs(tweets , ys):
    """Build frequencies
    input:
        tweets : a list of tweets
        ys: an mx1 array with the sentiment label of each tweet(either 0 or 1)
    output:
        freqs: a dictionary mapping each (word,sentiment) pair to its frequency
        """
    
    #convert the np array to list since zip needs an iterble
    # The squeeze is necessary or the list ends up with  one element
    # also note that this is just a NOP if ys is already a list
     
    yslist = np.squeeze(ys).tolist()

    #start with an empty dictionary and populate it by looping  over all tweets
    # and over all processed words in each tweet
    freqs={}

    for y , tweet in zip(yslist , tweets):
        for word in process_tweets(tweet):
            pair =  (word,y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

In [5]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [6]:
# training and testing split
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg


In [7]:
#combine positive and negative labels
train_y = np.append(np.ones(( len(train_pos),1 )) , np.zeros((len(train_neg),1)),axis =0 )
test_y = np.append(np.ones(( len(test_pos),1 )) , np.zeros((len(test_neg),1)),axis =0 )

In [8]:
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [9]:
# create frequency duictionary
freqs = build_freqs(train_x,train_y)

print("type(freqs) = "+str(type(freqs)))
print("len (freqs) = "+str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len (freqs) = 11339


In [11]:
# test the function below
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweets(train_x[0]))

This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


# PART 1 LOGISTIC REGRESSION

## part 1.1 sigmoid

In [13]:
# UNQ_C1
# code to be submitted
def sigmoid(z):
    import math
    h = 1 / (1 + math.exp(-z) )
    return h

In [14]:
# Testing your function 
if (sigmoid(0) == 0.5):
    print('SUCCESS!')
else:
    print('Oops!')

if (sigmoid(4.92) == 0.9927537604041685):
    print('CORRECT!')
else:
    print('Oops again!')

SUCCESS!
CORRECT!


## part 1.2 cost function and gradient

In [None]:
# log(0) -> log(1) == -inf -> 0
# h(z)for ith example if equal to y gives 0
# if they are not then log gives a big positive value

#lets take y 0.99 and hz as 0
-1*( (0.992*np.log(0)) + (1-0.992)(np.log(1-0)) )