In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk
from nltk.corpus import twitter_samples

In [None]:
# Download the NLTK corpora this notebook relies on:
#   - 'twitter_samples': the labelled positive/negative tweets
#   - 'stopwords': the English stop-word list that process_tweet reads via
#     stopwords.words('english'); without it a fresh kernel raises LookupError.
# nltk.download skips the transfer when the corpus is already up to date.
nltk.download('twitter_samples')
nltk.download('stopwords')
print("Fields ",twitter_samples.fileids())

In [None]:
# Load the tweet texts from the positive and negative sample files.
# The two lists are concatenated and labelled (1 = positive, 0 = negative)
# further down, when the train/test split is prepared.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [None]:
# Peek at the first five positive tweets; the bare expression is shown as the cell output.
positive_tweets[:5]

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from nltk.tokenize import TweetTokenizer
#Function for preprocessing the data and to get feature vectors
def process_tweet(tweet):
    """
    Generate cleaned, stemmed tokens from the given tweet.

    Processing steps, in order:
      1. strip $stock-ticker words, the standalone retweet marker "RT",
         hyperlinks and '#' signs;
      2. tokenize with TweetTokenizer (configured to lower-case the text,
         strip @handles and shorten repeated characters);
      3. drop English stop words and punctuation tokens;
      4. stem every remaining token with the Porter stemmer.

    input:
        tweet: A string containing a tweet
    output:
        clean_tokens: List of stemmed tokens of the processed tweet
    """
    # A set gives O(1) membership tests; the original list was scanned for every token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Remove words like $StockTicker (\w --> word character)
    tweet = re.sub(r'\$\w*', '', tweet)
    # Remove the retweet marker. The \b word boundaries restrict the match to a
    # standalone "RT"; an unanchored 'RT' pattern would also delete the letters
    # inside ordinary upper-case words (e.g. "SMART" -> "SMA").
    tweet = re.sub(r'\bRT\b\s*', '', tweet)
    # Remove links: "http" or "https" (? --> 0 or 1 of the previous character)
    # followed by any run of non-space characters (\S).
    tweet = re.sub(r'https?://\S*', '', tweet)
    # Keep the hashtag word but drop the '#' sign itself
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
    tokens = tokenizer.tokenize(tweet)

    clean_tokens = []
    for word in tokens:
        # NOTE: `word not in string.punctuation` is a substring test against the
        # punctuation string, so single punctuation characters are dropped while
        # multi-character tokens such as ":)" are kept (unless they happen to be
        # a contiguous run of string.punctuation).
        if (word not in stop_words) and (word not in string.punctuation):
            clean_tokens.append(stemmer.stem(word))
    return clean_tokens


In [None]:
# Function for counting how often each processed token occurs with each sentiment label
def build_frequencies(tweets, sentiments):
    """
    Count, for every (token, sentiment) pair, how many times the token occurs
    in tweets carrying that sentiment.
    Input:
        tweets: A list of tweets
        sentiments: A list with the sentiment label of each tweet, in the same order
    Output:
        freqs: A dictionary mapping (word, sentiment) --> frequency
    """
    counts = {}
    for text, label in zip(tweets, sentiments):
        # Tokens are the cleaned/stemmed words produced by process_tweet
        for stem in process_tweet(text):
            key = (stem, label)
            if key in counts:
                counts[key] += 1
            else:
                counts[key] = 1
    return counts

In [None]:
# Combine positive and negative tweets and prepare train and test sets.
all_tweets = positive_tweets + negative_tweets
# Labels: 1.0 for every positive tweet, 0.0 for every negative tweet.
# The counts are taken from the lists themselves instead of a hard-coded 5000,
# so labels stay aligned with the tweets whatever the corpus size is.
all_sentiments = [1.0] * len(positive_tweets) + [0.0] * len(negative_tweets)
from sklearn.model_selection import train_test_split
# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(all_tweets,all_sentiments,test_size=0.2,random_state=101)
print("length of training tweets : ",len(X_train))
print("length of testing tweets : ",len(X_test))
# Show a few (tweet, label) pairs as a sanity check of the alignment
for tweet,label in zip(X_train[:5],y_train[:5]):
    print(" {} {} ".format(tweet,label))

In [None]:
# Build the (token, sentiment) -> count dictionary from the training split only;
# the same dictionary is later passed to extract_features and predict_tweet.
freqs = build_frequencies(X_train, y_train)

**Logistic Regression:**
$$z = \theta_0 x_0 + \theta_1 x_1 + \theta_2 x_2 + \cdots + \theta_N x_N$$

Sigmoid
$$ h(z) = \frac{1}{1+e^{-z}}$$

Cost Function
$$J(\theta) = -\frac{1}{m} \sum_{i=1}^m \left[ y^{(i)}\log (h(z(\theta)^{(i)})) + (1-y^{(i)})\log (1-h(z(\theta)^{(i)})) \right] $$

Gradient of the Cost function:
$$\nabla_{\theta_j}J(\theta) = \frac{1}{m} \sum_{i=1}^m(h^{(i)}-y^{(i)})x_j^{(i)} $$

Update the weights by subtracting a fraction (the learning rate $\alpha$) of the gradient of the cost function from the current weights:
$$\theta_j = \theta_j - \alpha \times \nabla_{\theta_j}J(\theta) $$

In [None]:
def gradient_descent(x,y,theta,alpha, num_iters, verbose=True):
    """
    Perform batch gradient descent for logistic regression.
    Input:
        x: feature matrix(m,n+1), n+1 --> number of features including the bias
        y: True label(m,1); a flat (m,) vector is accepted and reshaped to (m,1)
        theta: Initial weights(n+1,1); a flat (n+1,) vector is accepted as well
        alpha: Learning rate
        num_iters: Number of iteration you want to train your model for
        verbose: When True (default) print the loss of every iteration
    Output:
        J: Loss as a Python float. It is the loss evaluated at the start of
           the last iteration (i.e. before the final weight update); when
           num_iters is 0 it is the loss at the initial weights.
        theta: Adjusted weights(n+1,1) after num_iters updates
    """
    x = np.asarray(x, dtype=float)
    # Force column vectors: with a 1-D y, (h - y) would silently broadcast to (m,m).
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    theta = np.asarray(theta, dtype=float).reshape(-1, 1)
    m = x.shape[0]
    eps = 1e-15  # keeps log() away from 0 when the sigmoid saturates at 0 or 1

    def _forward(weights):
        """Return (sigmoid predictions (m,1), scalar cross-entropy loss) for the given weights."""
        h = 1/(1+np.exp(-np.dot(x, weights)))
        h_safe = np.clip(h, eps, 1 - eps)
        loss = -1./m * (np.dot(y.transpose(), np.log(h_safe)) + np.dot((1-y).transpose(), np.log(1-h_safe)))
        # loss is a (1,1) array; .item() extracts the scalar without relying on
        # float(ndarray), which NumPy deprecates for arrays with ndim > 0.
        return h, loss.item()

    # Defined up front so that num_iters == 0 returns a valid loss instead of failing.
    _, J = _forward(theta)
    for i in range(num_iters):
        h, J = _forward(theta)
        # Gradient of the cost w.r.t. theta is x^T (h - y) / m
        theta = theta - (alpha/m) * np.dot(x.transpose(), (h - y))
        if verbose:
            print(" Loss {} in iteration {}".format(J, i))
    return J, theta

In [None]:
def extract_features(tweet,freqs):
    """
    Turn a tweet into a 3-element feature row using the freqs dictionary.
    Input:
    tweet: A string containing a tweet
    freqs: Frequency dictionary containing frequencies for each word -->(word,label):freq
    Output:
    x: feature vector(1,3) laid out as
       [bias = 1, sum of label-1 counts of the tweet's tokens, sum of label-0 counts]
    """
    tokens = process_tweet(tweet)
    # A token absent from freqs contributes 0; repeated tokens are counted each time.
    positive_total = sum(freqs.get((token, 1), 0) for token in tokens)
    negative_total = sum(freqs.get((token, 0), 0) for token in tokens)

    features = np.zeros((1, 3))
    features[0, 0] = 1  # bias
    features[0, 1] = positive_total
    features[0, 2] = negative_total
    return features

In [None]:
# Build the design matrix X: one (bias, positive count, negative count) row per training tweet
m = len(X_train)
X = np.zeros((m, 3))
for row, tweet in enumerate(X_train):
    X[row, :] = extract_features(tweet, freqs)

# Labels as an (m,1) column vector, the shape gradient_descent documents for y
Y = np.array(y_train).reshape(-1, 1)
print("Y shape: ",Y.shape)

# Gradient descent starting from all-zero weights
LEARNING_RATE = 1e-9
NUM_ITERS = 1000
initial_theta = np.zeros((3, 1))
J, theta = gradient_descent(X, Y, initial_theta, LEARNING_RATE, NUM_ITERS)
print("Final training loss ",J)

In [None]:
def predict_tweet(tweet, freqs, theta):
    """
    Apply the logistic-regression model to a single tweet.
    Input:
    tweet: a string containing a tweet
    freqs: frequency dictionary containing +ve,-ve frequencies of all words in the corpus
    theta: trained weights
    Output:
    pred: sigmoid of the tweet's features dotted with theta, i.e. the predicted
          probability of label 1 (the +ve class in this notebook)
    """
    features = extract_features(tweet, freqs)
    logit = np.dot(features, theta)
    return 1/(1+np.exp(-logit))

In [None]:
# Try the trained model on a few hand-written examples and print each prediction
sample_tweets = [
    'I am happy',
    'I am bad',
    'this movie should have been great.',
    'great',
    'great great',
    'great great great',
    'great great great great',
]
for tweet in sample_tweets:
    prediction = predict_tweet(tweet, freqs,theta)
    print(" {}  {} ".format(tweet, prediction))

In [None]:
# Check performance on test set: a tweet is predicted positive when its probability exceeds 0.5
y_hat = [predict_tweet(tweet, freqs,theta) > 0.5 for tweet in X_test]
# Squeeze both sides to flat vectors so predictions and labels compare element-wise
matches = np.squeeze(y_hat) == np.squeeze(y_test)
accuracy = matches.sum()/len(X_test)
print("Test accuracy of twitter analysis is {}%: ".format(accuracy*100))