Sentiment Analysis with Logistic Regression


In [36]:
import sklearn
import numpy as np
import pandas as pd


In [37]:
train = pd.read_csv('resources/datasets/train1.csv')
test = pd.read_csv('resources/datasets/test1.csv')
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [38]:
test.tail()

Unnamed: 0,id,tweet
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."
17196,49159,"my song ""so glad"" free download! #shoegaze ..."


In [39]:
import nltk

positive_only = train['label']==1
negative_only = train['label']==0
positive_tweets = train[positive_only]
negative_tweets = train[negative_only]
example_postive_tweet=positive_tweets
example_negative_tweet=negative_tweets
test_pos = positive_tweets[2000:]
train_pos = positive_tweets[:2000]
test_neg = negative_tweets[20000:]
train_neg = negative_tweets[:4000]
# train_x = train_pos + train_neg 
# test_x = test_pos + test_neg
train_x = pd.concat([train_pos,train_neg],axis=0)
test_x = pd.concat([test_pos,test_neg],axis=0)
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [41]:
# data cleaning
import re                                  
import string
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer
nltk.download('stopwords')
def text_process(tweet):
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenizer = TweetTokenizer()
    tweet_tokenized = tokenizer.tokenize(tweet)
    stopwords_english = stopwords.words('english') 
    tweet_processsed=[word for word in tweet_tokenized 
    if word not  in stopwords_english and word not in       
    string.punctuation]
    stemmer = PorterStemmer() 
    tweet_after_stem=[]
    for word in tweet_processsed:
        word=stemmer.stem(word)
        tweet_after_stem.append(word)
    return tweet_after_stem

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
# Frequency distribution
pos_words=[]
for tweet in positive_tweets['tweet']:
    #print(tweet)
    tweet=text_process(tweet)
    
    for word in tweet:
        
        pos_words.append(word)
freq_pos={}
for word in pos_words:
    if (word,1) not in freq_pos:
        freq_pos[(word,1)]=1
    else:
        freq_pos[(word,1)]=freq_pos[(word,1)]+1
neg_words=[]
for tweet in negative_tweets['tweet']:
    tweet=text_process(tweet)
    
    for word in tweet:
        
        neg_words.append(word)
freq_neg={}
for word in neg_words:
    if (word,0) not in freq_neg:
        freq_neg[(word,0)]=1
    else:
        freq_neg[(word,0)]=freq_neg[(word,0)]+1
freqs_dict = dict(freq_pos)
freqs_dict.update(freq_neg)

In [43]:
# feature extraction
train_x['label'].value_counts()


0    4000
1    2000
Name: label, dtype: int64

In [44]:
import numpy as np
def features_extraction(tweet, freqs_dict):
  
    print(tweet)
    word_l = text_process(tweet)
    x = np.zeros((1, 3))
    x[0,0] = 1 
    for word in word_l:
        try:
            x[0,1] += freqs_dict[(word,1)]
        except:
            x[0,1] += 0
        try: 
            x[0,2] += freqs_dict[(word,0.0)]
        except:
            x[0,2] += 0
    assert(x.shape == (1, 3))
    return x
X = np.zeros((len(train_x), 3))
    
for i in range(int(len(train_x)*2/3)):
 X[i, :]= features_extraction(train_x['tweet'][i], freqs_dict)

 @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked
  bihday your majesty
#model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  
 factsguide: society now    #motivation
[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  
 @user camping tomorrow @user @user @user @user @user @user @user dannyâ¦
the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams   #hate #imagine #actorslife #revolutionschool #girl
we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers  â¦ 
 @user @user welcome here !  i'm   it's so #gr8 ! 
 â #ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   #blog #silver #gold #forex
we are so selfish. #orlando #standwith

In [17]:
int(len(train_x)*2/3)

4000

In [18]:
# sigmoid function
def sigmoid(x): 
    h = 1/(1+np.exp(-x))
    return h

def gradientDescent_algo(x, y, theta, alpha, num_iters):
    m = x.shape[0]
    for i in range(0, num_iters):
        z = np.dot(x,theta)
        h = sigmoid(z)
        J = -1/m*(np.dot(y.T,np.log(h))+np.dot((1-y).T,np.log(1-h)))
        theta = theta-(alpha/m)*np.dot(x.T,h-y)
    J = float(J)
    return J, theta

In [19]:
# training
X = np.zeros((len(train_x), 3))
for i in range(int(len(train_x)*2/3)):
    X[i, :]= features_extraction(train_x['tweet'][i], freqs_dict)
Y = train_y
J, theta = gradientDescent_algo(X, Y, np.zeros((3, 1)), 1e-9, 1500)

 @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked
  bihday your majesty
#model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  
 factsguide: society now    #motivation
[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  
 @user camping tomorrow @user @user @user @user @user @user @user dannyâ¦
the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams   #hate #imagine #actorslife #revolutionschool #girl
we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers  â¦ 
 @user @user welcome here !  i'm   it's so #gr8 ! 
 â #ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   #blog #silver #gold #forex
we are so selfish. #orlando #standwith

In [32]:
# testing the accuracy and predicting
def predict(tweet, freqs_dict, theta):
    x = features_extraction(tweet,freqs_dict)
    y_pred = sigmoid(np.dot(x,theta))
    return y_pred
def test_accuracy(test_x, test_y, freqs_dict, theta):
    y_hat = []
    for tweet in test_x:
        
        y_pred = predict(tweet, freqs_dict, theta)
        
        if y_pred > 0.5:
           
            y_hat.append(1)
        else:
            
            y_hat.append(0)
    m=len(y_hat)
    y_hat=np.array(y_hat)
    y_hat=y_hat.reshape(m)
    test_y=test_y.reshape(m)
    
    c=y_hat==test_y
    j=0
    for i in c:
        if i==True:
            j=j+1
    accuracy = j/m
    return accuracy
accuracy = test_accuracy(test_x['tweet'], test_y, freqs_dict, theta)

@user #allahsoil islamophobia only serves to empower the extremists.  â¦   #emiratis #2016in4words #Ùâ¦
@user kevin macdonald:  ethnocentrism is normal and rational     #race #ethnocentrism @user
@user @user @user is this the kind of tweet you want a company rep sending out?  
#trump #newyork co-chair  '#gorilla' comment about #michelleobama 'damn #asians'  #draintheswamp
please #america # #timwise lecture - #colorblind  via @user #trendolizer  #historicalamnesia
  #history #america true progress
what #millennials think about :  
this should have been enough 2 just say no #notmypres #wtf2016 #neverump #unpresidented #bigot  ... 
here's what ignorance &amp;  looks like. it ain't all swastikas &amp; burning crosses... 
you're not very sma if you think i hate all "white males." i hate racism and sexism. that's it. grow a brain, right wingers!  #p2
@user @user crackers? isn't that a little racist? i mean i'm white but think the kkk should burn in hâ¦ 
@user #trump is guilty of treason 

In [33]:
accuracy

0.5887372013651877