# Importing Data

In [336]:
import pandas as pd
import numpy as nm
import nltk
import collections
from collections import Counter
from nltk.stem.porter import *  
from nltk.corpus import stopwords
import string
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

In [337]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shubhangisrivastava/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shubhangisrivastava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Reading Data

In [338]:
df = pd.read_csv('./train.txt', header=None,names= ["ItemID", "Sentiment", "SentimentSource", "SentimentText"],
                 skiprows=1)
print(df)

test_df = pd.read_csv('./test.txt', header=None,names= ["ItemID", "Sentiment", "SentimentSource", "SentimentText"],
                 skiprows=1)

print(test_df)

       ItemID  Sentiment SentimentSource  \
0           1          0    Sentiment140   
1           2          0    Sentiment140   
2           3          1    Sentiment140   
3           4          0    Sentiment140   
4           5          0    Sentiment140   
...       ...        ...             ...   
89984   89996          1    Sentiment140   
89985   89997          1    Sentiment140   
89986   89998          1    Sentiment140   
89987   89999          0    Sentiment140   
89988   90000          0    Sentiment140   

                                           SentimentText  
0                           is so sad for my APL frie...  
1                         I missed the New Moon trail...  
2                                omg its already 7:30 :O  
3                .. Omgaga. Im sooo  im gunna CRy. I'...  
4               i think mi bf is cheating on me!!!   ...  
...                                                  ...  
89984  @clevercatsknit Re: gnome hat. Was the problem...  

# Cleaning Data

In [339]:
words_to_remove = stopwords.words('english')
puncs = list(string.punctuation)
words_to_remove.extend(puncs)
words_to_remove.extend(['``','""',"''","...","\'",'\"',"\t","\b","\r","\f","\n"])
to_strip = string.punctuation+"."+"\'"+'\"'+"\t"+"\b"+"\r"+"\f"+"\n"

In [340]:
def get_tokens(sentence):
    tokens = nltk.word_tokenize(sentence)
    tokens = [word.lower().strip(to_strip) for word in tokens if word.lower() not in set(words_to_remove)]
    return tokens

# Tokenization

In [341]:
df["SentimentText"] = df["SentimentText"].map(lambda x: get_tokens(sentence = x))
test_df["SentimentText"] = test_df["SentimentText"].map(lambda x: get_tokens(sentence = x))

In [342]:
print(df)
print(test_df)

       ItemID  Sentiment SentimentSource  \
0           1          0    Sentiment140   
1           2          0    Sentiment140   
2           3          1    Sentiment140   
3           4          0    Sentiment140   
4           5          0    Sentiment140   
...       ...        ...             ...   
89984   89996          1    Sentiment140   
89985   89997          1    Sentiment140   
89986   89998          1    Sentiment140   
89987   89999          0    Sentiment140   
89988   90000          0    Sentiment140   

                                           SentimentText  
0                                     [sad, apl, friend]  
1                           [missed, new, moon, trailer]  
2                                   [omg, already, 7:30]  
3      [, omgaga, im, sooo, im, gunna, cry, ve, denti...  
4                         [think, mi, bf, cheating, t_t]  
...                                                  ...  
89984  [clevercatsknit, gnome, hat, problem, finished...  

# Stemming

In [343]:
stemmer = PorterStemmer()

df["SentimentText"] = df["SentimentText"].apply(lambda x: [stemmer.stem(i) for i in x])
test_df["SentimentText"] = test_df["SentimentText"].apply(lambda x: [stemmer.stem(i) for i in x])

In [344]:
print(df["SentimentText"])
print(test_df["SentimentText"])

0                                       [sad, apl, friend]
1                               [miss, new, moon, trailer]
2                                     [omg, alreadi, 7:30]
3        [, omgaga, im, sooo, im, gunna, cri, ve, denti...
4                              [think, mi, bf, cheat, t_t]
                               ...                        
89984    [clevercatsknit, gnome, hat, problem, finish, ...
89985    [clevercatsknit, saw, linn, bakeri, thought, v...
89986                           [cleverdaisi, would, love]
89987                                    [cleverick, evid]
89988    [cleverindi, spine, thing, sound, good, back, ...
Name: SentimentText, Length: 89989, dtype: object
0       [clevermonkey, hit, magnolia, see, anvil, stor...
1       [clevermonkey, seen, boot, car, movi, goodth, ...
2              [clevertitania, good, morn, rain, thunder]
3                                   [clewis4u91, glad, m]
4                             [clexisep, mss, sad, cooki]
           

In [345]:
df["Sentimentstemmed"] = df["SentimentText"].apply(lambda x: ' '.join([w for w in x]))
test_df["SentimentText"] = test_df["SentimentText"].apply(lambda x: ' '.join([w for w in x]))

In [346]:
print(df)
print(test_df)

       ItemID  Sentiment SentimentSource  \
0           1          0    Sentiment140   
1           2          0    Sentiment140   
2           3          1    Sentiment140   
3           4          0    Sentiment140   
4           5          0    Sentiment140   
...       ...        ...             ...   
89984   89996          1    Sentiment140   
89985   89997          1    Sentiment140   
89986   89998          1    Sentiment140   
89987   89999          0    Sentiment140   
89988   90000          0    Sentiment140   

                                           SentimentText  \
0                                     [sad, apl, friend]   
1                             [miss, new, moon, trailer]   
2                                   [omg, alreadi, 7:30]   
3      [, omgaga, im, sooo, im, gunna, cri, ve, denti...   
4                            [think, mi, bf, cheat, t_t]   
...                                                  ...   
89984  [clevercatsknit, gnome, hat, problem, finish

# TF-IDF

In [347]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf(train, test):
    
    vectorizer = TfidfVectorizer(max_df = 0.90 , min_df = 10, stop_words = 'english')
    tfidf = vectorizer.fit_transform(train).toarray()
    tfidf_test = vectorizer.transform(test).toarray()
    return tfidf , tfidf_test



In [348]:
score , score_test = get_tfidf(df["Sentimentstemmed"], test_df["SentimentText"] )


In [349]:
print(score)
print(score_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [350]:
score

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Logistic Regression

In [351]:
X = score
y = df["Sentiment"]


def sigmoid(X, weight):
    z = nm.dot(X, weight)
    return 1 / (1 + nm.exp(-z))


In [352]:
def gradient_descent(X, h, y):
    return nm.dot(X.T, (h - y)) / y.shape[0]

def update_weight_loss(weight, learning_rate, gradient):
    return weight - learning_rate * gradient


In [353]:
def grad(X, y):
    num_iter = 100
   
    theta = nm.zeros(X.shape[1])
 
    for i in range(num_iter):
        h = sigmoid(X, theta)
        gradient = gradient_descent(X, h, y)
        theta = update_weight_loss(theta, 0.1, gradient)
    return theta

In [354]:
def predict_test(x, theta):
    theta_1 = theta[:, nm.newaxis]
    return sigmoid(x,theta_1)

def acc(actual, pred):
    predicted_class = ((pred >= 0.5) .astype(int))
    predicted_class = predicted_class.flatten()
    accuracy = nm.mean(predicted_class == actual)
    return accuracy

## K fold Cross Validation

In [355]:

kfold = KFold(10)
bestaccuracy = 0
theta_2 = nm.zeros(X.shape[1])

for train, test in kfold.split(X):
    X_train = X[train]
    X_validate = X[test]
    
    Y_train = y[train]
    Y_validate = y[test]
    
    theta_3 = grad(X_train, Y_train)
    
    pred = predict_test(X_validate, theta_3)
    
    accuracy = acc(Y_validate, pred)
    
    if(accuracy > bestaccuracy):
        theta_2 = theta_3
        bestaccuracy = accuracy

In [356]:
print(theta_2)
print(bestaccuracy)

[1.69669714e-05 4.11901541e-04 1.91843575e-04 ... 4.60787387e-05
 9.36247465e-05 2.26691401e-04]
0.7196355150572286


In [357]:
test_final = predict_test(score_test, theta_2)

In [358]:
test_final

array([[0.50058368],
       [0.5000251 ],
       [0.50322637],
       ...,
       [0.50115861],
       [0.50473716],
       [0.50536043]])

In [359]:
actual_y = test_df["Sentiment"]

accuracy = acc(actual_y, test_final)
accuracy

0.7081

In [360]:
precision = average_precision_score(actual_y, test_final)
test_final_1 = ((test_final >= 0.5) .astype(int))

prec = precision_score(actual_y, test_final_1)
recall = recall_score(actual_y, test_final_1)
conf = confusion_matrix(actual_y, test_final_1)

print('Average precision-recall score: {0:0.2f}'.format(precision))
print(prec)
print(recall)

Average precision-recall score: 0.81
0.7062927680128808
0.8782115448782115
