# Sentiment Analysis for Twitter

### Imports

In [1]:
import nltk
import collections
import numpy as np
import pandas as pd
import nltk.classify.util
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords
from nltk.corpus import movie_reviews
from nltk.metrics import precision, recall
from wordcloud import WordCloud, STOPWORDS
from nltk.classify import SklearnClassifier
from nltk.classify import NaiveBayesClassifier
from sklearn.model_selection import train_test_split

In [2]:
def get_train_df():
    columns_dataset1 = ['Index', 'sentiment', 'Source', 'text']
    train_set_1 = pd.read_csv('../Tweet Sentiment Datasets/dataset1.csv',
                              error_bad_lines=False, warn_bad_lines=False)

    columns_dataset2 = ['sentiment', 'tweetId', 'Date&Time', 'query', 'user', 'text']
    train_set_2 = pd.read_csv('../Tweet Sentiment Datasets/dataset2.csv', encoding = "latin1",
                              error_bad_lines=False, warn_bad_lines=False, header=None)

    columns_dataset3 = ['sentiment', 'text']
    train_set_3 = pd.read_csv('../Tweet Sentiment Datasets/dataset3.tsv', sep='\t',
                              error_bad_lines=False, warn_bad_lines=False)
    
    train_set_1.columns = columns_dataset1
    train_set_2.columns = columns_dataset2
    train_set_3.columns = columns_dataset3
    
    train_set_2['sentiment'] = train_set_2.sentiment.apply(lambda x: 1 if x==4 else x)
    
    train_df = train_set_1[['sentiment', 'text']]
    train_df = pd.concat([train_df, train_set_2[['sentiment', 'text']]], ignore_index=True)
    train_df = pd.concat([train_df, train_set_3[['sentiment', 'text']]], ignore_index=True)
    
    train_df['sentiment'] = train_df.sentiment.apply(lambda x: int(x))
    train_df['text'] = train_df.text.apply(lambda x: x.split(' '))
    
    return train_df

In [3]:
train_df = get_train_df()
train, test = train_test_split(train_df, test_size=0.1)

print(train.shape, test.shape)

(2866976, 2) (318553, 2)


In [4]:
train_pos = train[train['sentiment'] == 1]['text']
train_neg = train[train['sentiment'] == 0]['text']
test_pos = test[test['sentiment'] == 1]['text']
test_neg = test[test['sentiment'] == 0]['text']

In [5]:
def word_feats(words):
    return dict([(word, True) for word in words])
 
def evaluate_classifier(featx, train_neg, train_pos, test_neg, test_pos):
    train_neg_feats = [(featx(word_list), 'neg') for word_list in train_neg]
    train_pos_feats = [(featx(word_list), 'pos') for word_list in train_pos]
    
    test_neg_feats = [(featx(word_list), 'neg') for word_list in test_neg]
    test_pos_feats = [(featx(word_list), 'pos') for word_list in test_pos]
 
    trainfeats = train_neg_feats + train_pos_feats
    testfeats = test_neg_feats + train_pos_feats
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    print('pos precision:', precision(refsets['pos'], testsets['pos']))
    print('pos recall:', recall(refsets['pos'], testsets['pos']))
    print('neg precision:', precision(refsets['neg'], testsets['neg']))
    print('neg recall:', recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()

In [6]:
evaluate_classifier(word_feats, train_neg, train_pos, test_neg, test_pos)

accuracy: 0.8283657367683067
pos precision: 0.9038404858355096
pos recall: 0.7352573824605227
neg precision: 0.7765431488684791
neg recall: 0.9216381749274557
Most Informative Features
                 me..its = True              pos : neg    =    154.1 : 1.0
                 bummed. = True              neg : pos    =    131.9 : 1.0
                   Died! = True              neg : pos    =     95.2 : 1.0
             @Banksyart2 = True              pos : neg    =     89.5 : 1.0
                Fuzzball = True              pos : neg    =     74.2 : 1.0
                     228 = True              neg : pos    =     65.1 : 1.0
                     447 = True              neg : pos    =     63.5 : 1.0
                  Farrah = True              neg : pos    =     62.4 : 1.0
                 McMahon = True              neg : pos    =     62.3 : 1.0
                  sad!!! = True              neg : pos    =     61.8 : 1.0


## Another Implementation

In [None]:
def wordcloud_draw(data, color='black'):
    words = ' '.join(data)
    cleaned_word = " ".join([word for word in words.split()
                            if 'http' not in word
                                and not word.startswith('@')
                                and not word.startswith('#')
                                and word != 'RT'
                            ])
    wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color=color,
                      width=2500,
                      height=2000
                     ).generate(cleaned_word)
    plt.figure(1,figsize=(13, 13))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

print("Positive words")
wordcloud_draw(train_pos, 'white')
print("Negative words")
wordcloud_draw(train_neg)

In [None]:
def clean_tweets(train):
    tweets = []
    stopwords_set = set(stopwords.words("english"))

    for row in train.itertuples():
        words_filtered = [e.lower() for e in row.text.split() if len(e) >= 3]
        words_cleaned = [word for word in words_filtered
            if 'http' not in word
            and '://' not in word
            and not word.startswith('@')
            and not word.startswith('#')
            and word not in stopwords_set
            and word != 'RT']
        tweets.append((words_cleaned, row.sentiment))
    
    return tweets

# Extracting word features
def get_words_in_tweets(tweets):
    all = []
    for (words, sentiment) in tweets:
        all.extend(words)
    return all

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    features = wordlist.keys()
    return features

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in w_features:
        features['containts(%s)' % word] = (word in document_words)
    return features

# tweets = clean_tweets(train_)
# w_features = get_word_features(get_words_in_tweets(tweets))
# wordcloud_draw(w_features)

In [None]:
training_set = nltk.classify.apply_features(extract_features, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [None]:
print('accuracy:', nltk.classify.util.accuracy(classifier, test))
classifier.show_most_informative_features()

In [None]:
neg_cnt = 0
pos_cnt = 0
for obj in test_neg: 
    res =  classifier.classify(extract_features(obj.split()))
    if(res == 'Negative'): 
        neg_cnt = neg_cnt + 1
for obj in test_pos: 
    res =  classifier.classify(extract_features(obj.split()))
    if(res == 'Positive'): 
        pos_cnt = pos_cnt + 1
        
print('[Negative]: %s/%s '  % (len(test_neg),neg_cnt))        
print('[Positive]: %s/%s '  % (len(test_pos),pos_cnt))    