# Building a lexicon out of negative and positive tweets

In order to find sentiment not only for unigrams or complete sentences, we need to build a lexicon that covers n-grams (n >= 2). We achieve this by finding bi- and trigrams in a positive and a negative dataset. Afterwards the corresponding sentiment score is calculated for each n-gram.

In [152]:
import pandas as pd
import numpy as np
import re

# Load the positive and the negative corpus. The "target" label column is
# dropped from each frame. `columns=` is used because the positional `axis`
# argument of DataFrame.drop was removed in pandas 2.0.
dfPos = pd.read_csv("data/positive_dataset.csv").drop(columns="target")
dfNeg = pd.read_csv("data/negative_dataset.csv").drop(columns="target")

The first step is to clean the data of mentions, hashtags, retweet markers and links.

In [153]:
def clean_tweets(df):
    """Strip Twitter-specific noise from the ``text`` column of ``df``.

    Removed, in this order: the literal ``@mention`` placeholder, ``@user``
    mentions, ``#hashtags``, links (the ``http...`` token only, not the text
    that follows it) and the stand-alone retweet marker ``RT``. Surrounding
    whitespace is trimmed after every step.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``"text"``.

    Returns
    -------
    pandas.DataFrame
        The same frame object; its ``"text"`` column is overwritten in place.
    """
    # (pattern, is_regex) pairs. `regex` is passed explicitly because the
    # default of Series.str.replace changed from True to False in pandas 2.0,
    # which would silently turn the patterns into literal strings.
    removals = (
        ("@mention", False),
        (r"@\w*", True),
        (r"#\w*", True),
        (r"http\S*", True),   # only the link token, not the rest of the tweet
        (r"\bRT\b", True),    # word boundaries keep words such as "SMART" intact
    )
    text = df["text"]
    for pattern, is_regex in removals:
        text = text.str.replace(pattern, "", regex=is_regex).str.strip()
    df["text"] = text
    return df

In [154]:
# Clean both corpora, then preview the first five rows of each
# (negative corpus first, positive corpus second).
dfPos, dfNeg = clean_tweets(dfPos), clean_tweets(dfNeg)
print(dfNeg.head(5), dfPos.head(5), sep="\n")

                                                text
0  So there is no way for me to plug it in here i...
1  Tied to charger for conversations lasting more...
2  I have to jiggle the plug to get it to line up...
3  If you have several dozen or several hundred c...
4                Needless to say, I wasted my money.
                                          text
0                  Good case, Excellent value.
1                       Great for the jawbone.
2                            The mic is great.
3  If you are Razr owner...you must have this!
4              And the sound quality is great.


Having a cleaned dataset we define a method to extract ngrams

In [155]:
import nltk
from nltk.util import ngrams
from nltk.collocations import *
from textblob import TextBlob

def extract_ngrams(data, n):
    """Collect all word n-grams from the ``"text"`` entries of ``data``.

    Every entry is split on whitespace and each run of ``n`` consecutive
    tokens is emitted as a tuple (the same tuples ``nltk.util.ngrams``
    produces without padding). N-grams never span two entries, and entries
    with fewer than ``n`` tokens contribute nothing.

    Parameters
    ----------
    data : pandas.DataFrame or mapping
        Object whose ``data["text"]`` is an iterable of tweets.
    n : int
        Number of tokens per n-gram (2 for bigrams, 3 for trigrams).

    Returns
    -------
    list of tuple of str
        All n-grams in order of appearance, duplicates included.
    """
    collected = []
    for tweet in data["text"]:
        # Missing text is read from CSV as NaN (a float), which has no
        # .split(); skip anything that is not a string instead of crashing.
        if not isinstance(tweet, str):
            continue
        tokens = tweet.split()
        # Sliding window: zip the token list against itself shifted by 0..n-1.
        collected.extend(zip(*(tokens[offset:] for offset in range(n))))
    return collected

In [156]:
# Extract bigrams and trigrams separately for each polarity.
bigramsPos, bigramsNeg = extract_ngrams(dfPos, 2), extract_ngrams(dfNeg, 2)
trigramsNeg, trigramsPos = extract_ngrams(dfNeg, 3), extract_ngrams(dfPos, 3)

We build dictionaries for bi-/trigrams and count their occurrences

In [157]:
from collections import Counter

# Frequency tables: n-gram tuple -> number of occurrences, one per
# (n-gram size, polarity) combination.
cntTrigramsPos, cntBigramsPos = Counter(trigramsPos), Counter(bigramsPos)
cntBigramsNeg, cntTrigramsNeg = Counter(bigramsNeg), Counter(trigramsNeg)

# Remove margins!!!

In [201]:
# Plan: compare the keys of the positive and negative counters and use the
# formula below to build a sentiment score per n-gram, which forms the lexicon.
# First, peek at the three most frequent positive trigrams.
cntTrigramsPos.most_common(3)

[(('This', 'is', 'a'), 7),
 (('I', 'love', 'this'), 6),
 (('I', 'am', 'very'), 5)]

In [202]:
# The three most frequent negative trigrams, for comparison.
cntTrigramsNeg.most_common(3)

[(('I', 'had', 'to'), 6),
 (('work', 'with', 'my'), 4),
 (('I', 'could', 'not'), 4)]

In [203]:
# Distinct n-grams (the counter keys) per polarity, so the positive and
# negative vocabularies can be compared with set operations.
trigramsPosSet, trigramsNegSet = set(cntTrigramsPos), set(cntTrigramsNeg)
bigramsPosSet, bigramsNegSet = set(cntBigramsPos), set(cntBigramsNeg)

In [193]:
def calculate_sentiment_score(nrPos, nrNeg):
    """Return the normalised polarity of an n-gram from its occurrence counts.

    The score is ``(nrPos - nrNeg) / (nrPos + nrNeg)``: +1.0 when the n-gram
    occurs only in positive text, -1.0 when it occurs only in negative text,
    and 0.0 when it is equally frequent in both.

    Parameters
    ----------
    nrPos : int
        Occurrences in the positive corpus.
    nrNeg : int
        Occurrences in the negative corpus.

    Returns
    -------
    float
        The sentiment score; 0.0 (neutral) when both counts are zero.
    """
    total = nrPos + nrNeg
    if total == 0:
        # No evidence either way; avoid ZeroDivisionError and stay neutral.
        return 0.0
    return (nrPos - nrNeg) / total

In [196]:
# TRIGRAMS
# Score every trigram that occurs in both the positive and the negative
# corpus; trigrams seen in only one of them are not part of the result.
scoresTrigrams = {
    trigram: calculate_sentiment_score(cntTrigramsPos[trigram], cntTrigramsNeg[trigram])
    for trigram in trigramsPosSet.intersection(trigramsNegSet)
}

In [204]:
# BIGRAMS
# Score every bigram that occurs in both the negative and the positive
# corpus; bigrams seen in only one of them are not part of the result.
scoresBigrams = {
    bigram: calculate_sentiment_score(cntBigramsPos[bigram], cntBigramsNeg[bigram])
    for bigram in bigramsNegSet.intersection(bigramsPosSet)
}

In [None]:
# TODO: filter out scores in the interval (-0.1, 0.1).

In [205]:
# Display the bigram lexicon: bigram tuple -> sentiment score.
scoresBigrams

{('-', 'It'): 0.0,
 ('All', 'in'): 0.3333333333333333,
 ('BT', 'headset'): 0.0,
 ('Battery', 'is'): 0.0,
 ('Battery', 'life'): 0.3333333333333333,
 ('Everything', 'about'): 0.0,
 ('I', 'am'): 0.4117647058823529,
 ('I', 'bought'): 0.09090909090909091,
 ('I', 'can'): 0.14285714285714285,
 ('I', 'could'): -0.4,
 ('I', "couldn't"): 0.0,
 ('I', 'did'): -0.3333333333333333,
 ('I', 'did.'): 0.0,
 ('I', "don't"): -0.6,
 ('I', 'ended'): 0.0,
 ('I', 'even'): 0.0,
 ('I', 'exchanged'): 0.0,
 ('I', 'found'): -0.3333333333333333,
 ('I', 'got'): 0.6,
 ('I', 'had'): -0.16666666666666666,
 ('I', 'have'): 0.1111111111111111,
 ('I', 'like'): 0.5,
 ('I', 'made'): 0.0,
 ('I', 'purchased'): 0.0,
 ('I', 'really'): 0.2,
 ('I', 'received'): 0.5,
 ('I', 'started'): 0.0,
 ('I', 'still'): 0.0,
 ('I', 'think'): 0.0,
 ('I', 'thought'): 0.3333333333333333,
 ('I', 'use'): 0.0,
 ('I', 'used'): 0.0,
 ('I', 'want'): 0.0,
 ('I', 'was'): -0.17647058823529413,
 ('I', 'would'): 0.38461538461538464,
 ("I'm", 'very'): 0.0,
 (

In [200]:
# Display the trigram lexicon: trigram tuple -> sentiment score.
scoresTrigrams

{('All', 'in', 'all,'): 0.0,
 ('I', 'bought', 'this'): 0.5,
 ('I', 'can', 'say'): 0.0,
 ('I', 'could', 'not'): -0.6,
 ('I', 'did', 'not'): 0.0,
 ('I', 'ended', 'up'): 0.0,
 ('I', 'got', 'the'): 0.0,
 ('I', 'had', 'to'): -0.7142857142857143,
 ('I', 'have', 'ever'): 0.3333333333333333,
 ('I', 'have', 'had'): 0.0,
 ('I', 'have', 'to'): -0.3333333333333333,
 ('I', 'like', 'it.'): 0.0,
 ('I', 'received', 'my'): 0.3333333333333333,
 ('I', 'think', 'it'): 0.0,
 ("I've", 'had', 'this'): -0.3333333333333333,
 ('If', 'you', 'are'): 0.0,
 ('It', 'was', 'a'): 0.0,
 ('Thank', 'you', 'for'): 0.0,
 ('The', 'battery', 'is'): -0.3333333333333333,
 ('The', 'case', 'is'): 0.0,
 ('The', 'design', 'is'): 0.0,
 ('The', 'sound', 'quality'): 0.3333333333333333,
 ('They', 'do', 'not'): 0.0,
 ('This', 'is', 'a'): 0.5555555555555556,
 ('This', 'is', 'the'): 0.0,
 ('This', 'product', 'is'): 0.0,
 ('a', 'couple', 'of'): 0.0,
 ('a', 'few', 'days'): 0.0,
 ('a', 'good', 'quality'): 0.0,
 ('a', 'lot', 'better'): 0.0,
