# Sentiment Analysis on tweets for US 16 Election Analysis

### This notebook will try to analyze tweets before US 16 Elections, on May 25 until May 27. Tweets was initially fetched with a python tweepy streamer and stored at a mongoDB.

##### parsing the mongoDB databases

In [21]:
from pymongo import MongoClient


client = MongoClient("127.0.0.1:27017")

#name of the db
db = client["dbTweetsForAnalysis"]

#name of the collection
coll = db['rawTweetsForAnalysis']




##### Tokenize all text and save all words in a dictionary

In [52]:
from nltk.tokenize import RegexpTokenizer
import collections
from nltk.corpus import stopwords

import re

#just fetch all json files from mongoDB collection
cursor = coll.find()

wordList = []
dictWords = {}
tokenizer = RegexpTokenizer(pattern="[^ ]+")
english_stops = set(stopwords.words('english'))



for document in cursor:
    if "lang" in document and "text" in document and document["lang"] == "en":
        
        text =  document["text"].encode("utf-8")
        
        #cleaning the text
        text = str(text)
        text = text[2:]
        
        text = re.sub("http[s]?:*/*/*.*", "", text)
        text = re.sub("RT ", "", text)
        text = re.sub("[\\\]x.{2}", "", text)
        text = re.sub("[\\\]\'", "\'", text)
        text = re.sub("\"", "", text)
        text = re.sub("\'", "", text)
        text = re.sub("[\\\]n", "", text)
        text = re.sub("&.+;", "", text)
        text = re.sub("[a-zA-z]/", "", text)
        text = re.sub("\.", "", text)
        text = re.sub("~", "", text)
        text = str.lower(text)
        #print(text,"\n")
        
        words =  tokenizer.tokenize(text)
        for word in words:
            if word not in english_stops:
                if word not in dictWords:
                    dictWords[word] = 1
                    wordList.append(word)
                else:
                    dictWords[word] += 1


        
               

#sort the dict by key
#dictWords = collections.OrderedDict(sorted(dictWords.items()))

#for item, value in dictWords.items():
#    if value > 10:
#        print(item,":", value,"\n")

#print(wordList)
        

client.close()

##### Removing repeating words

In [51]:
from nltk.corpus import wordnet

class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(.*)(.)\2(.*)')
        self.repl = r'\1\2\3'
    
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        
        repl_word = self.repeat_regexp.sub(self.repl, word)
    
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word
    
    

replacer = RepeatReplacer()    
for word in wordList:
    wordList[wordList.index(word)] = replacer.replace(word)
    
    
print (wordList)

KeyboardInterrupt: 

##### Stemming the words

In [48]:
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()
stemmedWordList = []

for word in wordList:
    
    stemmedWordList.append(stemmer.stem(word))
    
#print (stemmedWordList)

['#makehilarydebateagain', '@hilaryclinton', 'promised,', 'debate!', 'flak', 'debate,', 'what', 'next?', '#justsaying', 'wha!', '@realdonaldtrump', 'choos', 'color!', '@keitholberman:', 'cal', 'dea', 'vint', 'fost', 'fishy,', 'thu', 'question:', 'account', 'whereabout', '@marv_vien:', 'trump', 'nra', 'got', 'noth', '@sybrinafulton,', 'moth', 'mov', '#stopgunviolence', '@davidaxelrod', '@sensanders', 'numb', 'declin', '#as', 'us', '#bad', 'candid', '@realdonaldtrump:', 'thank', 'america!', '#trump2016', '@salon', 'vot', 'clintonsbl', 'presidenthil', 'maidand', 'cle', 'americano', 'repeatsvot', '@foxnews:', 'tough', 'vigil', 'smart,', 'going', 'big', 'troubl', '@realthebernison:', 'berniesanders:', 'disappoint', 'surpr', 'secret', 'clinton', 'unwil', 'deb', 'larg', 'ht', 'saw', 'day1', 'anoint', 'god', '4this', 'point', 'tim', 'help', '2return', 'olandogod', '10%', '#potustrump', '2016', 'peopl', 'chang', 'horseshit', 'thing', 'anyway!', 'efem!', '@reince', '@seanhanity', '@lrihendry', '

##### Lemmatizing words

In [49]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
LemmaWordList = []

for word in wordList:
    
    LemmaWordList.append(lemmatizer.lemmatize(word))
    
#print (LemmaWordList)

['#makehilarydebateagain', '@hilaryclinton', 'promised,', 'debate!', 'flake', 'debate,', 'whats', 'next?', '#justsaying', 'wha!', '@realdonaldtrump', 'choose', 'color!', '@keitholberman:', 'calling', 'death', 'vince', 'foster', 'fishy,', 'thus', 'question:', 'account', 'whereabouts', '@marv_vien:', 'trump', 'nra', 'got', 'nothing', '@sybrinafulton,', 'mother', 'movement', '#stopgunviolence', '@davidaxelrod', '@sensanders', 'number', 'decline', '#as', 'usual', '#bad', 'candidate', '@realdonaldtrump:', 'thank', 'america!', '#trump2016', '@salon', 'vote', 'clintonsbil', 'presidenthilary', 'maidand', 'cleaned', 'americano', 'repeatsvote', '@foxnews:', 'tough', 'vigilant', 'smart,', 'going', 'big', 'trouble', '@realthebernison:', 'berniesanders:', 'disappointed', 'surprised', 'secretary', 'clinton', 'unwillingness', 'debate', 'large', 'ht', 'saw', 'day1', 'anointed', 'god', '4this', 'point', 'time', 'help', '2return', 'olandogod', '10%', '#potustrump', '2016', 'people', 'change', 'horseshit