In [2]:
import pandas as pd
import nltk
import re
import string

def settingenv():
    '''
    Make sure the NLTK resources this notebook relies on are installed.

    Looks up the stopwords corpus and the punkt tokenizer data and downloads
    whichever of the two cannot be found locally. Returns nothing.
    '''
    # nltk.data.find() signals a missing resource by raising LookupError, not
    # by returning a falsy value, so the check has to be a try/except rather
    # than an `if not ...` test.
    # NOTE: nltk.download('word_tokenize') did not work for the original
    # author, so no download under that name is attempted here.
    required = (
        ('corpora/stopwords', 'stopwords'),
        ('tokenizers/punkt', 'punkt'),
    )
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)

def tweets(path = "tweets.csv"):
    '''
    Load the tweets CSV into a pandas DataFrame.
    path: location of the CSV file (anything pd.read_csv accepts)
    returns the DataFrame produced by pd.read_csv
    '''
    return pd.read_csv(path)

def sampling(tweets_df, count=20, random_state=None):
    '''
    Draw a random sample of tweets and return it as a corpus data frame.

    tweets_df: tweets data frame; must contain a 'text' column
    count: number of rows to sample. A value <= 0 keeps every row in its
        original order. A value larger than the frame is capped at the
        number of available rows instead of raising ValueError.
    random_state: optional seed forwarded to DataFrame.sample so that a
        sample can be reproduced; None keeps the draw unseeded.
    returns a new data frame with two columns: 'text' (cast to str) and
        'text_index' (the row's index label in tweets_df).
    '''
    if count <= 0:
        sample = tweets_df
    else:
        # DataFrame.sample refuses n > len(frame) when sampling without
        # replacement, so cap the request at the population size.
        sample = tweets_df.sample(n=min(count, len(tweets_df)),
                                  random_state=random_state)
    # Explicit copy: the columns added below must never touch tweets_df.
    corpus = sample.loc[:, ['text']].copy()
    corpus['text_index'] = corpus.index
    corpus['text'] = corpus['text'].astype(str)

    return corpus

def tokenize(insent):
    '''
    Break a sentence into words on runs of whitespace.
    insent: the sentence string
    returns the list of whitespace-separated words (empty for a blank string)
    '''
    return insent.split()

def stemming(inlist):
    '''
    Stem every word of inlist with NLTK's English Snowball stemmer.
    inlist: a list of words
    returns a new list holding the stem of each word, in input order
    '''
    english_stemmer = nltk.SnowballStemmer('english')
    return [english_stemmer.stem(token) for token in inlist]

def remove_stopwords(inlist):
    '''
    Drop English stopwords (e.g.: the, a, here, we) from a list of words.
    inlist: a list of words
    returns a new list with the remaining words in their original order
    '''
    from nltk.corpus import stopwords
    # A set gives constant-time membership tests while filtering.
    english_stopwords = set(stopwords.words("english"))
    return [token for token in inlist if token not in english_stopwords]

def remove_punkt(inlist, exemption = ('@', '#')):
    '''
    Strip punctuation characters out of every word in a list.

    inlist: a list of words
    exemption: punctuation strings that must be kept. By default @ and #
        are not removed since they are important features of tweets.
    returns a new list of the same length as inlist. A word that consisted
        only of punctuation comes back as an empty string; it is not dropped.
    '''
    # Start from every ASCII punctuation character and take out the exempted ones.
    punktlist = string.punctuation
    for e in exemption:
        punktlist = punktlist.replace(e, '')
    # The deletion table depends only on `exemption`, so build it once
    # instead of once per word.
    table = str.maketrans('', '', punktlist)
    return [word.translate(table) for word in inlist]

def remove_regex(inlist, pattern = r"@[\w]*"):
    '''
    Use a regular expression to remove words from a list.
    Removes @someone mentions by default.

    inlist: a list of words
    pattern: a single regular-expression string. It is applied with
        re.match, so it is tested at the START of each word only: a word is
        dropped when it begins with a match, even if more text follows.
    returns a new list with the non-matching words in their original order
    '''
    # Raw-string default: "\w" in a plain string literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    regex = re.compile(pattern)
    filtered_list = [x for x in inlist if not regex.match(x)]
    return filtered_list

def create_wordcount(corpus):
    '''
    Turn the token lists of a corpus into a word -> relative frequency dict.

    corpus: object whose `tokens` attribute supports .tolist() and yields one
        list of tokens per tweet (the 'tokens' column built in clean_data).
    returns a dict mapping each kept word to its share of all non-empty
        tokens, expressed in percent. A word is kept when it makes up more
        than 2% of the tokens, when it is in the fixed list of important
        words, or when it is among the 30 most common tokens.
    '''
    from collections import Counter
    counter = Counter(token for tokens in corpus.tokens.tolist() for token in tokens)
    c = dict(counter)
    # Remove spaces/invalid characters. A default is passed to pop() because
    # the empty token is not guaranteed to exist; without it a corpus with no
    # empty tokens raised KeyError.
    c.pop('', None)
    # Relative frequency instead of total count
    total = sum(c.values())
    # Words that should stay in graph, regardless of freq. Needed to keep 'hillary'
    important_words = {'hillary','clinton','donald','trump','obama','USA','vote',
    'elect','president','democrat','republican','democrats','republicans',
    'crooked','emails'}
    # The ranking is taken over the unfiltered counter, exactly as before, so
    # an empty token can still occupy one of the 30 slots.
    most_common = {word for word, _ in counter.most_common(30)}
    word_count = {k: v*100/total for k, v in c.items()
                  if v > total/50 or k in important_words or k in most_common}
    return word_count

def clean_data(sample_size, path = "tweets.csv"):
    '''
    Run the whole cleaning pipeline on a tweets CSV and return word frequencies.

    Steps, in order: set up NLTK data, load the CSV, sample the tweets,
    tokenize, stem, remove stopwords, strip punctuation, then drop
    @mentions, links, hashtags and retweet markers, and finally build the
    word count. Progress is printed before and after every step.

    sample_size: number of tweets to sample (passed to sampling as `count`)
    path: CSV file to read (passed to tweets)
    returns the dict produced by create_wordcount
    '''
    print('setting up environment...')
    settingenv()
    print('set up environment')

    print('loading tweets from file...')
    tweets_df = tweets(path)
    print('loaded tweets from file')

    print('selecting sample of tweets...')
    corpus = sampling(count=sample_size, tweets_df=tweets_df)
    print('selected sample of tweets')

    print('tokenizing tweets...')
    corpus['tokens'] = corpus['text'].apply(tokenize)
    print('tokenized tweets')

    # Each remaining step rewrites the 'tokens' column in place. One row per
    # step: (message before, message after, function, extra keyword arguments).
    # The order of the rows is the order of execution and must not change.
    # Regex patterns are raw strings so that "\w" is not read as a (invalid)
    # string escape.
    # NOTE: remove_regex matches at the start of a token, so "https*" drops
    # every token beginning with "http" and "rt" drops every token beginning
    # with "rt".
    token_steps = [
        ('stemming tweets...', 'stemmed tweets', stemming, {}),
        ('removing stopwords...', 'removed stopwords', remove_stopwords, {}),
        ('removing puncutation...', 'removed punctuation', remove_punkt, {}),
        # remove all @someone (remove_regex's default pattern)
        ('removing @someones...', 'removed @someones', remove_regex, {}),
        # remove all links
        ('removing links...', 'removed links', remove_regex, {'pattern': r"https*"}),
        # remove all hashtags
        ('removing hashtags...', 'removed hashtags', remove_regex, {'pattern': r"#[\w]*"}),
        # remove all retweets
        ('removing retweets', 'removed retweets', remove_regex, {'pattern': r"rt"}),
    ]
    for start_message, done_message, step, step_kwargs in token_steps:
        print(start_message)
        corpus['tokens'] = corpus['tokens'].apply(step, **step_kwargs)
        print(done_message)

    # create word count
    print('generating word count...')
    words = create_wordcount(corpus)
    print('generated word count')

    return words

# Run the full pipeline on a 200-tweet sample of the default CSV and print
# the resulting word-frequency dict. `words` stays bound at module level and
# is read again by a later cell (show_wordcloud(words = words)).
if __name__=='__main__':
    words=clean_data(sample_size=200)
    print(words)


setting up environment...
set up environment
loading tweets from file...
loaded tweets from file
selecting sample of tweets...
selected sample of tweets
tokenizing tweets...
tokenized tweets
stemming tweets...
stemmed tweets
removing stopwords...
removed stopwords
removing puncutation...
removed punctuation
removing @someones...
removed @someones
removing links...
removed links
removing hashtags...
removed hashtags
removing retweets
removed retweets
generating word count...
generated word count
{'us': 0.40705563093622793, 'stand': 0.27137042062415195, 'trump': 2.306648575305292, 'peopl': 0.40705563093622793, 'hillari': 0.6784260515603799, 'go': 0.47489823609226595, 'vote': 0.40705563093622793, 'clinton': 1.5603799185888738, 'amp': 0.746268656716418, 'never': 0.33921302578018997, 'would': 0.33921302578018997, 'one': 0.5427408412483039, 'make': 0.40705563093622793, 'u': 0.33921302578018997, 'say': 0.40705563093622793, 'new': 0.40705563093622793, 'obama': 0.13568521031207598, 'via': 0.339

In [4]:
def match_samples(dict_one, dict_two, sample_list):
    '''
    Pair up the values two dicts hold for each requested key.

    dict_one, dict_two: non-empty dicts
    sample_list: non-empty list of keys to look up; every key of both dicts
        must have the same type as its first element
    returns {key: (dict_one[key], dict_two[key])} for every entry of
        sample_list present in BOTH dicts; other entries are skipped.
    Raises AssertionError when any of the input checks fails.
    '''
    assert isinstance(dict_one, dict) and dict_one
    assert isinstance(dict_two, dict) and dict_two
    assert isinstance(sample_list, list) and sample_list
    # Key types are compared against the first sample only.
    sample_type = type(sample_list[0])
    for mapping in (dict_one, dict_two):
        assert all(sample_type == type(key) for key in mapping)

    pairs = {}
    for sample in sample_list:
        if sample in dict_one and sample in dict_two:
            pairs[sample] = (dict_one[sample], dict_two[sample])
    return pairs

def match_error(source_dict, other_dict, sample_list):
    '''
    Relative difference of other_dict against source_dict for selected keys.

    The keys are paired with match_samples, so only entries of sample_list
    found in both dicts appear in the result.
    returns {key: (other - source) / source}
    Raises ZeroDivisionError when a source value is 0.
    '''
    relative_errors = {}
    paired = match_samples(source_dict, other_dict, sample_list)
    for key, (source_value, other_value) in paired.items():
        relative_errors[key] = (other_value - source_value) / source_value
    return relative_errors


In [5]:
# Word frequencies from a 300-tweet random sample of tweets.csv.
# NOTE(review): presumably the troll/bot data set, going by the variable name
# and the histogram title used for it later — confirm.
trollWords = clean_data(sample_size=300, path='tweets.csv')

setting up environment...
set up environment
loading tweets from file...
loaded tweets from file
selecting sample of tweets...
selected sample of tweets
tokenizing tweets...
tokenized tweets
stemming tweets...
stemmed tweets
removing stopwords...
removed stopwords
removing puncutation...
removed punctuation
removing @someones...
removed @someones
removing links...
removed links
removing hashtags...
removed hashtags
removing retweets
removed retweets
generating word count...
generated word count


In [6]:
# Word frequencies from a 300-tweet random sample of election_day_tweets.csv.
# NOTE(review): presumably the ordinary-user data set, going by the variable
# name and the histogram title used for it later — confirm.
normalWords = clean_data(sample_size=300, path='election_day_tweets.csv')

setting up environment...
set up environment
loading tweets from file...
loaded tweets from file
selecting sample of tweets...
selected sample of tweets
tokenizing tweets...
tokenized tweets
stemming tweets...
stemmed tweets
removing stopwords...
removed stopwords
removing puncutation...
removed punctuation
removing @someones...
removed @someones
removing links...
removed links
removing hashtags...
removed hashtags
removing retweets
removed retweets
generating word count...
generated word count


In [7]:
# Relative difference of the 'vote' frequency in trollWords, measured against
# normalWords as the baseline. The result is empty if 'vote' is missing from
# either dict.
matched = match_error(normalWords, trollWords, ['vote'])

In [8]:
%matplotlib inline
# Pair the normal-user and troll frequencies for a fixed list of words; words
# missing from either dict are silently left out of `comparisons`.
comparison_words=['vote', 'trump', 'hillari', 'hillary', 'clinton', 'amp']
comparisons=match_samples(normalWords, trollWords, comparison_words)
print(comparisons)
#show_comparison(comparisons)
# Word groups handed to the comparison plots below.
list1 = ['hillari', 'clinton', 'hillary']
list2 = ['donald', 'trump']
# NOTE(review): show_cumulative_comparison, show_individual_comparison,
# show_wordcloud and show_histogram are not defined in any cell visible in
# this notebook; the recorded output of this cell is a NameError raised by
# the first of them. Define or import these helpers before running this cell.
show_cumulative_comparison(comparisons, list1, list2)
show_individual_comparison(comparisons, list1, list2)
# `words` is the dict bound by the __main__ block of the first code cell.
show_wordcloud(words = words)
show_histogram(trollWords,title='Russian Twitter Bot Word Frequency')
show_histogram(normalWords,title='User Political Tweet Word Frequency')
show_histogram(matched,title='Word Comparison')

{'vote': (3.1346578366445916, 0.26881720430107525), 'trump': (1.5894039735099337, 1.657706093189964), 'hillari': (0.4856512141280353, 0.8512544802867383), 'hillary': (0.17660044150110377, 0.044802867383512544), 'clinton': (0.6181015452538632, 0.8512544802867383)}


NameError: name 'show_cumulative_comparison' is not defined

In [13]:
%matplotlib inline

In [11]:
import os

import plotly

# Credentials are read from the environment instead of being written into the
# notebook. The API key that used to be hardcoded here has been exposed to
# everyone with access to this file and should be regenerated.
_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if not _plotly_username or not _plotly_api_key:
    raise RuntimeError(
        'Set the PLOTLY_USERNAME and PLOTLY_API_KEY environment variables '
        'before running this cell.'
    )
plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
# Imported after the credentials are stored, as in the original cell.
import plotly.plotly as py
import plotly.graph_objs as go

def holoviewbar(comparisons, max_items=5):
    '''
    Draw a grouped bar chart of paired word frequencies with plotly.

    Despite its name this function makes no holoviews call: the chart is
    assembled from plotly graph objects and sent through py.iplot under the
    filename 'individual_comparisons'.

    comparisons: dict mapping a word to a pair of values. Element 0 of each
        pair is plotted as 'Normal Users', element 1 as 'Russian Trolls'.
    max_items: number of entries to plot, taken in dict order. Defaults to 5,
        the limit that used to be hard-coded; None plots every entry.
    returns the value returned by py.iplot. The original discarded it, so a
        notebook cell ending in holoviewbar(...) had no object to display.
        NOTE(review): presumably this is why no chart was rendered — verify
        against the installed plotly version.
    '''
    assert isinstance(comparisons, dict)

    shown = list(comparisons.items())[:max_items]
    words = [word for word, _ in shown]
    normal_values = [pair[0] for _, pair in shown]
    troll_values = [pair[1] for _, pair in shown]

    trace1 = go.Bar(
        x=words,
        y=normal_values,
        name='Normal Users',
        marker=dict(color='rgb(0,147,250)'),
    )
    trace2 = go.Bar(
        x=list(words),
        y=troll_values,
        name='Russian Trolls',
        marker=dict(color='rgb(255,0,11)'),
    )

    # Both axes use the same tick font; each gets its own copy of the dict.
    tick_font = dict(family='Old Standard TT, serif', size=23, color='black')
    layout = go.Layout(
        barmode='group',
        legend=dict(
            x=0.70,
            y=0.95,
            font=dict(family='Old Standard TT, serif', size=23, color='#000'),
        ),
        xaxis=go.layout.XAxis(tickfont=dict(tick_font)),
        yaxis=go.layout.YAxis(tickfont=dict(tick_font)),
    )

    fig = go.Figure(data=[trace1, trace2], layout=layout)
    return py.iplot(fig, filename='individual_comparisons')

# Plot the paired frequencies held in `comparisons`, the dict built with
# match_samples in an earlier cell.
holoviewbar(comparisons)