In [15]:
import pandas as pd
import numpy as np

#nlp
import spacy
import re
from textblob import TextBlob

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 

#LDA / topical modeling
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

from nltk.corpus import stopwords
stop = stopwords.words('english')

#Visualizations
import plotly

In [16]:
stop += ['rt']

## Data Processing

In [3]:
def add_datepart(df, fldname, drop=True, time=False):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.
    Parameters:
    -----------
    df: A pandas data frame. df gain several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.
    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'], infer_datetime_format=False) })
    >>> df
        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13
    >>> add_datepart(df, 'A')
    >>> df
        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

In [4]:
tweets = pd.read_csv('../../finalprojdata/tweets.csv')
users = pd.read_csv('../../finalprojdata/users.csv')

In [5]:
fulltweets = tweets.merge(users, how='left', left_on='user_key', right_on='screen_name')

In [6]:
add_datepart(fulltweets, 'created_str')

## NLP

In [15]:
## Create a sentiment column

def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analyze_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob.
    '''
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

In [18]:
tweetsonly2 = fulltweets.text.copy().astype(str)
tweetsonly2 = tweetsonly2.str.replace('[^\w\s]','')
tweetsonly2 = tweetsonly2.str.replace('[\\r|\\n|\\t|_]',' ')
tweetsonly2 = tweetsonly2.str.strip()
tweetsonly2

fulltweets2 = fulltweets.copy()
fulltweets2.text = tweetsonly2

In [19]:
fulltweets2.text = fulltweets2.text.apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop)]))
fulltweets2['Sentiment'] = np.array([analyze_sentiment(str(tweet)) for tweet in fulltweets2.text.values])

fulltweets2.text = fulltweets2.text.apply(lambda x: ' '.join([word.lower() for word in x.split() if len(word) > 3]))

In [20]:
fulltweets2.head()

Unnamed: 0,user_id,user_key,created_at_x,retweet_count,retweeted,favorite_count,text,tweet_id,source,hashtags,...,friends_count,listed_count,created_strYear,created_strMonth,created_strWeek,created_strDay,created_strDayofweek,created_strDayofyear,created_strElapsed,Sentiment
0,1868981000.0,ryanmaxwell_1,1458672000000.0,,,,islamkills trying terrorist attacks europe ref...,7.12346e+17,,"[""IslamKills""]",...,,,2016.0,3.0,12.0,22.0,1.0,82.0,1458671502,0.0
1,2571870000.0,detroitdailynew,1476133000000.0,0.0,False,0.0,clinton trump shouldve apologized attacked les...,7.855849e+17,"<a href=""http://twitterfeed.com"" rel=""nofollow...",[],...,,,2016.0,10.0,41.0,10.0,0.0,284.0,1476133020,-0.166667
2,1710805000.0,cookncooks,1487767000000.0,,,,ltapoll wasis best president past years vote r...,8.343832e+17,,[],...,,,2017.0,2.0,8.0,22.0,2.0,53.0,1487767423,0.375
3,2584153000.0,queenofthewo,1482765000000.0,,,,jww372 dont guess religion christmasaftermath,8.134006e+17,,"[""ChristmasAftermath""]",...,1427.0,11.0,2016.0,12.0,52.0,26.0,0.0,361.0,1482764801,0.0
4,1768260000.0,mrclydepratt,1501987000000.0,,,,shareblue pence lawyers decided official email...,8.940243e+17,,[],...,847.0,23.0,2017.0,8.0,31.0,6.0,6.0,218.0,1501986984,0.0


In [21]:
fulltweets2.to_csv('/Users/shsu/Downloads/fulltweets.csv')

## spacy implementation

https://nlpforhackers.io/complete-guide-to-spacy/

In [9]:
nlp = spacy.load('en')

## LDA 
https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

might be too big of data:

https://turi.com/products/create/docs/graphlab.toolkits.text_analytics.html 

In [77]:
fulltweets2.text.values

array(['islamkills trying terrorist attacks europe refugees',
       'clinton trump shouldve apologized attacked less httpstcoejampkohfz',
       'ltapoll wasis best president past years vote retweet', ...,
       'signsinyork getting right company logo businesses message across smallbusinessuk httpstcofnse5ga6dc',
       'latest obama affirms continuity ties canada httpstcoq4bznwxdss httpstcogpy8dgkiff',
       'futureguru100 cant upload online thats product worku gotta represent brand qual'], dtype=object)

In [24]:
# compile sample documents into a list
doc_set = fulltweets2.text.values.copy()

# list for tokenized documents in loop
#texts = np.array(['']*fulltweets2.shape[0])

# loop through document list
texts = [text.split(' ') for text in doc_set]

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary)

In [33]:
print(ldamodel.print_topics(num_topics=10, num_words=10))

[(23, '0.060*"america" + 0.047*"make" + 0.031*"great" + 0.029*"today" + 0.014*"sure" + 0.013*"done" + 0.012*"trump2016" + 0.012*"join" + 0.011*"plan" + 0.011*"realdonaldtrump"'), (24, '0.051*"dont" + 0.040*"know" + 0.028*"trump" + 0.027*"going" + 0.027*"white" + 0.021*"house" + 0.018*"support" + 0.018*"want" + 0.016*"people" + 0.015*"come"'), (20, '0.133*"trump" + 0.048*"donald" + 0.036*"news" + 0.032*"trumps" + 0.023*"clinton" + 0.022*"election" + 0.022*"president" + 0.020*"hillary" + 0.019*"politics" + 0.017*"media"'), (3, '0.028*"real" + 0.013*"https" + 0.011*"shows" + 0.011*"reason" + 0.010*"polls" + 0.009*"thingsmoretrustedthanhillary" + 0.009*"realdonaldtrump" + 0.009*"trump" + 0.009*"protect" + 0.008*"thehill"'), (13, '0.021*"russia" + 0.018*"rally" + 0.016*"getting" + 0.014*"talk" + 0.011*"fire" + 0.010*"christmasaftermath" + 0.010*"using" + 0.010*"florida" + 0.009*"hear" + 0.009*"tweets"'), (7, '0.086*"hillary" + 0.075*"clinton" + 0.050*"tcot" + 0.036*"pjnet" + 0.022*"bill" + 

In [2]:
!pip install pyldavis
#!pip install graphlab-create

Collecting pyldavis
Collecting funcy (from pyldavis)
  Downloading https://files.pythonhosted.org/packages/55/bd/e644bea50e4e69be1b534021dd0504de77a058aa3fa8036f8032f1dde3a3/funcy-1.10.1.tar.gz
Collecting joblib>=0.8.4 (from pyldavis)
  Downloading https://files.pythonhosted.org/packages/4f/51/870b2ec270fc29c5d89f85353da420606a9cb39fba4747127e7c7d7eb25d/joblib-0.11-py2.py3-none-any.whl (176kB)
[K    100% |████████████████████████████████| 184kB 3.2MB/s ta 0:00:01
Collecting numexpr (from pyldavis)
  Downloading https://files.pythonhosted.org/packages/33/c3/a121c7022ad2abab4f4e6a6db53468211bc5eba491841ce9fc43ff586554/numexpr-2.6.4-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (171kB)
[K    100% |████████████████████████████████| 174kB 3.1MB/s ta 0:00:011
Building wheels for collected packages: funcy
  Running setup.py bdist_wheel for funcy ... [?25ldone
[?25h  Stored in directory: /Users/shsu/Library/Caches/pip/wheels/e8

## Visualize topics

In [6]:
fulltweet = pd.read_csv('/Users/shsu/Downloads/fulltweets.csv')

In [17]:
#p_stemmer = PorterStemmer()

# compile sample documents into a list
doc_set = fulltweet.text.astype(str).values.copy()

# list for tokenized documents in loop
#texts = np.array(['']*fulltweets2.shape[0])

# loop through document list
texts = [text.split(' ') for text in doc_set]

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary)

In [43]:
dictionary.save('dictionary.dict')

corpora.MmCorpus.serialize('corpus.mm', corpus)

ldamodel.save('topic.model')


d = gensim.corpora.Dictionary.load('dictionary.dict')
c = gensim.corpora.MmCorpus('corpus.mm')
lda = gensim.models.LdaModel.load('topic.model')

In [18]:
import pandas as pd
import pyLDAvis
import pyLDAvis.graphlab

pyLDAvis.enable_notebook()

In [22]:
fulltweet.head()

Unnamed: 0.1,Unnamed: 0,user_id,user_key,created_at_x,retweet_count,retweeted,favorite_count,text,tweet_id,source,...,friends_count,listed_count,created_strYear,created_strMonth,created_strWeek,created_strDay,created_strDayofweek,created_strDayofyear,created_strElapsed,Sentiment
0,0,1868981000.0,ryanmaxwell_1,1458672000000.0,,,,islamkills trying terrorist attacks europe ref...,7.12346e+17,,...,,,2016.0,3.0,12.0,22.0,1.0,82.0,1458671502,0.0
1,1,2571870000.0,detroitdailynew,1476133000000.0,0.0,False,0.0,clinton trump shouldve apologized attacked les...,7.855849e+17,"<a href=""http://twitterfeed.com"" rel=""nofollow...",...,,,2016.0,10.0,41.0,10.0,0.0,284.0,1476133020,-0.166667
2,2,1710805000.0,cookncooks,1487767000000.0,,,,ltapoll wasis best president past years vote r...,8.343832e+17,,...,,,2017.0,2.0,8.0,22.0,2.0,53.0,1487767423,0.375
3,3,2584153000.0,queenofthewo,1482765000000.0,,,,jww372 dont guess religion christmasaftermath,8.134006e+17,,...,1427.0,11.0,2016.0,12.0,52.0,26.0,0.0,361.0,1482764801,0.0
4,4,1768260000.0,mrclydepratt,1501987000000.0,,,,shareblue pence lawyers decided official email...,8.940243e+17,,...,847.0,23.0,2017.0,8.0,31.0,6.0,6.0,218.0,1501986984,0.0


In [38]:
import pyLDAvis.gensim
import gensim
pyLDAvis.enable_notebook()

data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
data



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated



## Save the entire html

In [40]:
pyLDAvis.save_html(data, 'russian.html')