In [10]:
import pandas as pd
import numpy as np

#nlp
import spacy
import re
from textblob import TextBlob

# The stop-word set is imported from the public `sklearn.feature_extraction.text`
# module; the private `sklearn.feature_extraction.stop_words` path used previously
# is not importable on newer scikit-learn releases.
# NOTE(review): this `stopwords` alias is rebound by the `nltk.corpus` import
# further down, so the scikit-learn set is never reachable under this name.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords

#LDA / topical modeling
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# From here on `stopwords` refers to the NLTK corpus reader; `stop` is the
# English stop-word list used by the cleaning cells.
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Silences every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

#Visualizations
import plotly

import pyLDAvis
#import pyLDAvis.graphlab

pyLDAvis.enable_notebook()

In [11]:
def add_datepart(df, fldname, drop=True, time=True):
    """Expand a date column of `df` into numeric date-part columns, in place.

    New columns are named `<prefix><Part>`, where `<prefix>` is `fldname` with a
    trailing 'date' / 'Date' removed (so 'saledate' yields 'saleYear', while
    'created_str' yields 'created_strYear').

    Parameters:
    -----------
    df: A pandas data frame. It is modified in place and gains the columns
        Year, Month, Week, Day, Dayofweek, Dayofyear and Elapsed, plus
        Hour, Minute and Second when `time` is true.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not already a datetime column (timezone-naive or timezone-aware),
        it is converted with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true the time features Hour, Minute, Second will be added.

    Returns:
    --------
    None. All changes are applied to `df` itself.

    Notes:
    ------
    Week is the ISO week number. Elapsed is whole seconds since
    1970-01-01T00:00:00 (UTC for timezone-aware columns).

    Examples:
    ---------
    >>> df = pd.DataFrame({'A': pd.to_datetime(['2000-03-11', '2000-03-12', '2000-03-13'])})
    >>> add_datepart(df, 'A', time=False)
    >>> df
       AYear  AMonth  AWeek  ADay  ADayofweek  ADayofyear   AElapsed
    0   2000       3     10    11           5          71  952732800
    1   2000       3     10    12           6          72  952819200
    2   2000       3     11    13           0          73  952905600
    """
    fld = df[fldname]
    # `is_datetime64_any_dtype` also accepts timezone-aware columns and does not
    # raise on pandas extension dtypes, unlike `np.issubdtype(fld.dtype, ...)`.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)

    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # `Series.dt.week` no longer exists in current pandas; `isocalendar()`
            # is its replacement. Cast to the dtype of the sibling date parts so
            # the column type matches what `.dt.week` used to return.
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values

    # Normalise to nanoseconds before dividing, so the result is in seconds
    # whatever resolution the column is stored in. `.values` yields UTC
    # instants for timezone-aware data.
    epoch_ns = fld.values.astype('datetime64[ns]').astype(np.int64)
    df[targ_pre + 'Elapsed'] = epoch_ns // 10 ** 9
    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [75]:
# Load the raw user and tweet tables from the project data directory.
users = pd.read_csv('../../finalprojdata/users.csv')
tweets = pd.read_csv('../../finalprojdata/tweets.csv')

In [76]:
# Keep only the first row seen for each user id.
users = users.drop_duplicates(subset=['id'])

In [77]:
# Number of distinct users left after de-duplication.
users.shape[0]

394

In [78]:
# Peek at the first three tweet rows.
tweets.iloc[:3]

Unnamed: 0,user_id,user_key,created_at,created_str,retweet_count,retweeted,favorite_count,text,tweet_id,source,hashtags,expanded_urls,posted,mentions,retweeted_status_id,in_reply_to_status_id
0,1868981000.0,ryanmaxwell_1,1458672000000.0,2016-03-22 18:31:42,,,,#IslamKills Are you trying to say that there w...,7.12346e+17,,"[""IslamKills""]",[],POSTED,[],,
1,2571870000.0,detroitdailynew,1476133000000.0,2016-10-10 20:57:00,0.0,False,0.0,"Clinton: Trump should’ve apologized more, atta...",7.855849e+17,"<a href=""http://twitterfeed.com"" rel=""nofollow...",[],"[""http://detne.ws/2e172jF""]",POSTED,[],,
2,1710805000.0,cookncooks,1487767000000.0,2017-02-22 12:43:43,,,,RT @ltapoll: Who was/is the best president of ...,8.343832e+17,,[],[],POSTED,[],,


In [79]:
# Peek at the first three user rows.
users.iloc[:3]

Unnamed: 0,id,location,name,followers_count,statuses_count,time_zone,verified,lang,screen_name,description,created_at,favourites_count,friends_count,listed_count
0,18710816.0,near Utah Ave & Lighthouse an,Robby Delaware,304.0,11484.0,Pacific Time (US & Canada),False,en,RobbyDelaware,"I support the free movement of people, ideas a...",Wed Jan 07 04:38:02 +0000 2009,17.0,670.0,13.0
1,100345056.0,still ⬆️Block⤵️Corner⬇️street,#Ezekiel2517✨...,1053.0,31858.0,,False,en,SCOTTGOHARD,CELEBRITY TRAINER ✨#424W147th✨ #CrossfitCoach ...,Tue Dec 29 23:15:22 +0000 2009,2774.0,1055.0,35.0
2,247165706.0,"Chicago, IL",B E C K S T E R✨,650.0,6742.0,Mountain Time (US & Canada),False,en,Beckster319,Rebecca Lynn Hirschfeld Actress.Model.Writer.A...,Fri Feb 04 06:38:45 +0000 2011,7273.0,896.0,30.0


In [80]:
# Remove the id/URL columns and the tweet-level `created_at` (epoch value;
# `created_str` holds the same timestamp as text). Dropping `created_at` here also
# keeps it from clashing with the users table's own `created_at` in the merge.
# The name is rebound instead of mutating in place, and errors='ignore' lets the
# cell be re-run after the columns are already gone without raising a KeyError.
tweets = tweets.drop(
    columns=['tweet_id', 'retweeted_status_id', 'in_reply_to_status_id', 'created_at', 'expanded_urls'],
    errors='ignore',
)

In [81]:
# Attach each tweet author's profile columns; tweets with no matching user are kept
# (left join) and get nulls in the user columns.
fulltweets = pd.merge(tweets, users, how='left', left_on='user_id', right_on='id')

In [84]:
# Keep only rows where every one of these fields is present.
required_fields = ['user_id', 'created_str', 'friends_count', 'time_zone']
fulltweets = fulltweets.dropna(subset=required_fields)

In [85]:
# Row count after filtering out incomplete records.
fulltweets.shape[0]

185160

In [86]:
# Per-column null counts of the merged, filtered frame. The previous
# `fulltweets3` name is never defined in this notebook, so the cell raised a
# NameError on a fresh kernel.
fulltweets.isnull().sum()

user_id                  0
user_key                 0
created_str              0
retweet_count       134149
retweeted           134149
favorite_count      134149
text                     0
source              134149
hashtags                 0
posted                   0
mentions                 0
id                       0
location             20155
name                     0
followers_count          0
statuses_count           0
time_zone                0
verified                 0
lang                     0
screen_name              0
description          11813
created_at               0
favourites_count         0
friends_count            0
listed_count             0
dtype: int64

In [87]:
# Expand the `created_str` timestamp into date/time part columns. This mutates
# `fulltweets` in place and, with the default drop=True, removes `created_str`,
# so the cell cannot be run twice against the same frame.
add_datepart(df=fulltweets, fldname='created_str')

In [88]:
## Create a sentiment column
def analyze_sentiment(tweet):
    """Score a tweet's polarity with TextBlob.

    Wraps the text in a `TextBlob` and returns the value it reports as
    `sentiment.polarity`.
    """
    return TextBlob(tweet).sentiment.polarity

In [89]:
# Clean a string copy of the tweet text so `fulltweets` itself stays untouched.
tweetsonly2 = fulltweets.text.copy().astype(str)
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged. Raw strings
# avoid invalid-escape warnings for `\w` / `\s`.
# 1) Remove every character that is neither a word character nor whitespace.
tweetsonly2 = tweetsonly2.str.replace(r'[^\w\s]', '', regex=True)
# 2) Turn carriage returns, newlines, tabs, pipes and underscores into spaces.
tweetsonly2 = tweetsonly2.str.replace(r'[\r\n\t_|]', ' ', regex=True)
tweetsonly2 = tweetsonly2.str.strip()

# Separate frame holding the cleaned text alongside all other columns.
fulltweets2 = fulltweets.copy()
fulltweets2.text = tweetsonly2

In [90]:
# Treat the retweet marker as a stop word. The guard keeps re-runs of this cell
# from appending 'rt' to the shared `stop` list again and again.
if 'rt' not in stop:
    stop.append('rt')
# Set membership is O(1) per word instead of a scan of the whole list.
stop_lookup = set(stop)

# Remove stop words (compared case-insensitively) while keeping word order.
fulltweets2.text = fulltweets2.text.apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_lookup)
)
# NOTE(review): polarity is scored on text that has already lost punctuation and
# stop words -- confirm that scoring the cleaned text (not the raw tweet) is intended.
fulltweets2['Sentiment'] = np.array(
    [analyze_sentiment(str(tweet)) for tweet in fulltweets2.text.values]
)

In [91]:
# Lower-case the text and discard tokens of three characters or fewer.
fulltweets2.text = fulltweets2.text.apply(
    lambda text: ' '.join(token.lower() for token in text.split() if len(token) > 3)
)

In [92]:
# Persist the cleaned frame for later use.
# NOTE(review): absolute, user-specific output path -- consider a configurable
# output directory so the notebook runs on other machines.
fulltweets2.to_csv(path_or_buf='/Users/shsu/Downloads/fulltweets2.csv')