In [1]:
import re
import tweepy
import pandas as pd
from textblob import TextBlob
from sklearn import preprocessing
import plotly.offline as offline
offline.init_notebook_mode(connected=True)

In [20]:
def write_FROM_user_tweets_to_file(screen_name):

    print('Writing to file for '+screen_name)
    
    # fetch tweets
    tweets = api.user_timeline(screen_name, count=200)
    
    try:
        
        # open file
        with open('from_user_tweets_celebrities.csv', 'a') as myfile:
                
            # loop through tweets
            for tweet in tweets:
                
                # check if tweet in file
                if str(tweet.id) not in open('from_user_tweets_celebrities.csv').read():
                                        
                    s = '{},{},{},{},{},{}\n'.format(tweet.id,
                                                     tweet.created_at,
                                                     tweet.favorite_count,
                                                     tweet.retweet_count,
                                                     tweet.user.screen_name, 
                                                     tweet.user.followers_count)        
                    # write to file
                    myfile.write(s)
                
    except tweepy.TweepError as e:
            # print error (if any)
            print("Error : " + str(e))
            
def get_df_FROM_user_tweets():
    
    # read in data
    from_user_df = pd.read_csv('from_user_tweets_celebrities.csv', header=None, engine='python')

    # name columns
    from_user_df.columns = ['tweet_id','created_at','favorite_count','retweet_count','screen_name','followers_count']

    # set tweet_id as index
    from_user_df.set_index('tweet_id', inplace=True)

    # remove duplicate tweets
    from_user_df = from_user_df[~from_user_df.index.duplicated(keep='first')]
    
    # RFR
    from_user_df['rfr'] = (from_user_df['favorite_count']+from_user_df['retweet_count'])/from_user_df['followers_count']
        
    # screen_name index for plotting colors
    le = preprocessing.LabelEncoder()
    le.fit(from_user_df['screen_name'])
    from_user_df['screen_name_ind'] = le.transform(from_user_df['screen_name'])
        
    # normalize
    tmp = from_user_df.drop(['created_at','screen_name','screen_name_ind'],axis=1)
    normalized = (tmp-tmp.min())/(tmp.max()-tmp.min())
    normalized['screen_name']=from_user_df['screen_name']
    normalized['screen_name_ind']=5*from_user_df['screen_name_ind']
    normalized['created_at']=from_user_df['created_at']
    
    return normalized

In [27]:
def clean_tweet(tweet):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def write_ABOUT_user_tweets_to_file(screen_name):
    
    print('Writing to file for '+screen_name)
    
    # fetch tweets
    tweets = api.search(screen_name, lang='en', count=50)
    
    try:
        
        # open file
        with open('about_user_tweets_celebrities.csv', 'a') as myfile:
                
            # loop through tweets
            for tweet in tweets:
                
                clean_text = clean_tweet(tweet.text)
                analysis = TextBlob(clean_text)
                polarity = analysis.sentiment.polarity
                
                # make sure tweet not already in file
                if str(tweet.id) not in open('about_user_tweets_celebrities.csv').read():
                                        
                    s = '{},{},{},{},{},{},{},{}\n'.format(tweet.id, 
                                                        tweet.created_at, 
                                                        clean_text, 
                                                        polarity, 
                                                        polarity*(0.5*tweet.favorite_count+tweet.retweet_count),
                                                        tweet.favorite_count, 
                                                        tweet.retweet_count, 
                                                        screen_name)        
                    # write to file
                    myfile.write(s)
                
    except tweepy.TweepError as e:
            # print error (if any)
            print("Error : " + str(e))
            
def get_df_ABOUT_user_tweets():
    
    # read in data
    about_user_df = pd.read_csv('about_user_tweets_celebrities.csv', header=None)

    # name columns
    about_user_df.columns = ['tweet_id','created_at','text','polarity_ind','polarity_agg','favorite_count','retweet_count','screen_name']

    # set tweet_id as index
    about_user_df.set_index('tweet_id', inplace=True)

    # remove duplicate tweets
    about_user_df = about_user_df[~about_user_df.index.duplicated(keep='first')]
    
    return about_user_df

In [22]:
def ternary_plot_all(df, title):

    data = [{ 
        'type': 'scatterternary',
        'mode': 'markers',
        'opacity': .5,
        'a': df['retweet_count'],
        'b': df['rfr'],
        'c': df['favorite_count'],
        'text': df['screen_name'],
        'marker': {
            'symbol': 'o',
            'color': df['screen_name_ind'],
            'autocolorscale': True,
            'size': 14
        }
    }]

    layout = {
        'ternary': {
            'sum': 1,
            'aaxis': makeAxis('Retweets'),
            'baxis': makeAxis('<br>RFR'),
            'caxis': makeAxis('<br>Favorites')
        },
        'annotations': [{
            'showarrow': False,
            'text': title,
            'x': 0.5,
            'y': 1.3,
            'font': { 'size': 35 }
        }],
    }

    fig = {'data': data, 'layout': layout}
    offline.iplot(fig, validate=False)
    
def ternary_plot_avg(df, title):

    data = [{ 
        'type': 'scatterternary',
        'mode': 'markers',
        'opacity': .5,
        'a': df['retweet_count'],
        'b': df['rfr'],
        'c': df['favorite_count'],
        'text': df.index,
        'marker': {
            'symbol': 'o',
            'color': df['screen_name_ind'],
            'autocolorscale': True,
            'size': 14
        }
    }]

    layout = {
        'ternary': {
            'sum': 1,
            'aaxis': makeAxis('Retweets'),
            'baxis': makeAxis('<br>RFR'),
            'caxis': makeAxis('<br>Favorites')
        },
        'annotations': [{
            'showarrow': False,
            'text': title,
            'x': 0.5,
            'y': 1.3,
            'font': { 'size': 35 }
        }],
    }

    fig = {'data': data, 'layout': layout}
    offline.iplot(fig, validate=False)
    
def makeAxis(title): 
    return {
      'title': title,
      'titlefont': { 'size': 20 },
      'tickfont': { 'size': 15 },
      'tickcolor': 'rgba(0,0,0,0)',
      'ticklen': 5,
      'showline': True,
      'showgrid': True
    }

### Tweepy Setup

In [23]:
consumer_key = 'n9LMcL7CRMtaTY5TXMp1VfIKo'
consumer_secret = 'G0ghn8E8TJPCKl29AfmA4019U1hq6NhGQFoMsJ05CARnmkeE7U'
access_token = '1959972582-gfpDYaAbKj7c412HOalcL0jQv0QdhJtgwZguXjl'
access_token_secret = 'nZJVEdDSHsCZvV8dvRtXBjOoDIzeKOSKyvtaavjeV5ARK'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api  = tweepy.API(auth)

# Tweets FROM User

In [24]:
people = pd.read_csv('people_list.csv')

In [25]:
celeb_screen_names = list(people[people['industry']=='celebrity']['screen_name'])
celeb_screen_names

['TheRock',
 'tomhanks',
 'prattprattpratt',
 'LeoDiCaprio',
 'jimmyfallon',
 'HereIsGina',
 'EmmaWatson',
 'AnnaKendrick',
 'TheEllenShow',
 'Oprah',
 'Caitlyn_Jenner',
 'DrOz',
 'piersmorgan',
 'carliesheen',
 'KevinSpacey',
 'chelseahandler',
 'lindsaylohan',
 'KimKardashian',
 'GwynethPaltrow',
 '_KrisJStewart']

In [26]:
[write_FROM_user_tweets_to_file(screen_name) for screen_name in celeb_screen_names]

Writing to file for TheRock
Writing to file for tomhanks
Writing to file for prattprattpratt
Writing to file for LeoDiCaprio
Writing to file for jimmyfallon
Writing to file for HereIsGina
Writing to file for EmmaWatson
Writing to file for AnnaKendrick
Writing to file for TheEllenShow
Writing to file for Oprah
Writing to file for Caitlyn_Jenner
Writing to file for DrOz
Writing to file for piersmorgan
Writing to file for carliesheen
Writing to file for KevinSpacey
Writing to file for chelseahandler
Writing to file for lindsaylohan
Writing to file for KimKardashian
Writing to file for GwynethPaltrow
Writing to file for _KrisJStewart


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [28]:
from_user_df = get_df_FROM_user_tweets()

In [29]:
from_user_df.head()

Unnamed: 0_level_0,favorite_count,retweet_count,followers_count,rfr,screen_name,screen_name_ind,created_at
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
981304846971097088,0.002392,0.000393,0.16471,0.000328,TheRock,50,2018-04-03 22:58:01
981279790534557696,0.000526,7e-05,0.16471,7.1e-05,TheRock,50,2018-04-03 21:18:28
981279645575168000,0.0,0.000412,0.16471,3.5e-05,TheRock,50,2018-04-03 21:17:53
981276183705759744,0.00803,0.002104,0.16471,0.001168,TheRock,50,2018-04-03 21:04:08
981275604233408512,0.002018,0.00032,0.16471,0.000276,TheRock,50,2018-04-03 21:01:49


### Plotting

In [30]:
ternary_plot_all(from_user_df, 'Celebrities')

In [None]:
from_user_df_ave = from_user_df.groupby('screen_name').mean()
ternary_plot_avg(from_user_df_ave, 'Musicians')

# Tweets ABOUT User

In [12]:
[write_ABOUT_user_tweets_to_file(screen_name) for screen_name in celeb_screen_names]

Writing to file for TheRock
Writing to file for tomhanks
Writing to file for prattprattpratt
Writing to file for LeoDiCaprio
Writing to file for jimmyfallon
Writing to file for HereIsGina
Writing to file for EmmaWatson
Writing to file for AnnaKendrick
Writing to file for TheEllenShow
Writing to file for Oprah
Writing to file for Caitlyn_Jenner
Writing to file for DrOz
Writing to file for piersmorgan
Writing to file for carliesheen
Writing to file for KevinSpacey
Writing to file for chelseahandler
Writing to file for lindsaylohan
Writing to file for KimKardashian
Writing to file for GwynethPaltrow
Writing to file for _KrisJStewart


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [13]:
about_user_df = get_df_ABOUT_user_tweets()

In [14]:
about_user_df.head(10)

Unnamed: 0_level_0,created_at,text,polarity_ind,polarity_agg,favorite_count,retweet_count,screen_name
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
981344943938637826,2018-04-04 01:37:21,RT The busiest man in show business is,0.0,0.0,0,4430,TheRock
981344917959200768,2018-04-04 01:37:15,RT If you had asked me a year ago to guess whi...,-0.133333,-8.666667,0,65,TheRock
981344877962182656,2018-04-04 01:37:06,RT s new movie has a big gorilla but this litt...,0.037216,13.583807,0,365,TheRock
981344834987483137,2018-04-04 01:36:55,RT Hey Have you ever felt this way What do you...,0.0,0.0,0,5,TheRock
981344817950220288,2018-04-04 01:36:51,RT and The Pebble,0.0,0.0,0,7297,TheRock
981344805564436482,2018-04-04 01:36:48,RT and The Pebble,0.0,0.0,0,7297,TheRock
981344706226466817,2018-04-04 01:36:25,RT and The Pebble,0.0,0.0,0,7297,TheRock
981344664975527936,2018-04-04 01:36:15,Thank you Been through highs and lows like eve...,0.5,0.0,0,0,TheRock
981344657190866945,2018-04-04 01:36:13,I hope your ability to talk about your mental ...,-0.130556,-0.0,0,0,TheRock
981344646709370880,2018-04-04 01:36:10,RT Pure Greatness between amp,0.214286,129.642857,0,605,TheRock


# BREAK BREAK BREAK