In [1]:
import tweepy
import pandas as pd
import numpy as np
import re

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from authorization_keys import consumer_key, consumer_secret, token, token_key

In [2]:
# Build the Tweepy OAuth handler from the consumer key/secret imported from
# the local authorization_keys module (credentials are kept out of the notebook).
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

# Attach the user-level credentials; presumably `token` is the access token
# and `token_key` its secret — confirm against authorization_keys.
auth.set_access_token(token, token_key)

In [3]:
# API client used for every request below. wait_on_rate_limit=True makes
# Tweepy pause until the rate-limit window resets instead of raising part-way
# through the paginated search, which would abort the whole collection.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [4]:
# Collect up to 3000 original (non-retweet) tweets matching MTBPS2019.
# tweet_mode='extended' makes each Status expose the untruncated `full_text`.
# count=100 requests the largest page the standard search endpoint serves,
# so the same tweets are fetched with far fewer API calls.
tweets = list(
    tweepy.Cursor(
        api.search,
        q='MTBPS2019 -filter:retweets',
        tweet_mode='extended',
        count=100,
    ).items(3000)
)

In [5]:
# Number of tweets the search actually returned (can be below the 3000 requested).
len(tweets)

2293

It looks like 2,293 is the maximum number of tweets available for MTBPS2019: the search was capped at 3,000 items but returned only 2,293.

In [19]:
# Inspect one raw Status object to see which fields a tweet carries.
# Index 100 is just an arbitrary sample; it must exist in `tweets`.
tweets[100]

Status(_api=<tweepy.api.API object at 0x000001D454BDCE80>, _json={'created_at': 'Fri Nov 01 12:35:05 +0000 2019', 'id': 1190245877803339780, 'id_str': '1190245877803339780', 'full_text': 'Sobriety is the main message in the @tito_mboweni MTBPS, with alarming expectations of the debt-GDP ratio reaching 81%, says BLSA’s @BusiMavuso2. #MTBPS2019 See https://t.co/V237kTCKyf', 'truncated': False, 'display_text_range': [0, 184], 'entities': {'hashtags': [{'text': 'MTBPS2019', 'indices': [146, 156]}], 'symbols': [], 'user_mentions': [{'screen_name': 'tito_mboweni', 'name': 'Tito Mboweni', 'id': 855262092, 'id_str': '855262092', 'indices': [36, 49]}, {'screen_name': 'BusiMavuso2', 'name': 'Busi Mavuso', 'id': 4928587906, 'id_str': '4928587906', 'indices': [132, 144]}], 'urls': [{'url': 'https://t.co/V237kTCKyf', 'expanded_url': 'http://ow.ly/PTda30pOHVV', 'display_url': 'ow.ly/PTda30pOHVV', 'indices': [161, 184]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': '<

In [41]:
# Screen name of the first account mentioned in the sample tweet
# (raises IndexError if that tweet mentions nobody).
tweets[100].entities['user_mentions'][0]['screen_name']

'tito_mboweni'

In [85]:
# Start from an empty frame; its columns are filled from `tweets` in a later cell.
df = pd.DataFrame()

In [86]:
def clean_tweet(tweet):
    """Return the tweet text with mentions, URLs and punctuation removed.

    Three kinds of span are replaced by a space, then runs of whitespace are
    collapsed to single spaces and the ends are trimmed:

    * ``@handle`` mentions (letters, digits and underscores, so handles such
      as ``@tito_mboweni`` are removed whole rather than leaving ``mboweni``),
    * any character that is not an ASCII letter, digit, space or tab,
    * URLs of the form ``scheme://...``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, single-spaced text (empty string if nothing remains).
    """
    # Raw string so the regex escapes (\w, \S, \t) reach `re` untouched
    # instead of being treated as (invalid) Python string escapes.
    pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return ' '.join(re.sub(pattern, " ", tweet).split())

In [112]:
# One value per collected tweet, in the same order as `tweets`.
df['date'] = [status.created_at for status in tweets]
# NOTE: despite its name, 'id' stores the author's display name (user.name).
df['id'] = [status.user.name for status in tweets]
# Tweet body after mentions, URLs and punctuation have been stripped.
df['text'] = [clean_tweet(status.full_text) for status in tweets]
# Raw mention dicts for the tweet, or None when it mentions nobody.
df['target'] = [status.entities['user_mentions'] or None for status in tweets]

In [128]:
# Preview the first rows; 'target' still holds the raw mention dicts here.
df.head()

Unnamed: 0,date,id,text,target
0,2019-11-07 11:04:45,ADRS,Post MTBPS2019 and the release of the second v...,"[{'screen_name': 'TreasuryRSA', 'name': 'Natio..."
1,2019-11-07 10:25:08,MBI Matseke Business Investment (PTY)LTD,Recap of mtbps2019,
2,2019-11-07 10:09:35,Institute for Economic Justice,Some say austerity will make things worse some...,"[{'screen_name': 'budgetjusticesa', 'name': 'B..."
3,2019-11-07 09:13:42,Equal Education,READ State spending cuts higher taxes doing mo...,
4,2019-11-07 06:40:01,CapeTalk on 567AM,It would be irresponsible if the Reserve Bank ...,


The target column needs tidying so that each row holds only a list of the names of the accounts mentioned in the tweet.

In [129]:
def mentioned_names(target_list):
    """Return the 'name' value of every mention dict in ``target_list``.

    Parameters
    ----------
    target_list : list of dict
        Mention entries, each providing a ``'name'`` key.

    Returns
    -------
    list
        The names, in the same order as the input (empty for an empty input).
    """
    return [mention['name'] for mention in target_list]

In [131]:
# Replace each row's mention dicts with just the mentioned accounts' names;
# rows without mentions stay None.
df['target'] = [
    None if mentions is None else mentioned_names(mentions)
    for mentions in df['target']
]

In [132]:
# Preview again: 'target' should now show plain lists of names.
df.head()

Unnamed: 0,date,id,text,target
0,2019-11-07 11:04:45,ADRS,Post MTBPS2019 and the release of the second v...,[National Treasury]
1,2019-11-07 10:25:08,MBI Matseke Business Investment (PTY)LTD,Recap of mtbps2019,
2,2019-11-07 10:09:35,Institute for Economic Justice,Some say austerity will make things worse some...,[Budget Justice Coalition SA]
3,2019-11-07 09:13:42,Equal Education,READ State spending cuts higher taxes doing mo...,
4,2019-11-07 06:40:01,CapeTalk on 567AM,It would be irresponsible if the Reserve Bank ...,


In [119]:
# Mentions recorded for the sample tweet at position 100.
# NOTE(review): the output shown below was produced at In [119], before the
# cell at In [131] replaced the dicts with names; on a fresh top-to-bottom
# run this cell would display a list of names instead.
df['target'].iloc[100]

[{'screen_name': 'tito_mboweni',
  'name': 'Tito Mboweni',
  'id': 855262092,
  'id_str': '855262092',
  'indices': [36, 49]},
 {'screen_name': 'BusiMavuso2',
  'name': 'Busi Mavuso',
  'id': 4928587906,
  'id_str': '4928587906',
  'indices': [132, 144]}]

In [122]:
# Name of the first account mentioned in the sample tweet at position 100.
# By this point in the notebook 'target' already holds plain name strings
# (the dicts were flattened in an earlier cell), so the element is used
# directly; indexing it with ['name'] would raise TypeError on a fresh run.
df['target'].iloc[100][0]

'Tito Mboweni'

In [None]:
# Spot-check: TextBlob polarity score for the text of the row labelled 0.
TextBlob(df.loc[0, 'text']).sentiment.polarity

In [None]:
def polarity_value(tweet):
    """Reduce a tweet's TextBlob polarity score to a three-way label.

    Parameters
    ----------
    tweet : str
        Text to analyse.

    Returns
    -------
    int
        1 when the polarity is above zero, 0 when it is exactly zero,
        and -1 otherwise.
    """
    score = TextBlob(tweet).sentiment.polarity
    if score > 0:
        return 1
    if score == 0:
        return 0
    return -1

In [None]:
# Label every cleaned tweet with its sentiment class (1, 0 or -1).
df['sentiment'] = df['text'].apply(polarity_value)

In [None]:
# Check the last rows to confirm the new 'sentiment' column is populated.
df.tail()

In [None]:
# Save the labelled tweets to MTBPS.csv in the working directory, leaving out
# the row index.
# NOTE(review): 'target' holds Python lists / None — presumably these are
# written as their string form; confirm before reloading the file.
df.to_csv('MTBPS.csv', index=False)