In [48]:
from tweepy import API, Cursor, OAuthHandler, Stream
from tweepy.streaming import StreamListener
import creds
import json
import sys
import re
import sqlite3

class TwitterClient():
    """Convenience wrapper around an authenticated tweepy ``API`` object.

    Parameters
    ----------
    twitter_user : optional
        Passed as ``id`` to ``user_timeline`` by
        :meth:`get_user_timeline_tweets`. Defaults to ``None``.
    """

    def __init__(self, twitter_user=None):
        self.auth = TwitterAuthenicator().authenticate_twitter_app()
        self.twitter_client = API(self.auth)
        self.twitter_user = twitter_user

        # Filled by get_user_timeline_tweets(); it is never cleared, so
        # repeated calls keep accumulating into the same list.
        self.tweets = []

    def get_twitter_client_api(self):
        """Return the underlying ``API`` instance."""
        return self.twitter_client

    def get_user_timeline_tweets(self, num_tweets):
        """Append up to ``num_tweets`` timeline items to ``self.tweets`` and return that list."""
        timeline = Cursor(self.twitter_client.user_timeline, id=self.twitter_user)
        self.tweets.extend(timeline.items(num_tweets))
        return self.tweets

    def get_friend_list(self, num_friends):
        """Return up to ``num_friends`` items from the ``friends`` endpoint as a list."""
        return list(Cursor(self.twitter_client.friends).items(num_friends))

    def get_woeid_of_trending_tweets(self):
        """Write the result of ``trends_available()`` to ``woeid_trends.json``.

        The file is overwritten on every call. Appending (the previous
        behaviour) put several JSON documents back to back, which
        ``json.load`` cannot parse.
        """
        # Fetch first so a failing API call does not truncate an existing file.
        trends = self.twitter_client.trends_available()
        with open('woeid_trends.json', 'w') as tf:
            json.dump(trends, tf)
    


class TwitterAuthenicator():
    """Builds the OAuth handler from the values stored in the ``creds`` module."""

    def authenticate_twitter_app(self):
        """Return an ``OAuthHandler`` with consumer keys and access token set."""
        handler = OAuthHandler(creds.CONSUMER_API_KEY, creds.CONSUMER_SECRET)
        handler.set_access_token(creds.ACCESS_TOKEN, creds.ACCESS_TOKEN_SECRET)
        return handler

class TwitterStreamer():
    """Starts a filtered stream whose raw payloads are handled by ``StdOutListener``."""

    def __init__(self):
        self.twiiter_authenicator = TwitterAuthenicator()

    def stream_tweets(self, fname, hashtag_list):
        """Stream items matching ``hashtag_list``.

        ``fname`` is handed to ``StdOutListener``; ``hashtag_list`` is passed
        as ``track`` to ``Stream.filter``.
        """
        sink = StdOutListener(fname)
        credentials = self.twiiter_authenicator.authenticate_twitter_app()
        Stream(credentials, sink).filter(track=hashtag_list)
        
class StdOutListener(StreamListener):
    """Stream listener that prints each raw payload and appends it to a file.

    Parameters
    ----------
    fname : str
        Path of the file that received payloads are appended to.
    """

    def __init__(self, fname):
        self.fname = fname

    def on_data(self, data):
        """Print ``data`` and append it to ``self.fname``; always returns True."""
        try:
            print(data)
            # Must be self.fname: a bare `fname` is not defined in this scope,
            # so every write used to fail with a swallowed NameError.
            with open(self.fname, 'a') as tf:
                tf.write(data)
        except Exception as e:
            # Best effort: report the problem and keep the stream running.
            # Exception (not BaseException) so KeyboardInterrupt still stops it.
            print("error on data %s" % str(e))

        return True

    def on_error(self, status):
        """Return False on status 420; otherwise print the status."""
        if status == 420:
            # Return false if we reach rate limit
            return False
        print(status)

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from textblob import TextBlob
import re


class TweetAnalyzer():
    """Turn a user's timeline into a scored DataFrame and persist it to SQLite.

    Text cleaning is regex based, polarity and subjectivity come from
    TextBlob, and rows are written to the ``tweets`` table of ``tweets.db``.

    Parameters
    ----------
    name : str
        Screen name whose timeline :meth:`user_feed_to_database` fetches.
    """

    def __init__(self, name):
        self.twitter_client = TwitterClient()
        self.api = self.twitter_client.get_twitter_client_api()
        self.name = name

    def clean_tweet(self, tweet):
        """Return ``tweet`` without @mentions, URLs and non-alphanumeric characters.

        Runs of whitespace are collapsed to single spaces.
        """
        # Raw string: same pattern as before, but "\w", "\/" and "\S" are no
        # longer invalid escape sequences inside a normal string literal.
        pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
        return ' '.join(re.sub(pattern, " ", tweet).split())

    def analyze_sentiment(self, tweet):
        """Return the TextBlob polarity of the cleaned ``tweet``."""
        analysis = TextBlob(self.clean_tweet(tweet))
        return analysis.sentiment.polarity

    def analyze_subjectivity(self, tweet):
        """Return the TextBlob subjectivity of the cleaned ``tweet``."""
        analysis = TextBlob(self.clean_tweet(tweet))
        return analysis.sentiment.subjectivity

    def user_feed_to_database(self):
        """Fetch up to 200 timeline tweets for ``self.name``, analyse them and store them."""
        tweets = self.api.user_timeline(screen_name=self.name, count=200, tweet_mode="extended")
        df = self.analyse_tweets(tweets)
        self.tweets_to_db(tweets, df)

    def tweets_to_df(self, tweets):
        """Build one DataFrame row per tweet, then drop tweets containing the word ``RT``.

        Each item of ``tweets`` must expose ``full_text``, ``id``,
        ``created_at``, ``favorite_count`` and ``retweet_count``.

        Returns
        -------
        pandas.DataFrame
            Columns: tweets (cleaned text), raw_tweet, id, date_created,
            likes, retweet_count, len, sentiment, subjectivity, tweet_date,
            tweet_day_of_week, tweet_hour. Returned as an independent copy so
            callers can add columns without SettingWithCopy warnings.
        """
        df = pd.DataFrame(data=[self.clean_tweet(tweet.full_text) for tweet in tweets], columns=['tweets'])
        df['raw_tweet'] = np.array([tweet.full_text for tweet in tweets])
        df['id'] = np.array([tweet.id for tweet in tweets])
        df['date_created'] = np.array([tweet.created_at for tweet in tweets])
        df['date_created'] = pd.to_datetime(df['date_created'])
        df['likes'] = np.array([tweet.favorite_count for tweet in tweets])
        df['retweet_count'] = np.array([tweet.retweet_count for tweet in tweets])
        df['len'] = np.array([len(tweet.full_text) for tweet in tweets])
        df['sentiment'] = np.array([self.analyze_sentiment(tweet) for tweet in df['tweets']])
        df['subjectivity'] = np.array([self.analyze_subjectivity(tweet) for tweet in df['tweets']])
        df['tweet_date'] = df['date_created'].dt.date
        # `.dt.weekday_name` was removed in pandas 1.0; day_name() replaces it.
        df['tweet_day_of_week'] = df['date_created'].dt.day_name()
        df['tweet_hour'] = df['date_created'].dt.hour
        # Match "RT" as a whole word only; a plain substring test also threw
        # away tweets that merely contain those letters (e.g. "SMART").
        is_retweet = df['tweets'].str.contains(r"\bRT\b")
        return df[~is_retweet].copy()

    def tweets_to_db(self, tweets, df):
        """Upsert the rows of ``df`` into the ``tweets`` table of ``tweets.db``.

        Rows whose id is already stored only get ``likes`` and
        ``retweet_count`` refreshed; all other rows are inserted. ``user`` and
        ``user_id`` are taken from ``tweets[0].user``. The table must already
        exist.
        """
        conn = sqlite3.connect("tweets.db")
        try:
            cur = conn.cursor()
            cur.execute("SELECT id from tweets")
            known_ids = {stored[0] for stored in cur.fetchall()}
            for _, row in df.iterrows():
                # Convert to plain Python numbers so sqlite3 binds them as
                # INTEGER/REAL even if pandas hands back numpy scalars.
                tweet_id = int(row['id'])
                likes = int(row['likes'])
                retweets = int(row['retweet_count'])
                if tweet_id in known_ids:
                    cur.execute(
                        """UPDATE tweets SET likes = ?, retweet_count = ? WHERE id = ?""",
                        (likes, retweets, tweet_id))
                else:
                    cur.execute(
                        """ INSERT OR IGNORE INTO
                        tweets(user, user_id, raw_tweet, tweets, id, date_created, likes, retweet_count, len, sentiment, subjectivity, tweet_date, tweet_day_of_week, tweet_hour)
                        values(?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
                        (
                            tweets[0].user.screen_name,
                            # The author's id; previously the id of the first
                            # *tweet* was stored in the user_id column.
                            tweets[0].user.id,
                            row['raw_tweet'],
                            row['tweets'],
                            tweet_id,
                            row['date_created'].strftime("%d-%b-%Y (%H:%M:%S.%f)"),
                            likes,
                            retweets,
                            int(row['len']),
                            float(row['sentiment']),
                            float(row['subjectivity']),
                            str(row['tweet_date']),
                            row['tweet_day_of_week'],
                            int(row['tweet_hour']),
                        ))
            conn.commit()
        finally:
            # Release the database file even when a statement fails.
            conn.close()

    def analyse_tweets(self, tweets):
        """Return :meth:`tweets_to_df` output extended with summary columns.

        The ``av_*`` columns hold one aggregate value repeated on every row.
        ``most_liked_tweet``, ``least_liked_tweet`` and
        ``most_retweeted_tweet`` are assigned by index alignment from a
        one-row selection, so the text appears only on the first matching row
        and is NaN elsewhere.
        """
        df = self.tweets_to_df(tweets)
        df['av_len_tweet'] = df['len'].mean()
        df['av_tweets_per_day'] = df.groupby('tweet_date')['tweets'].count().mean()
        df['av_sentiment'] = df['sentiment'].mean()
        df['av_subjectivity'] = df['subjectivity'].mean()

        likes_by_text = df.groupby(['tweets'])['likes']

        most_liked = likes_by_text.transform('max') == df['likes'].max()
        df['most_liked_tweet'] = df[most_liked]['tweets'].iloc[:1]

        # 'min' here: comparing the per-text maximum with the global minimum
        # misses a least-liked tweet whose text also occurs with more likes.
        least_liked = likes_by_text.transform('min') == df['likes'].min()
        df['least_liked_tweet'] = df[least_liked]['tweets'].iloc[:1]

        most_retweeted = df.groupby(['tweets'])['retweet_count'].transform('max') == df['retweet_count'].max()
        df['most_retweeted_tweet'] = df[most_retweeted]['tweets'].iloc[:1]
        return df

In [54]:
# Write each listed user's recent timeline to the database.
users = ['elonmusk', 'neiltyson', 'rickygervais', 'realDonaldTrump', 'TheNotoriousMMA']

for user in users:
    # user_feed_to_database() has no return statement, so binding its result
    # to a name only stored None; call it for its side effect instead.
    TweetAnalyzer(user).user_feed_to_database()

In [32]:
## Manual check of TweetAnalyzer against a single account.
# Fetch up to 200 extended-mode tweets directly, then run the analysis on them.
# Note: TweetAnalyzer.__init__ builds its own TwitterClient, so `tc`/`api`
# here are a second, separate client used only for the fetch below.


tc = TwitterClient()
api = tc.get_twitter_client_api()
tweets = api.user_timeline(screen_name="realDonaldTrump", count=200, tweet_mode="extended")
t_a = TweetAnalyzer("realDonaldTrump")

# df_test: per-tweet rows plus the summary columns added by analyse_tweets().
df_test = t_a.analyse_tweets(tweets)



In [34]:
# Re-run the "most liked" selection from analyse_tweets() on df_test.
# max_idx marks rows whose per-text maximum of likes equals the overall maximum.
max_idx = df_test.groupby(['tweets'])['likes'].transform(max) == df_test['likes'].max()
# The right-hand side is a one-row Series, so the assignment aligns on the
# index: only that row receives the text, every other row becomes NaN.
df_test['most_liked_tweet_test']= df_test[max_idx]['tweets'][:1]

## Understand replies to tweets

In [None]:
replies = []
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

# NOTE(review): `tweets_em` is not defined in any visible cell; presumably it
# is a list of elonmusk statuses fetched earlier. Define it before running.

timeline_ids = []
candidate_replies = []
try:
    for status in Cursor(api.user_timeline, screen_name='elonmusk', timeout=999999).items(10):
        timeline_ids.append(status.id_str)
    # The search query does not depend on the timeline tweet, so run it once
    # instead of once per timeline tweet (which issued the same request 10x).
    for tweet in Cursor(api.search, q='to:elonmusk', since_id=tweets_em[0].id, result_type='recent', timeout=999999).items(1000):
        candidate_replies.append(tweet)
except Exception as e:
    # Exception rather than BaseException so interrupting the kernel still works.
    # Not every failure is a rate limit, so the message no longer claims that.
    print("Error while fetching replies (possibly rate limit): ", e)

# Match in memory, keeping the original grouping: replies are appended per
# timeline tweet, in timeline order. Whatever was fetched before an error is
# still matched.
for status_id in timeline_ids:
    for tweet in candidate_replies:
        if getattr(tweet, 'in_reply_to_status_id_str', None) == status_id:
            replies.append(tweet.text)
    

## Get trending hashtags by country

WOEID (Where On Earth ID) for each country:

* US: 23424977
* UK: 23424975
* Aus: 23424748
* Canada: 23424775
* NZ: 23424916

## Create Wordcloud for twitter user.

In [None]:
from wordcloud import (WordCloud, get_single_color_func, STOPWORDS, ImageColorGenerator)
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
class GroupedColorFunc(object):
    """Colour function that picks a shade per word from a colour -> words mapping.

    Every colour gets its own ``wordcloud.get_single_color_func`` callable;
    a word is coloured by the first colour whose word set contains it.

    Parameters
    ----------
    color_to_words : dict(str -> list(str))
      Maps a colour to the words that should be drawn in shades of it.
    default_color : str
      Colour used for words that appear in none of the word lists.
    """

    def __init__(self, color_to_words, default_color):
        pairs = []
        for color, words in color_to_words.items():
            # Sets make the per-word membership test cheap.
            pairs.append((get_single_color_func(color), set(words)))
        self.color_func_to_words = pairs

        self.default_color_func = get_single_color_func(default_color)

    def get_color_func(self, word):
        """Return the colour callable for ``word``, or the default one if unmapped."""
        for color_func, words in self.color_func_to_words:
            if word in words:
                return color_func
        return self.default_color_func

    def __call__(self, word, **kwargs):
        chosen = self.get_color_func(word)
        return chosen(word, **kwargs)

In [None]:
# Cleaned tweet texts as a numpy array.
# NOTE(review): `df` is not defined in any visible cell (only `df_test` is);
# presumably it is the output of analyse_tweets() for the user being plotted.
text = df['tweets'].values

In [None]:
# Bucket the tweets by TextBlob polarity in a single pass, so polarity is
# computed once per tweet instead of three times.
positive_text, negative_text, neutral_text = [], [], []
for tweet_text in text:
    polarity = TextBlob(tweet_text).sentiment.polarity
    if polarity > 0:
        positive_text.append(tweet_text)
    elif polarity < 0:
        negative_text.append(tweet_text)
    elif polarity == 0:
        neutral_text.append(tweet_text)

In [None]:
# Load ./twitter.png as a numpy array; it is passed as `mask` to WordCloud below.
twitter = np.array(Image.open('./twitter.png'))


In [None]:
# wordcloud's built-in stop words plus two tokens to exclude from the cloud.
stopwords = set(STOPWORDS)
stopwords.update(("RT", "amp"))

In [None]:
# Join the tweets into one string. str(text) would pass the numpy array's
# repr instead: brackets, quote characters and, for long arrays, a "..."
# where most of the tweets should be.
wc = WordCloud(
    mask=twitter,
    width=5000,
    height=4000,
    contour_width=0.3,
    contour_color="#1da1f2",
    background_color='white',
    stopwords=stopwords,
).generate(" ".join(text))

In [None]:
# GroupedColorFunc looks up individual words, so hand it the words of each
# sentiment bucket rather than whole tweets (a whole tweet almost never equals
# a single cloud word, which left everything in the default colour).
# A word that occurs in several buckets gets the first matching colour, i.e.
# positive before negative before neutral.
color_to_words = {
    "#1da1f2": [word for tweet_text in positive_text for word in tweet_text.split()],
    "#14171a": [word for tweet_text in negative_text for word in tweet_text.split()],
    "#657786": [word for tweet_text in neutral_text for word in tweet_text.split()]
}

# Colour for words that belong to none of the buckets.
default_color = "#AAb8C2"

grouped_color_func = GroupedColorFunc(color_to_words, default_color)

In [None]:
# Re-colour the generated cloud using the sentiment-based colour function.
wc.recolor(color_func=grouped_color_func)

In [None]:
# Render the word cloud without axes on a 40x30 inch figure.
fig = plt.figure(figsize = (40,30))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

# Save the cloud as ./data/img/<screen_name>.png.
# NOTE(review): `tweets_em` is not defined in any visible cell, and nothing
# here creates ./data/img/ -- confirm both exist before running.
wc.to_file('./data/img/'+ tweets_em[0].user.screen_name + '.png')

# Topic Modelling Section

In [None]:
import nltk; nltk.download('stopwords')
import gensim
import gensim.corpora as corpora 
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from nltk.corpus import stopwords as nltk_stopwords

# English stop-word list used by remove_stopwords(). This rebinds the name
# `stopwords`, replacing the wordcloud stop-word set defined earlier.
stopwords = nltk_stopwords.words('english')
# Extra entries must look like the tokens they are compared with: those come
# from simple_preprocess, i.e. lowercase and without surrounding spaces, so
# ' subject' and 'RT' could never match.
stopwords.extend(['from', 'subject', 're', 'edu', 'use', 'rt'])

In [None]:
# Cleaned tweet texts as a plain list. This rebinds `tweets`, which earlier
# held the status objects returned by the API.
# NOTE(review): `df` is not defined in any visible cell -- confirm its source.
tweets = df['tweets'].values.tolist()

In [None]:
def sent_to_words(sentences):
    """Lazily yield the ``simple_preprocess`` token list (``deacc=True``) of each sentence."""
    for sentence in sentences:
        tokens = simple_preprocess(str(sentence), deacc=True)
        yield tokens

# One token list per tweet.
data_words = list(sent_to_words(tweets))

In [None]:
# Phrase models: `bigram` is trained on the token lists, `trigram` on the
# bigram-transformed token lists.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Phraser wrappers; these are what make_bigrams()/make_trigrams() apply.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Sanity check on the first tweet (raises IndexError if data_words is empty).
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
def remove_stopwords(texts):
    """Re-tokenise each document with ``simple_preprocess`` and drop tokens listed in ``stopwords``."""
    cleaned = []
    for doc in texts:
        kept = [token for token in simple_preprocess(str(doc)) if token not in stopwords]
        cleaned.append(kept)
    return cleaned

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``."""
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``."""
    merged = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Return the lemmas of each document, keeping only the allowed parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        Token lists; each one is joined with spaces and run through the
        module-level ``nlp`` pipeline.
    allowed_postags : collection of str
        Values of ``token.pos_`` to keep. The default is a tuple so it cannot
        be mutated between calls; passing a list still works.

    Returns
    -------
    list of list of str
        One list of lemmas per input document.
    """
    texts_out = []

    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Stop-word removal -> bigram merging -> lemmatisation.
tweet_no_stopwords = remove_stopwords(data_words)

tweet_bigrams = make_bigrams(tweet_no_stopwords)

# Load the model by package name: the 'en' shortcut is rejected by spaCy 3.
# Parser and NER are disabled because only lemma_ and pos_ are read.
# NOTE(review): assumes the en_core_web_sm package is installed -- confirm.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

data_lemmatized = lemmatization(tweet_bigrams)

print(data_lemmatized[:1])

In [None]:
# Dictionary built from the lemmatised documents.
id2word = corpora.Dictionary(data_lemmatized)

# Alias only; `texts` and `data_lemmatized` are the same object.
texts = data_lemmatized

# One doc2bow result per document.
corpus = [id2word.doc2bow(text) for text in texts]

print(corpus[:1])

In [None]:
# Spot check: look up the dictionary entry with id 2.
id2word[2]

In [None]:
# Show the first document's bag of words with ids replaced by dictionary entries.
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

In [None]:
import warnings
# Silences every warning for the rest of the session, not just gensim's.
warnings.filterwarnings('ignore')

# Three-topic LDA model; random_state is fixed so reruns use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus, 
                                            id2word=id2word,
                                            num_topics = 3,
                                            random_state = 100,
                                            update_every = 1, 
                                            chunksize = 100,
                                            passes = 10,
                                            alpha = 'auto',
                                            per_word_topics = True)

In [None]:
# Print the topics of the trained model.
print(lda_model.print_topics())

In [None]:
# Apply the model to the corpus; `doc_lda` is not used by any visible cell.
doc_lda = lda_model[corpus]

In [None]:
# Interactive pyLDAvis view of the model; the bare `vis` displays it as cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [None]:
# Path to the mallet executable, relative to the working directory; it is
# also read as a global by compute_coherence_values().
mallet_path =  './mallet-2.0.8/bin/mallet'
# Three-topic model via gensim's LdaMallet wrapper.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus, num_topics=3, id2word=id2word)

In [None]:
# Print the topics of the Mallet model.
print(ldamallet.show_topics(formatted=True))

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start = 2, step = 3):
    """Train one LdaMallet model per topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary
        Dictionary used both as ``id2word`` for the models and for the
        coherence computation.
    corpus
        Corpus passed to ``LdaMallet``.
    texts
        Token lists passed to ``CoherenceModel``.
    limit : int
        Exclusive upper bound of the topic counts.
    start : int
        First topic count.
    step : int
        Increment between topic counts.

    Returns
    -------
    (list, list)
        The trained models and their coherence values, index-aligned with
        ``range(start, limit, step)``.

    Notes
    -----
    Reads the module-level ``mallet_path``.
    """
    c_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the `dictionary` argument; the global `id2word` was used here
        # before, which silently ignored whatever the caller passed in.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        c_values.append(coherence_model.get_coherence())

    return model_list, c_values

In [None]:
# Sweep topic counts range(2, 40, 3) (start/step are the function defaults).
model_list, c_values = compute_coherence_values(id2word, corpus, data_lemmatized, limit=40)

In [None]:
# Display the coherence values, one per topic count in the sweep.
c_values

In [None]:
# Must mirror the arguments used in the compute_coherence_values() call so
# that x lines up with c_values.
limit, start, step = 40, 2, 3
x = range(start, limit, step)
plt.plot(x, c_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Trailing comma makes this a one-element tuple; ("coherence_values") is just
# a string, which legend() iterates character by character (label "c").
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Tabulate topic count against coherence score, rounded to 4 decimals.
for m, cv in zip(x, c_values):
    print('Num Topics = ', m, " C-Score = ", round(cv, 4))

In [None]:
# Hand-picked model: index 7 of the sweep range(2, 40, 3), i.e. num_topics = 23.
# NOTE(review): the index is hard-coded -- re-check it against the coherence
# scores whenever the data changes.
optimal_model = model_list[7]
model_topics = optimal_model.show_topics(formatted=True)

In [None]:
# Print the selected model's topics with num_words=2.
print(optimal_model.print_topics(num_words=2))