In [0]:
# Pip install what's not in Colab
pip install python-dotenv


Collecting python-dotenv
  Downloading https://files.pythonhosted.org/packages/cb/2a/07f87440444fdf2c5870a710b6770d766a1c7df9c827b0c90e807f1fb4c5/python_dotenv-0.13.0-py2.py3-none-any.whl
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.13.0


In [0]:
# Imports
from dotenv import load_dotenv
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from spacy.tokenizer import Tokenizer

import gensim
import json
import numpy as np
import os
import pandas as pd
import re
import spacy
import tweepy
import tweepy

# Load environment variables (python-dotenv reads them from a .env file)
load_dotenv()

# Twitter credentials come from the environment; each is None when unset
TWITTER_API_KEY = os.environ.get("TWITTER_API_KEY")
TWITTER_API_SECRET = os.environ.get("TWITTER_API_SECRET")
TWITTER_ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN")
TWITTER_ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")

# Build the OAuth handler from the app key pair, attach the user token pair,
# and create the API client used by the rest of the notebook
auth = tweepy.OAuthHandler(TWITTER_API_KEY, TWITTER_API_SECRET)
auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)


In [0]:
# Twitter handle whose timeline is analysed (Lambda School to begin with)
screen_name = "LambdaSchool"

def get_information(screen_name, count=200, client=None):
    """
    Call Twitter and retrieve a user's recent tweets.

    Parameters
    ----------
    screen_name : str
        Twitter handle whose timeline is requested.
    count : int, optional
        Number of tweets asked for in the timeline request (default 200).
    client : object, optional
        Object exposing ``user_timeline(screen_name=..., count=...,
        tweet_mode=...)``. When omitted, the notebook-level ``api`` client
        is used.

    Returns
    -------
    pandas.DataFrame
        One row per returned tweet with the columns ``tweets`` (the
        ``full_text`` field), ``retweet_count`` and ``favorite_count``.
        Fields missing from a tweet's payload are stored as ``None``.
    """
    if client is None:
        client = api

    statuses = client.user_timeline(screen_name=screen_name, count=count,
                                    tweet_mode='extended')

    rows = []
    for status in statuses:
        # _json already holds the tweet payload as a dict, so it can be read
        # directly without serialising and re-parsing it
        payload = status._json
        rows.append((payload.get('full_text'),
                     payload.get('retweet_count'),
                     payload.get('favorite_count')))

    return pd.DataFrame(rows,
                        columns=['tweets', 'retweet_count', 'favorite_count'])

# Fetch the timeline for the chosen account into the working DataFrame
df = get_information(screen_name=screen_name)


In [0]:
# Rank tweets by engagement: highest retweet count first, ties broken by
# favorite count (also descending)
df = df.sort_values(['retweet_count', 'favorite_count'],
                    ascending=[False, False])


In [0]:
# Build a cleaned copy of each tweet: slice off its final character and turn
# every double newline into a single space, then display the column
# NOTE(review): the [:-1] slice removes the last character of every tweet —
# confirm this is intended
df['tweets_clean'] = df['tweets'].map(
    lambda text: text[:-1].replace('\n\n', ' '))
df['tweets_clean']


In [0]:

# Compiled once so the pattern is not rebuilt for every tweet.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "]+"
)


def clear_emoji(text):
    """
    Remove emoji characters from a tweet.

    Every run of characters that falls inside the Unicode ranges listed in
    ``_EMOJI_PATTERN`` is deleted; all other characters (including the
    whitespace around a removed emoji) are left untouched.

    Parameters
    ----------
    text : str
        Tweet text to clean.

    Returns
    -------
    str
        ``text`` with the matched characters removed.
    """
    return _EMOJI_PATTERN.sub('', text)

# Run clear_emoji over every cleaned tweet and write the result back to the
# same column
df['tweets_clean'] = df['tweets_clean'].map(clear_emoji)


In [0]:
# Derive hashtag / mention columns from the cleaned text, then promote the
# cleaned text to the main `tweets` column

# Whitespace-separated words starting with '#', as one comma-separated string
df['tweets_hashtag'] = df['tweets_clean'].apply(
    lambda text: ','.join(
        word for word in text.split() if word.startswith('#')))

# Whitespace-separated words starting with '@', as one comma-separated string
df['tweets_mention'] = df['tweets_clean'].apply(
    lambda text: ','.join(
        word for word in text.split() if word.startswith('@')))

# The cleaned text replaces the raw text; the helper column is then dropped
df['tweets'] = df['tweets_clean']
df = df.drop(columns='tweets_clean')


In [0]:
# Commence the Natural Language Processing

nlp = spacy.load("en_core_web_lg")

# Tokenizer constructed from the model's vocabulary only (no prefix, suffix
# or infix rules are supplied here)
tokenizer = Tokenizer(nlp.vocab)

# The model's default stop words plus extra terms specific to these tweets
STOP_WORDS = nlp.Defaults.stop_words.union(["&amp", "rt", "lambda", "&amp;",
                                            "i’m", "we're", "you’re", "it’s",
                                            'thanks', 'student', 'school.'])

tokens = []

# For every tweet keep the lowercased text of each token that is not a stop
# word, not punctuation and not whitespace
for doc in tokenizer.pipe(df['tweets'], batch_size=500):

    doc_tokens = [
        token.text.lower()
        for token in doc
        if token.text.lower() not in STOP_WORDS
        and not token.is_punct
        and not token.is_space
    ]

    tokens.append(doc_tokens)

# One token list per tweet, in the same row order as df['tweets']
df['tokens'] = tokens


In [0]:
# Map every distinct token to an integer id
id2word = corpora.Dictionary(df['tokens'])

# Bag-of-words representation of each tweet
corpus = [id2word.doc2bow(text) for text in df['tokens']]

# Fit the topic model; random_state is fixed for repeatability
# NOTE(review): workers=8 is hard-coded — confirm it suits the machine that
# runs this notebook
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   random_state=723812,
                   num_topics=15,
                   passes=10,
                   workers=8)

# Top 10 words of every topic, taken from the structured (word, probability)
# pairs rather than parsed out of the formatted topic strings, so tokens that
# themselves contain a double quote are handled correctly
words = [
    [word for word, _ in terms]
    for _, terms in lda.show_topics(num_topics=-1, num_words=10,
                                    formatted=False)
]


In [0]:
# Summarise each topic by its first five words, joined with spaces
topics = [' '.join(topic_words[:5]) for topic_words in words]

# Print every topic under a numbered header, followed by a blank line
# (the loop variable is not called `id`, which would rebind the builtin)
for topic_id, summary in enumerate(topics):

    print(f"------ Topic {topic_id} ------")
    print(summary, end="\n\n")
    