In [1]:
import pandas as pd
import json

# Load the tweet records and collect the text of each one.
# The encoding is passed explicitly: JSON is UTF-8, whereas open() otherwise
# falls back to the platform's locale encoding, which can mis-decode or fail
# on non-ASCII tweet text.
with open('English_tweets.json', 'r', encoding='utf-8') as jsonfile:
    tweets = json.load(jsonfile)

# Each record is indexed by the 'tweet_text' key; order follows the file.
tweet_texts = [tweet['tweet_text'] for tweet in tweets]

In [2]:
# Peek at the first ten raw tweet strings.
tweet_texts[:10]

['RT @AC360: You said "what punishment of gods are not gifts. Do you really believe that?" @andersoncooper, choking back tears, asks Stephen…',
 '@carlarhoffmann @AC360 thank you!',
 'one of my favorite excerpts from my interview with @StephenAtHome. This question he embraces left me stunned when i… https://t.co/YV2s8z34Jk',
 'Come see me in #Winnipeg on Sunday, Sept 15 for a fun evening of conversation, sharing personal stories, as well as… https://t.co/fW9m3RKzKa',
 'RT @AC360: Megan Rapinoe says kneeling during the National Anthem was difficult, but not disrespectful, adding that she is hopeful there wi…',
 'excited to interview #MeganRapinoe live tonight on @AC360 8pm. what questions would you ask her? https://t.co/dq1N2ypzu1',
 'Thank you for all the lovely messages about my mom. I was lucky to have her for so long. She was the coolest lady i… https://t.co/hluZ1EJlgO',
 'Thanks for the cover @EW! https://t.co/bk5dYEggMx',
 'A great new book called #WeAreEverywhere  just came out. 

In [3]:
# One row per tweet. The column is named at construction time so that an empty
# tweet list still yields a frame with a single 'tweet_text' column instead of
# a zero-column frame.
df = pd.DataFrame(tweet_texts, columns=['tweet_text'])

In [4]:
# Name the frame's single column so later cells can refer to it as `tweet_text`.
df.columns = ['tweet_text']

In [5]:
import string, re
import textblob as tb

def clean_text(text):
    '''Normalise one tweet and return it as a space-joined string of lemmas.

    Steps, in order: lowercase the text; delete anything inside square
    brackets; delete ASCII punctuation characters; delete every word that
    contains a digit; POS-tag what is left with TextBlob and lemmatize each
    token using a part-of-speech code derived from its tag.
    '''
    cleaned = text.lower()
    for pattern in (
        r'\[.*?\]',                                # bracketed spans
        r'[%s]' % re.escape(string.punctuation),   # punctuation characters
        r'\w*\d\w*',                               # any word with a digit in it
    ):
        cleaned = re.sub(pattern, '', cleaned)

    def lemma_pos(tag):
        # Tags starting with NN map to 'n', VB to 'v'; everything else falls back to 'a'.
        if tag.startswith("NN"):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    tagged = tb.TextBlob(cleaned).pos_tags
    return ' '.join(word.lemmatize(lemma_pos(tag)) for word, tag in tagged)

# Clean every tweet; the result is a one-column frame that keeps the 'tweet_text' name.
df_clean = df['tweet_text'].apply(clean_text).to_frame()

In [6]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words settings for the cleaned tweets.
vectorizer = CountVectorizer(
    analyzer='word',
    # keep a term only if it appears in at least 3 tweets (document frequency)
    min_df=3,
    # drop the built-in English stop-word list
    stop_words='english',
    # lowercase before tokenising
    lowercase=True,
    # a token is a run of 3 or more ASCII letters/digits
    token_pattern='[a-zA-Z0-9]{3,}',
    # cap the vocabulary at the 5000 most frequent terms across the corpus
    max_features=5000,
)

# Document-term count matrix built from the cleaned tweet text.
data_vectorized = vectorizer.fit_transform(df_clean['tweet_text'])

# 10-topic LDA using the online learning method; fixed seed, all available CPUs.
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    random_state=0,
    n_jobs=-1,
)
lda_output = lda_model.fit_transform(data_vectorized)

In [7]:
import pyLDAvis
import pyLDAvis.sklearn

# Presumably switches pyLDAvis to inline notebook rendering — confirm against the pyLDAvis docs.
pyLDAvis.enable_notebook()
# Build the visualisation from the fitted LDA model, the document-term matrix and the vectorizer.
# mds='tsne' presumably selects t-SNE for the 2-D topic layout — confirm.
# NOTE(review): newer pyLDAvis releases appear to expose this module under a different name
# than `pyLDAvis.sklearn`; verify against the installed version before upgrading.
pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [8]:
import numpy as np

# Show top 20 keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the highest-weighted vocabulary terms for every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` or the
        older ``get_feature_names()``.
    lda_model : fitted model whose ``components_`` holds one row of term
        weights per topic.
    n_words : int
        How many terms to keep per topic.

    Returns
    -------
    list of numpy arrays, one per row of ``lda_model.components_``, each
    holding up to ``n_words`` terms ordered by decreasing weight.

    Note: the default ``vectorizer`` / ``lda_model`` are bound when this cell
    is executed, so re-run the cell (or pass them explicitly) after refitting.
    """
    # `get_feature_names` was removed from recent scikit-learn releases in
    # favour of `get_feature_names_out`; prefer the new name, fall back to the old.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.array(vectorizer.get_feature_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so argsort's ascending order yields the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Top-20 keyword arrays, one per topic.
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20)

# Tabulate them: one row per topic, one column per keyword rank.
df_topic_keywords = pd.DataFrame(topic_keywords)
topic_count, keyword_count = df_topic_keywords.shape
df_topic_keywords.index = ['Topic %d' % topic_no for topic_no in range(topic_count)]
df_topic_keywords.columns = ['Word %d' % rank for rank in range(keyword_count)]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,make,work,watch,good,look,talk,know,join,like,health,care,day,address,forward,just,plan,congratulation,new,start,biden
Topic 1,right,day,people,let,stand,act,just,read,end,long,love,power,ask,continue,want,stop,gun,congress,violence,attack
Topic 2,president,people,say,obama,time,trump,american,family,today,way,state,think,leader,speak,happy,tell,america,community,thing,want
Topic 3,year,com,vote,win,election,realdonaldtrump,kamalaharris,brasil,trump,student,public,leave,south,city,remember,govt,poll,voter,china,republican
Topic 4,que,thank,para,need,presidente,fight,temer,michel,mais,ncia,aovivo,governo,uma,dont,young,donald,hoje,life,matter,estamos
Topic 5,change,live,news,life,don,climate,believe,share,report,bring,hope,justice,lose,late,potus,school,medium,work,est,rio
Topic 6,india,woman,job,joebiden,nation,tulsigabbard,house,democrat,debate,hear,book,hold,pass,number,chance,like,ready,pradesh,glad,write
Topic 7,country,narendramodi,shashitharoor,bjp,arvindkejriwal,best,meet,congress,proud,indian,government,today,delhi,visit,incindia,united,modi,add,break,speech
Topic 8,amp,support,big,party,economy,great,meeting,run,important,week,yesterday,history,lead,meetthepress,deal,miss,good,war,candidate,case
Topic 9,great,help,today,campaign,pay,thanks,wish,friend,million,child,issue,team,strong,high,future,joe,tax,record,serve,know


In [9]:
# Theme label for each topic row, in row order. These single letters look like
# placeholders to be replaced with descriptive names after reading the keywords.
# The list length must equal the number of rows in df_topic_keywords, otherwise
# the column assignment below raises.
Topics_theme = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df_topic_keywords['topic_theme'] = Topics_theme

In [10]:
# Display the keyword table, which now carries the topic_theme label column.
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,...,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19,topic_theme
Topic 0,make,work,watch,good,look,talk,know,join,like,health,...,day,address,forward,just,plan,congratulation,new,start,biden,a
Topic 1,right,day,people,let,stand,act,just,read,end,long,...,power,ask,continue,want,stop,gun,congress,violence,attack,b
Topic 2,president,people,say,obama,time,trump,american,family,today,way,...,think,leader,speak,happy,tell,america,community,thing,want,c
Topic 3,year,com,vote,win,election,realdonaldtrump,kamalaharris,brasil,trump,student,...,leave,south,city,remember,govt,poll,voter,china,republican,d
Topic 4,que,thank,para,need,presidente,fight,temer,michel,mais,ncia,...,governo,uma,dont,young,donald,hoje,life,matter,estamos,e
Topic 5,change,live,news,life,don,climate,believe,share,report,bring,...,justice,lose,late,potus,school,medium,work,est,rio,f
Topic 6,india,woman,job,joebiden,nation,tulsigabbard,house,democrat,debate,hear,...,hold,pass,number,chance,like,ready,pradesh,glad,write,g
Topic 7,country,narendramodi,shashitharoor,bjp,arvindkejriwal,best,meet,congress,proud,indian,...,today,delhi,visit,incindia,united,modi,add,break,speech,h
Topic 8,amp,support,big,party,economy,great,meeting,run,important,week,...,history,lead,meetthepress,deal,miss,good,war,candidate,case,i
Topic 9,great,help,today,campaign,pay,thanks,wish,friend,million,child,...,team,strong,high,future,joe,tax,record,serve,know,j


In [11]:
# Use the theme label as the row index. Guarded and non-inplace so that
# re-running this cell is a no-op rather than a KeyError once the column has
# already been moved into the index.
if 'topic_theme' in df_topic_keywords.columns:
    df_topic_keywords = df_topic_keywords.set_index('topic_theme')

In [12]:
# Transposed view: one column per theme label, one row per keyword rank.
df_topic_keywords.T

topic_theme,a,b,c,d,e,f,g,h,i,j
Word 0,make,right,president,year,que,change,india,country,amp,great
Word 1,work,day,people,com,thank,live,woman,narendramodi,support,help
Word 2,watch,people,say,vote,para,news,job,shashitharoor,big,today
Word 3,good,let,obama,win,need,life,joebiden,bjp,party,campaign
Word 4,look,stand,time,election,presidente,don,nation,arvindkejriwal,economy,pay
Word 5,talk,act,trump,realdonaldtrump,fight,climate,tulsigabbard,best,great,thanks
Word 6,know,just,american,kamalaharris,temer,believe,house,meet,meeting,wish
Word 7,join,read,family,brasil,michel,share,democrat,congress,run,friend
Word 8,like,end,today,trump,mais,report,debate,proud,important,million
Word 9,health,long,way,student,ncia,bring,hear,indian,week,child


In [13]:
# Create Document - Topic Matrix: one row per tweet, one weight column per topic
lda_output = lda_model.transform(data_vectorized)

# column names: the theme labels that index df_topic_keywords
topicnames = df_topic_keywords.T.columns

# index names: Tweet0, Tweet1, ...
docnames = ["Tweet" + str(i) for i in range(len(df_clean))]

# Make the pandas dataframe (weights rounded to 2 decimals for readability only)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document: the column position of the largest weight.
# Taken from the unrounded matrix so that weights differing by less than the
# rounding step are not turned into artificial ties resolved by column order.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [14]:
# Preview only the first rows: the frame has one row per tweet.
df_document_topic.head(10)

topic_theme,a,b,c,d,e,f,g,h,i,j,dominant_topic
Tweet0,0.01,0.12,0.11,0.01,0.01,0.60,0.01,0.01,0.11,0.01,5
Tweet1,0.05,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05,0.05,4
Tweet2,0.01,0.16,0.01,0.01,0.01,0.01,0.73,0.01,0.01,0.01,6
Tweet3,0.54,0.01,0.01,0.01,0.01,0.13,0.27,0.01,0.01,0.01,0
Tweet4,0.02,0.02,0.68,0.02,0.02,0.02,0.02,0.18,0.02,0.02,2
Tweet5,0.01,0.30,0.01,0.01,0.01,0.30,0.30,0.01,0.01,0.01,1
Tweet6,0.01,0.12,0.01,0.01,0.68,0.01,0.12,0.01,0.01,0.01,4
Tweet7,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.37,0.03,0.37,7
Tweet8,0.38,0.01,0.01,0.01,0.01,0.01,0.17,0.01,0.37,0.01,0
Tweet9,0.01,0.01,0.01,0.01,0.01,0.14,0.01,0.01,0.01,0.76,9


In [15]:
# Attach the topic weights to the cleaned text by row position.
# reset_index(drop=True) returns a re-indexed copy, so df_document_topic itself
# is left untouched and this cell gives the same result every time it is run
# (the in-place reset leaked an extra index column on re-execution).
df_sent_topic = pd.merge(
    df_clean,
    df_document_topic.reset_index(drop=True),
    left_index=True,
    right_index=True,
)

In [16]:
# Preview only the first rows: the frame has one row per tweet.
df_sent_topic.head(10)

Unnamed: 0,tweet_text,a,b,c,d,e,f,g,h,i,j,dominant_topic
0,rt you say what punishment of god be not gift ...,0.01,0.12,0.11,0.01,0.01,0.60,0.01,0.01,0.11,0.01,5
1,carlarhoffmann thank you,0.05,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05,0.05,4
2,one of my favorite excerpt from my interview w...,0.01,0.16,0.01,0.01,0.01,0.01,0.73,0.01,0.01,0.01,6
3,come see me in winnipeg on sunday sept for a f...,0.54,0.01,0.01,0.01,0.01,0.13,0.27,0.01,0.01,0.01,0
4,rt megan rapinoe say kneeling during the natio...,0.02,0.02,0.68,0.02,0.02,0.02,0.02,0.18,0.02,0.02,2
5,excite to interview meganrapinoe live tonight ...,0.01,0.30,0.01,0.01,0.01,0.30,0.30,0.01,0.01,0.01,1
6,thank you for all the lovely message about my ...,0.01,0.12,0.01,0.01,0.68,0.01,0.12,0.01,0.01,0.01,4
7,thanks for the cover ew,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.37,0.03,0.37,7
8,a great new book call weareeverywhere just com...,0.38,0.01,0.01,0.01,0.01,0.01,0.17,0.01,0.37,0.01,0
9,rt today mark bradford be one of america ’ s m...,0.01,0.01,0.01,0.01,0.01,0.14,0.01,0.01,0.01,0.76,9


In [17]:
# Keep just the cleaned text and its dominant-topic id.
topic_theme_columns = ['tweet_text', 'dominant_topic']
df_topic_theme = df_sent_topic.loc[:, topic_theme_columns]

In [18]:
# Show the first ten tweets alongside their dominant-topic id.
df_topic_theme.head(10)

Unnamed: 0,tweet_text,dominant_topic
0,rt you say what punishment of god be not gift ...,5
1,carlarhoffmann thank you,4
2,one of my favorite excerpt from my interview w...,6
3,come see me in winnipeg on sunday sept for a f...,0
4,rt megan rapinoe say kneeling during the natio...,2
5,excite to interview meganrapinoe live tonight ...,1
6,thank you for all the lovely message about my ...,4
7,thanks for the cover ew,7
8,a great new book call weareeverywhere just com...,0
9,rt today mark bradford be one of america ’ s m...,9


In [28]:
# Write each tweet's dominant-topic id back onto its record as a string.
# Records and frame rows are matched by position: row label n belongs to tweets[n].
dominant_by_row = df_topic_theme.dominant_topic
for row_label, tweet in enumerate(tweets):
    tweet['topic'] = str(dominant_by_row[row_label])

In [30]:
# Persist the tweet records, which now carry a 'topic' field, as JSON.
with open('English_tweets_with_topic.json', 'w') as out_file:
    json.dump(obj=tweets, fp=out_file)