In [1]:
#importing libraries
import pandas as pd
import numpy as np
import nltk
nltk.download('vader_lexicon')
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Toshiba\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [3]:
#read the data from twitter
df = pd.read_csv('Tweets.csv')
df = df[['tweets','likes','time']]

In [4]:
def text_cleaner(text,num):
    newString = text.lower()
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub(r"[0-9]", "", newString)
    newString = re.sub(',',' ', newString)
    newString = re.sub('-',' ', newString)
    newString = re.sub('_',' ', newString)
    newString = re.sub('&','and', newString)
    newString = re.sub('[()]', '', newString)
    newString = re.sub(';',' ', newString)
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString)
    newString = re.sub(r"Ä¢", "", newString)
    newString = re.sub(r"¬∑", "", newString)
    newString = re.sub(r"\'", "", newString)
    newString = re.sub(r"\"", "", newString)
    newString = re.sub(r"\n", "", newString)
    newString = re.sub(r"\r", "", newString)
    return newString.strip().lower()

In [5]:
X = []
for t in df['tweets']:
    X.append(text_cleaner(t,0))

In [6]:
#Soting list in the column back again
df['tweets'] = pd.DataFrame(X)
df['tweets'].head()

0    trumanlab discussion of adenovirus vector gene...
1    children with spinal muscular atrophy may expe...
2    ari anderson defies odds  diagnosed with spina...
3    i m raising money for cure for marcel   fighti...
4    scientific understanding of  sma improved grea...
Name: tweets, dtype: object

In [7]:
#Tokenise
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(df.tweets))

print(data_words[:4])

[['trumanlab', 'discussion', 'of', 'adenovirus', 'vector', 'gene', 'therapy', 'used', 'in', 'iirc', 'duchenne', 'muscular', 'dystrophy', 'spinal', 'muscular', 'atrophy', 'and', 'rpe', 'retinal', 'disease', 'easy', 'topic', 'to', 'make', 'interesting', 'while', 'teaching', 'basic', 'science', 'and', 'real', 'world', 'impact', 'can', 'even', 'tie', 'in', 'some', 'healthcare', 'and', 'ethics', 'discussions'], ['children', 'with', 'spinal', 'muscular', 'atrophy', 'may', 'experience', 'long', 'lasting', 'benefits', 'from', 'genetherapy', 'with', 'onasemnogene', 'abeparvovec', 'according', 'to', 'new', 'data', 'https', 'co', 'sjyclpqf', 'https', 'co', 'dibqjbltk'], ['ari', 'anderson', 'defies', 'odds', 'diagnosed', 'with', 'spinal', 'muscular', 'atrophy', 'at', 'birth', 'he', 'was', 'given', 'just', 'percent', 'chance', 'of', 'surviving', 'past', 'the', 'age', 'of', 'two', 'https', 'co', 'ohcnlmwb'], ['raising', 'money', 'for', 'cure', 'for', 'marcel', 'fighting', 'spinal', 'muscular', 'atro

In [8]:
import re, nltk, spacy, gensim
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

In [9]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

['trumanlab discussion adenovirus vector gene therapy use iirc duchenne muscular dystrophy spinal muscular atrophy rpe retinal disease easy topic make interesting teach basic science real world impact can even tie healthcare ethic discussion', 'child spinal muscular atrophy may experience long last benefit genetherapy onasemnogene abeparvovec accord new data https co sjyclpqf https co dibqjbltk']


In [10]:
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction import text
from pprint import pprint

In [11]:
add_stop_words = ['https', 'get', 'many', 'have', 'hold', 'sir','need','help','plz','make',
                  'available','pls','spinal','muscular','atrophy','vjyxqgt','fightssma','sma','asr','ayaansh']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
vectorizer = CountVectorizer(analyzer='word',
                             token_pattern='[a-zA-Z0-9]{3,}',
                             stop_words=stop_words# num chars > 3
                            )

In [12]:
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [13]:
word_list = vectorizer.get_feature_names()
count_list = data_vectorized.toarray().sum(axis=0)

In [14]:
print(dict(zip(word_list,count_list)))



In [15]:
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline
# Build LDA Model
fixed_seed = 100
lda_model = LatentDirichletAllocation(n_components=5,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',
                                      random_state=np.random.RandomState(fixed_seed),          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=5, n_jobs=-1,
                          random_state=RandomState(MT19937) at 0x25549E80468)


In [16]:
# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(df.tweets))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head().style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.01,0.01,0.97,0.01,0.01,2
Doc1,0.02,0.02,0.02,0.94,0.02,3
Doc2,0.02,0.1,0.86,0.02,0.02,2
Doc3,0.01,0.23,0.01,0.01,0.74,4
Doc4,0.01,0.01,0.95,0.01,0.01,2


In [17]:
df_document_topic=df_document_topic.loc[:,['dominant_topic']]
df_document_topic1= df_document_topic.reset_index()

In [18]:
#df_document_topic1.head(10)

In [19]:
df_topic_distribution = df_document_topic1['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,1,117
1,0,105
2,2,49
3,3,34
4,4,31


In [20]:
# # Visulaiise the topic keywords
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

In [21]:
pyLDAvis.save_html(panel, 'lda.html')

In [22]:
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20)

df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,life,old,year,india,kid,condition,degenerative,shot,shout,month,dear,sikdxot,fbyisqr,usd,wdfncbat,mam,battle,cviluxu,expensive,dkqwhjkgfe
Topic 1,zolgensma,raise,old,little,extend,treat,save,head,hyderabad,amplify,hospital,rainbow,day,share,drug,fight,evn,life,rare,month
Topic 2,suffer,child,genetic,injection,month,noor,treatment,bikaner,old,infant,rare,require,parent,medical,muscle,birth,lead,fatima,condition,crore
Topic 3,type,child,andamp,exoskeleton,csic,patient,kit,therapy,research,know,new,accurately,identify,price,expert,learn,music,affect,walk,live
Topic 4,disease,therapy,type,cure,treatment,share,reach,muhammedalisma,deserve,time,gene,baby,girl,create,live,heart,post,thousand,effect,butterfly
