In [1]:
import re
import string

import gensim
import pandas as pd
from gensim import corpora
from gensim.models import LsiModel
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

In [2]:
# Read the tweet CSV (path is relative to the notebook's working directory)
# into a DataFrame; the following cell keeps only its `text` column.
tweets = pd.read_csv('tweetdata.csv')

In [4]:
# Turn every `text` value into a list of tokens: strip the characters
# '[', ']', "'" and ' ', then split what is left on commas.
# (Presumably each value is the string form of a Python list such as
# "['a', 'b']" -- TODO confirm against the CSV.)
#
# regex=False forces literal matching, so '[' and ']' are never treated as
# regular-expression syntax whatever the default of the installed pandas is.
#
# NOTE: this cell rebinds `tweets` from a DataFrame to a list of token lists,
# so it can only be run once after the CSV has been loaded.
token_text = tweets['text']
for unwanted in ('[', ']', "'", ' '):
    token_text = token_text.str.replace(unwanted, '', regex=False)
tweets = token_text.str.split(',').values.tolist()

In [7]:
# Stopword set: NLTK's English list extended with the extra token 'amp'.
stop = {*stopwords.words('english'), 'amp'}

# Set holding every ASCII punctuation character.
exclude = {*string.punctuation}

# Lemmatizer instance used by the cleaning function.
lemma = WordNetLemmatizer()

# In this function we perform the entire cleaning
def clean(doc):
    """Clean one tokenized document.

    Drops empty tokens, tokens found in the module-level `stop` set and
    tokens found in the module-level `exclude` set (single punctuation
    characters), then lemmatizes the remaining tokens with `lemma`.

    Parameters
    ----------
    doc : iterable of str
        The tokens of one document.

    Returns
    -------
    list of str
        The lemmatized tokens that survived filtering, in input order.
    """
    return [
        lemma.lemmatize(token)
        for token in doc
        # `token and ...` skips '' tokens (e.g. the result of ''.split(','))
        # so they never reach the corpus or the gensim dictionary.
        if token and token not in stop and token not in exclude
    ]

# This is the clean corpus.
# Token-list corpus: one cleaned list of tokens per tweet.
doc_clean_BOW = list(map(clean, tweets))
# String corpus: the same documents with their tokens joined by single spaces.
doc_clean = list(map(' '.join, doc_clean_BOW))

In [8]:
# Learn the vocabulary and build the raw term-count matrix in one call.
vect = CountVectorizer()
tf = vect.fit_transform(doc_clean)

In [10]:
# TF-IDF features over the cleaned string corpus.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',  # scikit-learn's built-in English stopword list
    max_features=5000,     # cap on vocabulary size
    min_df=5,              # document-frequency lower bound (absolute count)
    max_df=0.10,           # document-frequency upper bound (proportion)
)
tfidf = tfidf_vectorizer.fit_transform(doc_clean)

In [11]:
# Settings shared by the topic-model cells.
n_components = 10  # passed as n_components to the NMF and LDA models
n_top_words = 10   # number of words printed per topic

# Not referenced by the cells visible here; kept in case other code reads them.
n_samples = 2000
n_features = 1000


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing `components_`, iterated row by row; each row must
        support `.argsort()`.
    feature_names : sequence of str
        Word for each column index of `components_`.
    n_top_words : int
        How many words to print per topic.

    One line "Topic #<index>: <words>" is printed per topic, words ordered
    from highest to lowest weight, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_words = " ".join(feature_names[i] for i in top_indices)
        print("Topic #%d: %s" % (topic_idx, top_words))
    print()

In [None]:
# Fit the NMF model on the TF-IDF matrix.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `alpha_W` / `alpha_H`. The new terms are scaled
# differently, so the value has to be re-tuned rather than copied when this
# notebook is moved to a newer scikit-learn -- TODO confirm the target version.
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)


print("\nTopics in NMF model:")
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old accessor on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

In [None]:
# Map every distinct token of the cleaned corpus to an integer id.
dictionary = corpora.Dictionary(doc_clean_BOW)

# Bag-of-words vectors, one per document. Built as a list rather than a
# generator so `counter` is not left exhausted once the model has consumed it.
counter = [dictionary.doc2bow(text) for text in doc_clean_BOW]

# LSI with 10 topics
lsi = LsiModel(corpus=counter, id2word=dictionary, num_topics=10)


In [None]:
# Display all 10 topics of the 10-topic LSI model, 10 words each.
lsi.show_topics(num_topics=10, num_words=10)

In [None]:
# Rebuild the token -> id mapping so this cell does not depend on the
# 10-topic LSI cell having been run.
dictionary = corpora.Dictionary(doc_clean_BOW)

# Bag-of-words vectors, one per document. Built as a list rather than a
# generator so `counter` is not left exhausted once the model has consumed it.
counter = [dictionary.doc2bow(text) for text in doc_clean_BOW]

# LSI with 5 topics
lsi5 = LsiModel(corpus=counter, id2word=dictionary, num_topics=5)



In [None]:
# Display the topics of the 5-topic LSI model using show_topics' default arguments.
lsi5.show_topics()

In [None]:
# LDA on the raw term counts (`tf`), not on the TF-IDF matrix.
# LatentDirichletAllocation is imported in the notebook's import cell.
lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                doc_topic_prior=1,
                                learning_method='online',
                                random_state=0)

lda.fit(tf)

print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old accessor on older releases.
if hasattr(vect, "get_feature_names_out"):
    tf_feature_names = vect.get_feature_names_out()
else:
    tf_feature_names = vect.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Mexico: 
que @epn los por para mxico mexicano muro como una

Clinton's emails: 
hillary potus foundation timkaine email billclinton politico crooked lie liar

African-Americans always vote for Democrats:
vote black american reason african want win wont democrat voter

Trump's refusal to release his tax returns:
tax return release #crookedon #crookeddon vote record like pay #maga

Pro-Trump:
maga neverhillary trumppence crookedhillary trumptrain makeamericagreatagain americafirst teamtrump demexit hexit

Pro-Clinton:
imwithher strongertogether hillaryclinton voteblue clintonkaine uniteblue love join tacotrucksoneverycorner posting


Topics:
https://www.bbc.com/news/election-us-2016-37241284 (Mexico)
https://www.politico.com/magazine/story/2016/09/hillary-clinton-emails-2016-server-state-department-fbi-214307 (Clinton's emails)
https://www.nytimes.com/2016/03/16/opinion/campaign-stops/will-the-democrats-ever-face-an-african-american-revolt.html (African-Americans always vote for Democrats)
https://www.bbc.com/news/election-us-2016-36382410 (Trump's refusal to release tax returns)
Pro-Trump tweets
Pro-Clinton tweets