In [2]:
# Subject:      NLP - gleaning topics from 2016 presidential debate transcripts 
# Date:         11/14/2019
# Name:         Sami Ahmed
# Worked with:  (solo, mostly consulted with John) 

**Notebook objectives: lemmatize the corpus; try a second topic-modeling approach (LDA); build a topic model restricted to a single part of speech (nouns).**

# imports

In [2]:
# NLP related packages
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction import text
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# lemmatizer / NLTK
from nltk.stem import WordNetLemmatizer 
import nltk
from nltk.corpus import wordnet



# LDA specific
from gensim import matutils, models
import scipy.sparse

#misc files/text processing
import pickle
import re
import collections

#  pandas  options
import pandas as pd

# Topic model using LDA for only nouns 

In [2]:
# Function to pull out nouns from a string of text

from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Return only the nouns found in ``text``, joined by single spaces.

    The string is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    a token is kept when its part-of-speech tag begins with ``NN``.
    '''
    tagged_tokens = pos_tag(word_tokenize(text))
    kept = []
    for token, tag in tagged_tokens:
        if tag.startswith('NN'):
            kept.append(token)
    return ' '.join(kept)

In [6]:
# Load the cleaned transcripts: punctuation and stray characters have already
# been removed, but the text has not yet gone through the CountVectorizer step.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# file that comes from a trusted source.

data_clean = pd.read_pickle('data_clean_debate.pkl')
data_clean.columns

Index(['Text'], dtype='object')

In [9]:
# Display the 'Text' column of the cleaned transcripts for a visual check
data_clean['Text']

0        good evening and thank you we are happy to wel...
1        we want to also extend our warm thanks to milw...
3        welcome senator great to see you and former se...
5                            very good to be here with you
6                                                thank you
7                                      welcome to you both
8        now a word about format there will be two shor...
9        with iowa and new hampshire behind us we are n...
10       well gwen and judy thank you very much for hos...
11                               thank you senator sanders
13             thank you senator sanders secretary clinton
14       im running for president to knock down all the...
15                                          thank you both
17       thank you both and well be right back after a ...
19       and welcome back to this pbs newshour debate d...
20       well to put that in a context judy i think we ...
21       but my question is how big would government be.

In [8]:
# Remove empty areas of the corpus: drop every row whose text is the empty
# string or missing.
# The filter is applied to the frame itself. Assigning a filtered DataFrame
# back into the 'Text' column only turns the filtered rows into NaN (index
# alignment), and dropna(inplace=True) on a column selection does not reliably
# shrink data_clean, so the empty rows would survive.
# The original index labels are kept (no reset), and the cell can be re-run
# safely.
data_clean = data_clean.loc[data_clean['Text'] != ''].dropna(subset=['Text'])

In [None]:
# confirm the corpus is clean of empty areas
data_clean['Text']

In [9]:
# Run every transcript row through nouns() and keep the result as a
# one-column DataFrame, then preview the first rows
data_nouns = data_clean['Text'].apply(nouns).to_frame()
data_nouns.head()

Unnamed: 0,Text
0,evening thank debate partners facebook convers...
1,thanks radio television friends member station...
2,
3,senator secretary state clinton
4,


In [11]:
# make dict of stop words, add common debate words that came up in initial topics out of LSA 

from sklearn.feature_extraction import text 

# scikit-learn's English stop words merged with the extra debate words;
# debate_list is a second name for the same frozenset
stop_words = debate_list = text.ENGLISH_STOP_WORDS | frozenset((
    'im', 'dont', 'need', 'want', 'senator', 'governor', 'know',
    'come', 'theyre', 'youre', 'going', 'think', 'said', 'thats',
    'just', 'make', 'did', 'got', 'mr', 'ms', 'ive',
))

In [1]:
# Document-term matrix for unigrams rebuilt only using the nouns df

# max_df given as a float is a proportion of documents and must lie in
# [0.0, 1.0]. The previous value 8.5 is outside that range: recent scikit-learn
# rejects it, while older releases effectively applied no upper limit.
# 0.85 is presumably the intended cut-off -- TODO confirm.
# stop_words is passed as a list because that is the documented container type;
# list() accepts debate_list whether it is a set, frozenset or already a list.
cvn = CountVectorizer(stop_words=list(debate_list), min_df=10, max_df=0.85)
data_cvn = cvn.fit_transform(data_nouns['Text'])

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn; use whichever this installation provides.
if hasattr(cvn, 'get_feature_names_out'):
    feature_names = cvn.get_feature_names_out()
else:
    feature_names = cvn.get_feature_names()

# One row per document (same index as data_nouns), one column per term
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=feature_names, index=data_nouns.index)

In [17]:
# Create the gensim corpus.
# Sparse2Corpus is fed a term-document matrix (terms as rows, documents as
# columns), so the sparse CountVectorizer output is transposed directly rather
# than rebuilding a sparse matrix from the dense document-term DataFrame.
corpusn = matutils.Sparse2Corpus(data_cvn.T)

# Create the vocabulary dictionary: column index -> term
id2wordn = {idx: term for term, idx in cvn.vocabulary_.items()}

In [18]:
# run model with 10 topics as before
# LDA training is randomised; a fixed random_state makes the topics
# reproducible across kernel restarts and re-runs of this cell.
ldan = models.LdaModel(corpus=corpusn, num_topics=10, id2word=id2wordn, passes=10,
                       random_state=42)
ldan.print_topics()

[(0,
  '0.079*"trump" + 0.051*"bush" + 0.045*"immigration" + 0.040*"sanders" + 0.037*"carson" + 0.036*"record" + 0.027*"dr" + 0.023*"issue" + 0.020*"immigrants" + 0.019*"guy"'),
 (1,
  '0.070*"tax" + 0.044*"government" + 0.043*"money" + 0.042*"plan" + 0.037*"taxes" + 0.034*"percent" + 0.029*"economy" + 0.027*"business" + 0.022*"street" + 0.019*"dollars"'),
 (2,
  '0.085*"people" + 0.045*"country" + 0.018*"america" + 0.016*"care" + 0.015*"years" + 0.014*"health" + 0.014*"way" + 0.013*"president" + 0.012*"life" + 0.011*"states"'),
 (3,
  '0.031*"president" + 0.029*"deal" + 0.026*"court" + 0.023*"donald" + 0.022*"states" + 0.022*"hes" + 0.021*"people" + 0.021*"party" + 0.019*"thing" + 0.017*"trump"'),
 (4,
  '0.075*"debate" + 0.057*"candidates" + 0.055*"time" + 0.046*"rubio" + 0.044*"cruz" + 0.025*"questions" + 0.023*"thank" + 0.021*"seconds" + 0.021*"kasich" + 0.019*"candidate"'),
 (5,
  '0.050*"jobs" + 0.025*"years" + 0.024*"people" + 0.023*"things" + 0.022*"country" + 0.018*"state" + 0

# lemmatize corpus 

In [10]:
# Create the WordNet lemmatizer instance that the lemmatization step calls
# NOTE(review): presumably requires the NLTK 'wordnet' corpus to be available
# locally -- confirm before a fresh run.

lemmatizer = WordNetLemmatizer()


In [11]:
# map NLTK’s POS tags to the format wordnet lemmatizer would accept

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag``. The upper-cased first
    letter of that tag selects adjective (J), noun (N), verb (V) or adverb (R);
    any other tag falls back to noun.
    """
    _, penn_tag = nltk.pos_tag([word])[0]
    initial = penn_tag[0].upper()
    wordnet_by_initial = dict(zip(
        "JNVR",
        (wordnet.ADJ, wordnet.NOUN, wordnet.VERB, wordnet.ADV),
    ))
    return wordnet_by_initial.get(initial, wordnet.NOUN)


In [45]:
# Tokenize each transcript row, then lemmatize every token using the WordNet
# part of speech looked up for that token

def _lemmatize_tokens(document):
    """Tokenize ``document`` and return its tokens as a list of lemmas."""
    lemmas = []
    for token in nltk.word_tokenize(document):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return lemmas

data_clean_lemmed = data_clean['Text'].apply(_lemmatize_tokens)

In [46]:
# Save the lemmatized, tokenized corpus to disk as a pickle.
# I continue using this dataset in the NMF & Kmeans notebook, so the file name
# must stay in sync with what that notebook reads.

data_clean_lemmed.to_pickle("data_lemmed&tokenized.pkl")