# Find EDA

- Stephen W. Thomas

This notebook extracts topics from a given corpus (TF-IDF features factorized with NMF) and writes the results to CSV files.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import re
import numpy as np
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

In [2]:
# Vocabulary cap passed as max_features to the TfidfVectorizer in find_topics.
n_features = 1000
# Number of NMF components (topics) extracted in find_topics.
n_components = 75
# Number of highest-weighted terms reported per topic by get_top_words.
n_top_words = 20

from sklearn.feature_extraction import text 

# scikit-learn's built-in English stop words plus three extra tokens
# ('s', 'rt', 'br'); used by preprocessor and passed to the vectorizer.
stop_words = text.ENGLISH_STOP_WORDS.union(['s', 'rt', 'br'])

# Single WordNet lemmatizer instance reused by preprocessor for every token.
lemmer = WordNetLemmatizer()

# Lower-case English contraction -> expansion lookup used by expandContractions.
# Keys must be lower-case with straight apostrophes; callers are expected to
# normalise their text accordingly before expanding.
cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "i'd": "i would",
  "i'd've": "i would have",
  "i'll": "i will",
  "i'll've": "i will have",
  "i'm": "i am",
  "i've": "i have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you will",
  "you'll've": "you will have",
  "you're": "you are",
  "you've": "you have"
}


# Regex alternation is tried left to right, so the keys are ordered longest
# first; otherwise "can't" would match before "can't've" and leave a dangling
# "'ve" in the text. Keys are escaped so they are always matched literally.
c_re = re.compile('(%s)' % '|'.join(
    re.escape(key) for key in sorted(cList, key=len, reverse=True)))


def expandContractions(text, c_re=c_re):
    """Replace every contraction found in ``text`` with its expansion from cList.

    Parameters
    ----------
    text : str
        Input text. Matching is case-sensitive against the lower-case keys of
        cList, so the text should already be lower-cased.
    c_re : compiled regular expression, optional
        Pattern whose matches are looked up in cList. Defaults to the
        module-level pattern built from all cList keys.

    Returns
    -------
    str
        ``text`` with each match replaced by ``cList[match]``.

    Raises
    ------
    KeyError
        If a custom ``c_re`` matches a string that is not a key of cList.
    """
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)



def preprocessor(doc):
    """Clean one raw document and return it as a space-separated token string.

    Used as the ``preprocessor`` callable of the vectorizer, so it receives
    each raw document before tokenization. Steps, in order:

    1. Replace curly quotes with straight ones.
    2. Strip HTML markup with BeautifulSoup (lxml parser).
    3. Remove ``http...`` URLs and bare domain-style links
       (e.g. ``pic.twitter.com/SODA``).
    4. Re-attach ``@`` and ``#`` to the word that follows them
       ("@ DrJoe" -> "@DrJoe").
    5. Lower-case, expand contractions, drop stop words, lemmatize.
    """
    # Straighten typographic quotes so contraction keys can match later on.
    for curly, straight in ((u'’', u"'"), (u'“', u'"'), (u'”', u'"')):
        doc = doc.replace(curly, straight)

    # Keep only the visible text of any HTML markup.
    plain = BeautifulSoup(doc, "lxml").get_text()

    # Drop explicit URLs first, then scheme-less links such as pic.twitter.com/SODA.
    plain = re.sub(r'http\S+', '', plain)
    plain = re.sub(r'\b\S*(\.com|\.edu|\.net|\.gov|\.ca|\.org)(/\S*)?', '', plain)

    # Glue a detached "@" / "#" back onto the following character.
    for sigil_pattern in (r'(\@)(\s+)(.)', r'(#)(\s+)(.)'):
        plain = re.sub(sigil_pattern, r'\1\3', plain)

    expanded = expandContractions(plain.lower())

    # Whitespace tokens: filter stop words, then lemmatize what is left.
    kept = [tok for tok in expanded.split() if tok not in stop_words]
    return ' '.join([lemmer.lemmatize(tok) for tok in kept])




In [3]:
# Sanity check: run preprocessor on a single sample text and display the cleaned result.
doc = "We're months into this pandemic and still don't have proper testing, PPE, or clear nationwide guidance. Instead, Donald Trump: - Pushes dangerous, disproven drugs - Stands in the way of the CDC - Refuses to wear a mask He is failing even the most basic test of leadership."
preprocessor(doc)

'month pandemic proper testing, ppe, clear nationwide guidance. instead, donald trump: - push dangerous, disproven drug - stand way cdc - refuse wear mask failing basic test leadership.'

In [4]:
from scipy.stats import entropy


def get_top_words(H, W, feature_names, n_words=None):
    """Summarise every topic by its top terms and simple document-level metrics.

    Parameters
    ----------
    H : 2-D array
        One row per topic, one column per feature (term weights).
    W : 2-D array
        One column per topic; column ``k`` holds the per-document weights of
        topic ``k``.
    feature_names : sequence
        Maps a feature (column) index of ``H`` to its term.
    n_words : int, optional
        Number of top terms to report per topic. Defaults to the module-level
        ``n_top_words``.

    Returns
    -------
    pandas.DataFrame
        One row per topic with columns "Topic ID", "Support" (number of
        documents whose weight is non-zero after rounding to 3 decimals),
        "Weight" (sum of the topic's column in ``W``), "Avg"
        (Weight / Support, 0.0 when Support is 0) and "Word 0" ..
        "Word n_words-1", sorted by Support in descending order.
    """
    if n_words is None:
        n_words = n_top_words

    rows = []
    for topic_idx, topic in enumerate(H):
        # Feature indices ordered from highest to lowest weight.
        ranked = np.argsort(topic)[::-1][:n_words]
        top_words = [feature_names[i] for i in ranked]
        # Pad so every row matches the column list even when the vocabulary
        # has fewer than n_words terms.
        top_words += [""] * (n_words - len(top_words))

        topic_weights = W[:, topic_idx]
        support = np.count_nonzero(np.round(topic_weights, 3))
        weight = topic_weights.sum()
        # A topic with no supporting documents would otherwise divide by zero.
        avg_nonzero = weight / support if support else 0.0

        rows.append([str(topic_idx), support, weight, avg_nonzero] + top_words)

    colnames = ["Topic ID", "Support", "Weight", "Avg"] + ["Word " + str(i) for i in range(n_words)]
    return pd.DataFrame(rows, columns=colnames).sort_values(by=['Support'], ascending=False)

def print_top_docs(topic_idx, W_df, data):
    """Print ``topic_idx`` followed by the five documents weighted highest for it.

    ``W_df`` is indexed positionally by topic column; ``data`` is indexed
    positionally by document, in the same row order as ``W_df``.
    """
    print(topic_idx)
    topic_column = W_df.iloc[:, topic_idx]
    # Ascending argsort reversed -> document positions from strongest to weakest.
    ranked_positions = np.argsort(topic_column)[::-1]
    best_five = ranked_positions[0:5]
    for position in best_five:
        print(data.iloc[position])

In [5]:
from sklearn.preprocessing import normalize

def find_topics(data_samples, text_col = "tweet_text", include_text=True):
    """Fit a TF-IDF + NMF topic model on ``data_samples``.

    Parameters
    ----------
    data_samples : iterable of str
        Raw documents; each one is cleaned by ``preprocessor`` inside the
        vectorizer.
    text_col, include_text :
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    (W_df, H_df, top_words)
        ``W_df``: documents x topics weights, each row L1-normalised.
        ``H_df``: topics x terms weights, columns named by term.
        ``top_words``: per-topic summary produced by ``get_top_words``.

    The head of ``top_words`` is printed as a side effect.
    """
    # We override the token_pattern in order to keep @signs and #hashtags.
    # ngram_range must be an ordered (min_n, max_n) tuple and stop_words a
    # list: a set / frozenset only works by accident on older scikit-learn and
    # is rejected by parameter validation in newer releases.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.80, min_df=10,
                                   token_pattern = '[a-zA-Z0-9@#]+',
                                   ngram_range=(1, 3),
                                   stop_words=list(stop_words),
                                   preprocessor=preprocessor,
                                   max_features=n_features)

    tfidf = tfidf_vectorizer.fit_transform(data_samples)

    # NOTE(review): `alpha` was superseded by alpha_W / alpha_H in newer
    # scikit-learn releases; confirm the installed version before upgrading.
    nmf = NMF(n_components=n_components, random_state=1, init='nndsvda', solver='mu', alpha=.1, l1_ratio=.5)

    W = nmf.fit_transform(tfidf)
    H = nmf.components_

    # Rescale each document's row to sum to 1 so topic weights are comparable
    # across documents.
    W = normalize(W, axis=1, norm='l1')

    # Resolve the vocabulary once; prefer the newer accessor when it exists.
    names_getter = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = tfidf_vectorizer.get_feature_names
    feature_names = list(names_getter())

    top_words = get_top_words(H, W, feature_names)
    print(top_words.head())

    W_df = pd.DataFrame(W, columns=["topic {}".format(i) for i in range(n_components)])
    H_df = pd.DataFrame(H, columns=feature_names)

    return W_df, H_df, top_words


In [6]:
def do_it(file_base, id_col="id", text_col="tweet_text"):
    """Run the topic pipeline on one CSV file and write the results to ``out/``.

    Reads ``../data/<file_base>.csv``, drops rows with no text in
    ``text_col``, fits topics with ``find_topics`` and writes three files:

    * ``out/<file_base>_W.csv``: the input rows with their topic weights
    * ``out/<file_base>_H.csv``: topic x term weights
    * ``out/<file_base>_top_words.csv``: per-topic summary

    Parameters
    ----------
    file_base : str
        File name without directory or ``.csv`` extension.
    id_col : str
        Accepted for backward compatibility; not used by this function.
    text_col : str
        Name of the column holding the document text.
    """
    import os

    df = pd.read_csv('../data/'+file_base+'.csv')

    # Drop rows without any text. The index is reset because W_df comes back
    # with a fresh 0..n-1 index and pd.concat(axis=1) aligns on index labels;
    # without the reset, topic weights would be attached to the wrong rows
    # whenever a row had been dropped.
    df = df.dropna(subset=[text_col]).reset_index(drop=True)

    W_df, H_df, top_words = find_topics(df[text_col])

    # Make sure the output directory exists before writing.
    os.makedirs('out', exist_ok=True)

    pd.concat([df, W_df], axis=1).to_csv('out/'+file_base+'_W.csv', float_format='%.4f', index=False)
    H_df.to_csv('out/'+file_base+'_H.csv', float_format='%.4f', index=True)
    top_words.to_csv('out/'+file_base+'_top_words.csv', index=False)

In [7]:
# Run the topic pipeline on ../data/JoeBidenTweets.csv, reading text from the "tweet" column.
do_it(file_base="JoeBidenTweets", id_col="id", text_col="tweet")

  ' Beautiful Soup.' % self._decode_markup(markup)


  Topic ID  Support      Weight       Avg     Word 0          Word 1  \
3        3     1007  114.993659  0.114194  president  need president   
2        2      903  145.507214  0.161138     donald    donald trump   
7        7      867  116.262536  0.134098   american          people   
8        8      801   89.852129  0.112175    climate          change   
1        1      737  171.367251  0.232520         vp        vp biden   

            Word 2          Word 3     Word 4   Word 5  ...  \
3               mr    mr president       lead     term  ...   
2            trump          nation        doe   defeat  ...   
7  american people  people deserve     crisis  deserve  ...   
8   climate change          action     threat  address  ...   
1            biden            ohio  yesterday    night  ...   

               Word 10             Word 11         Word 12   Word 13  \
3    running president             running          crisis  american   
2  defeat donald trump                lead  

In [8]:
# Run the topic pipeline on ../data/imdb.small.csv, reading text from the "en" column.
do_it(file_base="imdb.small", id_col="id", text_col="en")

  ' Beautiful Soup.' % self._decode_markup(markup)


   Topic ID  Support      Weight       Avg Word 0      Word 1     Word 2  \
0         0     9559  704.769497  0.073728   make      little        end   
1         1     6285  398.904223  0.063469  movie      movies       make   
2         2     5654  356.764726  0.063100   film       films  film seen   
14       14     4985  212.009649  0.042530   like        look  look like   
24       24     4133  199.159908  0.048188   just  movie just  film just   

        Word 3       Word 4     Word 5  ...    Word 10     Word 11   Word 12  \
0     director         come      point  ...       long       right     world   
1   movie like          saw  recommend  ...     acting  movie just  horrible   
2    film like  film making     cinema  ...  recommend         saw  film doe   
14  movie like   like movie  feel like  ...  just like        feel     sound   
24       plain    just like      going  ...       crap    horrible      need   

     Word 13 Word 14   Word 15 Word 16       Word 17  Word 18 

In [9]:
# Run the topic pipeline on ../data/amazon_food_reviews_10.csv, reading text from the "reviewText" column.
do_it(file_base="amazon_food_reviews_10", id_col="reviewID", text_col="reviewText")

  ' Beautiful Soup.' % self._decode_markup(markup)


   Topic ID  Support      Weight       Avg   Word 0         Word 1  \
18       18     2010  143.104704  0.071196     like     taste like   
0         0     1902  117.038650  0.061535  product  great product   
13       13     1860  131.868792  0.070897    taste     taste like   
8         8     1846  137.548255  0.074512     good     taste good   
14       14     1690  129.017084  0.076341   flavor            try   

          Word 2       Word 3        Word 4       Word 5  ...     Word 10  \
18   really like       people           lot        think  ...        know   
0   good product          way         happy      company  ...        used   
13    taste good  taste great           doe    doe taste  ...  good taste   
8         pretty  pretty good   really good   good taste  ...       tasty   
14          mild  good flavor  great flavor  flavor good  ...      strong   

          Word 11      Word 12      Word 13     Word 14       Word 15  \
18            try     probably  flavor like

In [10]:
# Run the topic pipeline on ../data/obama_tweets.csv, reading text from the "tweet_text" column.
do_it(file_base="obama_tweets", id_col="id", text_col="tweet_text")

  ' Beautiful Soup.' % self._decode_markup(markup)


   Topic ID  Support      Weight       Avg         Word 0           Word 1  \
0         0     1319  207.450842  0.157279          obama  president obama   
15       15      458   94.848911  0.207094  #actonclimate            clean   
1         1      442   96.899283  0.219229         change          climate   
17       17      384   70.210363  0.182839       american          million   
8         8      361   78.917437  0.218608            job          private   

              Word 2           Word 3                        Word 4  \
0          president           nation                        future   
15            energy            power                          plan   
1     climate change    #actonclimate  climate change #actonclimate   
17  million american  american worker                        worker   
8             sector   private sector                        growth   

                  Word 5  ...    Word 10          Word 11  \
0                federal  ...  americans   

In [11]:
# Run the topic pipeline on ../data/kiva_cleaned.csv, reading text from the "en_clean" column.
do_it(file_base="kiva_cleaned", id_col="loan_id", text_col="en_clean")

  ' Beautiful Soup.' % self._decode_markup(markup)


   Topic ID  Support      Weight       Avg    Word 0          Word 1  \
0         0     5147  268.206925  0.052109  business            year   
37       37     3014  145.498578  0.048274    school         primary   
17       17     2944  151.974144  0.051622     small  small business   
35       35     2616  108.472440  0.041465    income        increase   
39       39     2513  231.857754  0.092263       100             200   

             Word 2            Word 3         Word 4            Word 5  ...  \
0               old          year old           grow           product  ...   
37   primary school             child      secondary  secondary school  ...   
17            start  business selling       business              life  ...   
35  increase income            family  family income        supplement  ...   
39       requesting   requesting loan           lack         transport  ...   

        Word 10               Word 11        Word 12  Word 13         Word 14  \
0      cust

In [12]:
# Run the topic pipeline on ../data/elonmusk_tweets.csv, reading text from the "text" column.
do_it(file_base="elonmusk_tweets", id_col="id", text_col="text")

  ' Beautiful Soup.' % self._decode_markup(markup)
  markup
  markup
  markup


   Topic ID  Support      Weight       Avg  Word 0       Word 1    Word 2  \
0         0      409  110.146960  0.269308   tesla  tesla model   vehicle   
2         2      299   66.574882  0.222658  launch         live  tomorrow   
6         6      285   74.675183  0.262018   model  tesla model    review   
15       15      267   58.644813  0.219643  rocket            w    flight   
13       13      241   62.322000  0.258598     car     best car  electric   

                Word 3       Word 4      Word 5  ...          Word 10  \
0               charge        motor       owner  ...               la   
2                   et      weather        cape  ...           window   
6   @teslamotors model      model x     model 3  ...           safety   
15              engine  grasshopper      thrust  ...  falcon 9 rocket   
13           important        owner  production  ...            drive   

        Word 11        Word 12    Word 13         Word 14     Word 15  \
0   tesla owner  tesla mo

In [13]:
# Run the topic pipeline on ../data/2017_trump_tweets.csv, reading text from the "tweet" column.
do_it(file_base="2017_trump_tweets", id_col="id", text_col="tweet")

  ' Beautiful Soup.' % self._decode_markup(markup)
  markup
  markup
  markup
  markup


   Topic ID  Support       Weight       Avg            Word 0  \
1         1     9309  1740.069211  0.186923  @realdonaldtrump   
12       12     4886   803.363908  0.164422             obama   
4         4     4562   977.496570  0.214269             great   
3         3     4295   836.511643  0.194764            donald   
23       23     4195   687.396804  0.163861      @barackobama   

                    Word 1       Word 2                   Word 3  \
1                      yes      awesome                     wait   
12         president obama    obamacare                     iran   
4   @realdonaldtrump great    great job                      guy   
3             donald trump        trump  @realdonaldtrump donald   
23                    cont  @mittromney                 campaign   

                    Word 4                Word 5  ...             Word 10  \
1   @realdonaldtrump great  #celebrityapprentice  ...  #trumpforpresident   
12                election                  ca

In [14]:
# Run the topic pipeline on ../data/pence_tweets.csv, reading text from the "tweet_text" column.
do_it(file_base="pence_tweets", id_col="id", text_col="tweet_text")

  ' Beautiful Soup.' % self._decode_markup(markup)


  Topic ID  Support      Weight       Avg        Word 0       Word 1  \
0        0      649  146.779251  0.226162      @indiana          edc   
3        3      591  108.506072  0.183597           new          job   
1        1      554  107.555009  0.194143       indiana  edc indiana   
4        4      501   99.913060  0.199427       hoosier  000 hoosier   
2        2      451   96.989106  0.215053  @firstladyin          #in   

                 Word 2       Word 3            Word 4             Word 5  \
0          @indiana edc     #indiana  #astatethatworks  #inregionalcities   
3               new job     creating              plan          announced   
1  @indiana edc indiana  agriculture           central      indiana state   
4           hoosier job          000     hoosier state          workforce   
2                  lady        #flcf           talking          christmas   

   ...             Word 10 Word 11       Word 12               Word 13  \
0  ...               #tech   #

In [15]:
# Run the topic pipeline on ../data/reutersCSV.csv, reading text from the "doc.text" column.
do_it(file_base="reutersCSV", id_col="pid", text_col="doc.text")

  ' Beautiful Soup.' % self._decode_markup(markup)


   Topic ID  Support      Weight       Avg    Word 0        Word 1    Word 2  \
1         1    10351  544.916602  0.052644   company  company said      said   
12       12     7653  391.135532  0.051109         1             2       1 2   
31       31     7298  500.927863  0.068639  mln dlrs           mln      dlrs   
11       11     7210  410.434907  0.056926       pct          year     5 pct   
4         4     6635  397.973836  0.059981     trade         japan  japanese   

           Word 3  Word 4        Word 5  ...     Word 10  Word 11     Word 12  \
1     said reuter    plan  said company  ...    continue  expects  management   
12            2 1     1 1       1 2 pct  ...       1 mln      1 3           7   
31  mln dlrs year   4 mln   dlrs reuter  ...  5 mln dlrs    1 mln   dlrs year   
11       pct year    rose         3 pct  ...       7 pct    1 pct      growth   
4            said  tariff         world  ...      market   export     surplus   

       Word 13       Word 14    

In [16]:
# Run the topic pipeline on ../data/ISKON_IMB767-XLS-ENG.csv, reading text from the "text" column.
do_it(file_base="ISKON_IMB767-XLS-ENG", id_col="ID", text_col="text")

  ' Beautiful Soup.' % self._decode_markup(markup)


   Topic ID  Support      Weight       Avg  Word 0        Word 1  \
15       15     2486  186.897064  0.075180  temple  visit temple   
0         0     2357  232.600945  0.098685    main   main temple   
11       11     1914  163.995300  0.085682   place    good place   
45       45     1257  112.293919  0.089335   visit  visit temple   
32       32     1014  112.239750  0.110690    food    restaurant   

             Word 2          Word 3        Word 4       Word 5  ...  \
15      temple good  krishna temple        modern          big  ...   
0            temple             way          hall         shop  ...   
11  beautiful place  peaceful place      pleasant  visit place  ...   
45     temple visit     visit place  visit iskcon   good visit  ...   
32            taste           tasty       variety   vegetarian  ...   

         Word 10           Word 11        Word 12           Word 13  \
15    big temple  beautiful temple  modern temple  temple bangalore   
0           exit      

In [17]:
# Run the topic pipeline on ../data/reviews_Grocery_and_Gourmet_Food_5_50000.csv, reading text from the "reviewText" column.
do_it(file_base="reviews_Grocery_and_Gourmet_Food_5_50000", id_col="reviewID", text_col="reviewText")

  ' Beautiful Soup.' % self._decode_markup(markup)


   Topic ID  Support       Weight       Avg  Word 0        Word 1  \
0         0    25397  1318.337718  0.051909    like    taste like   
9         9    21257  1150.331839  0.054115       1             2   
67       67    20937   970.074548  0.046333     try       decided   
12       12    20194  1230.732804  0.060945  flavor  great flavor   
10       10    20068  1283.634622  0.063964   taste    taste like   

         Word 2       Word 3       Word 4       Word 5  ... Word 10  \
0           doe  really like    just like   like taste  ...   stuff   
9           fat            3            5      calorie  ...     1 2   
67    different      thought         year  decided try  ...  wanted   
12  good flavor       strong      texture  flavor good  ...     doe   
10   taste good          doe  taste great    doe taste  ...     bad   

        Word 11      Word 12   Word 13       Word 14      Word 15 Word 16  \
0   like coffee    feel like  probably          feel  tasted like    food   
9   

In [18]:
# Run the topic pipeline on ../data/vaers2.csv, reading text from the "SYMPTOM_TEXT" column.
do_it(file_base="vaers2", id_col="VAERS_ID", text_col="SYMPTOM_TEXT")

  if (await self.run_code(code, result,  async_=asy)):
  ' Beautiful Soup.' % self._decode_markup(markup)


   Topic ID  Support       Weight       Avg    Word 0             Word 1  \
0         0    29952  1934.085882  0.064573   patient  patient developed   
2         2    28012  1481.919058  0.052903       day              2 day   
31       31    24246  1301.990087  0.053699  benadryl            treated   
13       13    22031  1667.537814  0.075691         b            subject   
6         6    21919  1852.834846  0.084531        pt       pt developed   

         Word 2            Word 3       Word 4       Word 5  ...    Word 10  \
0      reported  patient received     received  information  ...  physician   
2         later         day later  day vaccine      vaccine  ...  following   
31           po                er    urticaria        hives  ...        ice   
13      engerix         engerix b    b vaccine    hepatitis  ...   received   
6   pt received         developed     reported     received  ...  physician   

           Word 11           Word 12   Word 13    Word 14     Word 1

In [19]:
# Run the topic pipeline on ../data/Hillary_Emails.csv, reading text from the "ExtractedBodyText" column.
do_it(file_base="Hillary_Emails", id_col="Id", text_col="ExtractedBodyText")

  ' Beautiful Soup.' % self._decode_markup(markup)
  markup
  markup


   Topic ID  Support      Weight       Avg   Word 0  Word 1     Word 2  \
3         3     2608  531.334202  0.203732     said   obama  president   
2         2     1690  208.505882  0.123376        h    2009         pm   
8         8     1410  168.203102  0.119293       pm  office         00   
58       58      869  107.421964  0.123616  hillary    2010   february   
0         0      815  513.325876  0.629848      fyi      fw      issue   

       Word 3     Word 4            Word 5  ...   Word 10  Word 11 Word 12  \
3         new      state            people  ...    policy       mr   house   
2        pm h         fw            sunday  ...        12  tuesday       3   
8   secretary         30  secretary office  ...      room    30 pm  arrive   
58    october          7           january  ...  november        6    2012   
0        news  discussed             pm fw  ...   meeting  article    note   

   Word 13  Word 14   Word 15   Word 16    Word 17       Word 18  \
3    party  foreig