# Find EDA

- Stephen W. Thomas

This script does a lot of common EDA (word count, phrase count, etc.) on a given corpus and outputs the results

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import  CountVectorizer
import re
import numpy as np
import scipy.stats as stats
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from bs4 import BeautifulSoup

In [2]:
from sklearn.feature_extraction import text 

stop_words = text.ENGLISH_STOP_WORDS.union(['s', 'rt', 'br'])

stemmer = PorterStemmer()
lemmer = WordNetLemmatizer()

cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}


c_re = re.compile('(%s)' % '|'.join(cList.keys()))


def expandContractions(text, c_re=c_re):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)



# Simple preprocessor.
# Note that this function will be called on each document before stop words are 
# removed, before lowercases, and before tokenization. I.e., the raw documents go into this function.
def preprocessor(doc):

    doc = doc.replace(u'’', u"'")
    doc = doc.replace(u'“', u'"')
    doc = doc.replace(u'”', u'"')
    
    # Remove HTML tags
    doc = BeautifulSoup(doc, "lxml").get_text()
    
    # Remove URLs
    doc = re.sub(r'http\S+', '', doc)
    
    # remove URLS like pic.twitter.com/SODA
    doc = re.sub(r'\b\S*(\.com|\.edu|\.net|\.gov|\.ca|\.org)(/\S*)?', '', doc)
    
    # Make strings like "@ DrJoe" become "@DrJoe"
    doc = re.sub(r'(\@)(\s+)(.)', r'\1\3', doc)
    
    # Make strings like "# DrJoe" become "#DrJoe"
    doc = re.sub(r'(#)(\s+)(.)', r'\1\3', doc)
    
    doc = expandContractions(doc)
    
    #spacy_doc = nlp(doc)
    #doc = " ".join([token.lemma_ for token in spacy_doc])
    
    #Lowercase
    doc = doc.lower()
    
    doc = ' '.join([w for w in doc.split() if w not in stop_words])
    
    doc = ' '.join([lemmer.lemmatize(w) for w in doc.split()])
    return doc

def find_counts(data, text_col = "tweet_text", min_df=2, ngrams=(1,1)):   
    
    # Quick and dirty counter of terms and tokens (before we whittle down later)
    results = Counter()
    data_pre = data.apply(preprocessor)
    data_pre.str.split().apply(results.update)
    
    n_docs = data.shape[0]
    n_terms = len(results)
    n_tokens = sum(results.values())
    
    print('Number of documents: {}'.format(n_docs))
    print('Number of word forms (terms): {}'.format(n_terms))
    print('Number of words (tokens): {}'.format(n_tokens))
    print('Mean words per document: {:.1f}'.format(n_tokens / n_docs))
    print('Mean term occurance: {:.1f}'.format(np.mean(list(results.values()))))
    for m in [1, 5, 10, 100]:
        vs = {k:v for (k, v) in results.items() if v <= m}
        print('Number (Pct) of terms occuring <= {}: {} ({:.1f})'.format(m, len(vs), 100*len(vs)/n_terms))
        
    
    # We override the token_pattern in order to keep @signs and #hashtags
    vec = CountVectorizer(      preprocessor=preprocessor,
                                token_pattern = '[a-zA-Z0-9@#]+',
                                stop_words=stop_words,
                                lowercase=True,
                                min_df=min_df,
                                ngram_range=ngrams,
                                max_features=10000)
    
    bow = vec.fit_transform(data)
    vocab = vec.get_feature_names()
    tdm = pd.DataFrame(bow.toarray(), columns=vocab)
        
    
    n_tokens = sum(tdm.sum())
    n_docs = tdm.shape[0]
    phrases = list(tdm.columns)
    counts = pd.DataFrame(data={'Phrase': phrases, 
                                'Characters': [len(x) for x in phrases],
                                'Terms': [x.count(' ')+1 for x in phrases],
                                'Count': tdm.sum(),
                                'Count Pct': tdm.sum() / n_tokens,
                                'Docs': tdm.astype(bool).sum(),
                                'Docs Pct': tdm.astype(bool).sum() / n_docs,
                          })
    
    counts = counts.sort_values(by=['Count'], ascending=False)
    
    print('Top {} words:'.format(num_words_to_print))
    print(counts.head(num_words_to_print))
    print('\nBottom {} words:'.format(num_words_to_print))
    print(counts.tail(num_words_to_print))
    
    
    return tdm, vocab, counts 


In [3]:
num_words_to_print=25

def do_it(file_base, id_col="id", text_col="tweet_text", target_cols=[]):
    df = pd.read_csv('../data/'+file_base+'.csv')
    
    # Drop rows without any text
    df = df.dropna(subset=[text_col])
    
    #print("\nUnigrams")
    tdm, vocab, counts = find_counts(df[text_col], min_df=10, ngrams=(1,3))
    tdm.to_csv('out/'+file_base+'_tdm.csv', index=False)
    counts.to_csv('out/'+file_base+'_counts.csv', index=False)
    
    
    # Now, lets compute the counts and percentages per target level
    for target_col in target_cols:
        tdm_tmp = tdm
        col_name = 'target_'+target_col
        tdm_tmp[col_name] = df[target_col]
        s = tdm_tmp.groupby(by=[col_name]).sum().T
        n_levels=s.shape[1]
        s['Total'] = s.sum(axis=1)
        for i in range(0, n_levels):
            new_col_name = str(s.columns[i]) + ' Pct'
            s[new_col_name] = s.iloc[:,i] / s['Total']
        s.to_csv('out/'+file_base+'_counts_'+target_col+'.csv', index=True)
                

In [4]:
do_it(file_base="JoeBidenTweets", id_col="id", text_col="tweet")

Number of documents: 4694
Number of word forms (terms): 12584
Number of words (tokens): 72235
Mean words per document: 15.4
Mean term occurance: 5.7
Number (Pct) of terms occuring <= 1: 6967 (55.4)
Number (Pct) of terms occuring <= 5: 10549 (83.8)
Number (Pct) of terms occuring <= 10: 11400 (90.6)
Number (Pct) of terms occuring <= 100: 12497 (99.3)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
                    Phrase  Characters  Terms  Count  Count Pct  Docs  \
president        president           9      1   1093   0.017811   986   
trump                trump           5      1    981   0.015986   916   
biden                biden           5      1    564   0.009191   550   
american          american           8      1    560   0.009126   520   
donald              donald           6      1    555   0.009044   549   
donald trump  donald trump          12      2    546   0.008898   540   
need                  need           4      1    526   0.008572   468   
country            country           7      1    462   0.007529   450   
vp                      vp           2      1    451   0.007349   450   
today                today           5      1    426   0.006942   416   
day                    day           3      1    418   0.006812   384   
nation              nation           6      1    409   0.006665   394   
people              people           

In [5]:
do_it(file_base="amazon_food_reviews_10", id_col="reviewID", text_col="reviewText", target_cols=['overall'])

Number of documents: 4999
Number of word forms (terms): 32278
Number of words (tokens): 219633
Mean words per document: 43.9
Mean term occurance: 6.8
Number (Pct) of terms occuring <= 1: 20622 (63.9)
Number (Pct) of terms occuring <= 5: 27813 (86.2)
Number (Pct) of terms occuring <= 10: 29511 (91.4)
Number (Pct) of terms occuring <= 100: 31928 (98.9)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
              Phrase  Characters  Terms  Count  Count Pct  Docs  Docs Pct
like            like           4      1   3146   0.015443  1955  0.391078
taste          taste           5      1   2819   0.013838  1889  0.377876
good            good           4      1   2453   0.012041  1785  0.357071
flavor        flavor           6      1   2378   0.011673  1556  0.311262
coffee        coffee           6      1   1987   0.009754   737  0.147429
just            just           4      1   1854   0.009101  1383  0.276655
tea              tea           3      1   1585   0.007780   508  0.101620
product      product           7      1   1492   0.007324  1018  0.203641
great          great           5      1   1463   0.007181  1198  0.239648
really        really           6      1   1301   0.006386   985  0.197039
make            make           4      1   1277   0.006268  1004  0.200840
love            love           4      1   1190   0.005841   945  0.189038
little        little    

In [6]:
do_it(file_base="imdb.small", id_col="id", text_col="en", target_cols=['rating', 'score'])

Number of documents: 10000
Number of word forms (terms): 144168
Number of words (tokens): 1122017
Mean words per document: 112.2
Mean term occurance: 7.8
Number (Pct) of terms occuring <= 1: 92320 (64.0)
Number (Pct) of terms occuring <= 5: 125007 (86.7)
Number (Pct) of terms occuring <= 10: 132626 (92.0)
Number (Pct) of terms occuring <= 100: 142569 (98.9)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
              Phrase  Characters  Terms  Count  Count Pct  Docs  Docs Pct
movie          movie           5      1  19524   0.019180  6383    0.6383
film            film           4      1  18016   0.017699  5796    0.5796
like            like           4      1   8149   0.008006  4671    0.4671
just            just           4      1   6861   0.006740  4128    0.4128
good            good           4      1   5928   0.005824  3769    0.3769
time            time           4      1   5755   0.005654  3840    0.3840
story          story           5      1   4978   0.004890  3135    0.3135
make            make           4      1   4847   0.004762  3328    0.3328
character  character           9      1   4758   0.004674  2927    0.2927
really        really           6      1   4545   0.004465  2953    0.2953
did              did           3      1   4187   0.004113  2805    0.2805
doe              doe           3      1   3894   0.003825  2709    0.2709
scene          scene    

In [7]:
do_it(file_base="kiva_cleaned", id_col="loan_id", text_col="en_clean", target_cols=['status','country', 'gender', 'nonpayment'])

Number of documents: 6802
Number of word forms (terms): 29789
Number of words (tokens): 510920
Mean words per document: 75.1
Mean term occurance: 17.2
Number (Pct) of terms occuring <= 1: 14088 (47.3)
Number (Pct) of terms occuring <= 5: 22758 (76.4)
Number (Pct) of terms occuring <= 10: 25169 (84.5)
Number (Pct) of terms occuring <= 100: 28957 (97.2)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
              Phrase  Characters  Terms  Count  Count Pct  Docs  Docs Pct
business    business           8      1  13074   0.019000  5392  0.792708
loan            loan           4      1  11374   0.016530  6090  0.895325
year            year           4      1   6938   0.010083  4618  0.678918
sell            sell           4      1   5008   0.007278  3460  0.508674
school        school           6      1   4744   0.006894  3018  0.443693
buy              buy           3      1   4426   0.006432  3238  0.476036
children    children           8      1   4270   0.006206  3520  0.517495
old              old           3      1   4129   0.006001  3683  0.541458
family        family           6      1   4114   0.005979  2917  0.428844
child          child           5      1   4068   0.005912  3002  0.441341
group          group           5      1   3856   0.005604  1820  0.267568
year old    year old           8      2   3849   0.005594  3554  0.522493
able            able    

In [8]:
do_it(file_base="obama_tweets", id_col="id", text_col="tweet_text")

Number of documents: 3230
Number of word forms (terms): 6692
Number of words (tokens): 29180
Mean words per document: 9.0
Mean term occurance: 4.4
Number (Pct) of terms occuring <= 1: 4076 (60.9)
Number (Pct) of terms occuring <= 5: 5860 (87.6)
Number (Pct) of terms occuring <= 10: 6223 (93.0)
Number (Pct) of terms occuring <= 100: 6666 (99.6)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
                          Phrase  Characters  Terms  Count  Count Pct  Docs  \
president              president           9      1   1271   0.047095  1262   
obama                      obama           5      1   1165   0.043167  1163   
president obama  president obama          15      2   1136   0.042093  1136   
#actonclimate      #actonclimate          13      1    307   0.011375   307   
change                    change           6      1    236   0.008745   230   
american                american           8      1    223   0.008263   222   
watch                      watch           5      1    210   0.007781   210   
climate                  climate           7      1    207   0.007670   199   
@whitehouse          @whitehouse          11      1    201   0.007448   201   
time                        time           4      1    190   0.007040   188   
today                      today           5      1    179   0.006633   179   
climate change    climate change      

In [9]:
do_it(file_base="elonmusk_tweets", id_col="id", text_col="text")

  markup
  markup
  markup


Number of documents: 2819
Number of word forms (terms): 9431
Number of words (tokens): 25109
Mean words per document: 8.9
Mean term occurance: 2.7
Number (Pct) of terms occuring <= 1: 6296 (66.8)
Number (Pct) of terms occuring <= 5: 8669 (91.9)
Number (Pct) of terms occuring <= 10: 9098 (96.5)
Number (Pct) of terms occuring <= 100: 9420 (99.9)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
                    Phrase  Characters  Terms  Count  Count Pct  Docs  \
tesla                tesla           5      1    342   0.027297   335   
model                model           5      1    244   0.019475   233   
@spacex            @spacex           7      1    172   0.013728   171   
just                  just           4      1    162   0.012930   161   
@teslamotors  @teslamotors          12      1    159   0.012691   158   
launch              launch           6      1    157   0.012531   145   
car                    car           3      1    150   0.011972   139   
rocket              rocket           6      1    142   0.011334   142   
good                  good           4      1    140   0.011174   139   
like                  like           4      1    120   0.009578   114   
falcon              falcon           6      1    108   0.008620   106   
yes                    yes           3      1    101   0.008061   101   
1                        1           

In [10]:
do_it(file_base="2017_trump_tweets", id_col="id", text_col="tweet")

  markup
  markup
  markup
  markup


Number of documents: 30385
Number of word forms (terms): 57771
Number of words (tokens): 290035
Mean words per document: 9.5
Mean term occurance: 5.0
Number (Pct) of terms occuring <= 1: 39447 (68.3)
Number (Pct) of terms occuring <= 5: 51735 (89.6)
Number (Pct) of terms occuring <= 10: 54232 (93.9)
Number (Pct) of terms occuring <= 100: 57423 (99.4)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
                            Phrase  Characters  Terms  Count  Count Pct  Docs  \
@realdonaldtrump  @realdonaldtrump          16      1   8516   0.031999  8448   
trump                        trump           5      1   4550   0.017097  4311   
great                        great           5      1   3852   0.014474  3616   
thanks                      thanks           6      1   2127   0.007992  2115   
thank                        thank           5      1   1959   0.007361  1934   
president                president           9      1   1846   0.006936  1796   
donald                      donald           6      1   1698   0.006380  1669   
just                          just           4      1   1669   0.006271  1627   
obama                        obama           5      1   1458   0.005479  1409   
people                      people           6      1   1339   0.005031  1286   
like                          like           4      1   1241   0.004663  1204   
america       

In [11]:
do_it(file_base="pence_tweets", id_col="id", text_col="tweet_text")

Number of documents: 3339
Number of word forms (terms): 9497
Number of words (tokens): 37374
Mean words per document: 11.2
Mean term occurance: 3.9
Number (Pct) of terms occuring <= 1: 5574 (58.7)
Number (Pct) of terms occuring <= 5: 8291 (87.3)
Number (Pct) of terms occuring <= 10: 8855 (93.2)
Number (Pct) of terms occuring <= 100: 9478 (99.8)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
                    Phrase  Characters  Terms  Count  Count Pct  Docs  \
@indiana          @indiana           8      1    519   0.019511   513   
indiana            indiana           7      1    519   0.019511   506   
edc                    edc           3      1    508   0.019098   503   
@indiana edc  @indiana edc          12      2    508   0.019098   503   
hoosier            hoosier           7      1    453   0.017030   444   
state                state           5      1    329   0.012368   314   
today                today           5      1    325   0.012218   324   
@firstladyin  @firstladyin          12      1    320   0.012030   320   
@govpencein    @govpencein          11      1    276   0.010376   276   
new                    new           3      1    274   0.010301   252   
job                    job           3      1    264   0.009925   257   
w                        w           1      1    262   0.009850   253   
#indiana          #indiana           

In [12]:
do_it(file_base="reutersCSV", id_col="pid", text_col="doc.text")

Number of documents: 19779
Number of word forms (terms): 106064
Number of words (tokens): 1631684
Mean words per document: 82.5
Mean term occurance: 15.4
Number (Pct) of terms occuring <= 1: 57107 (53.8)
Number (Pct) of terms occuring <= 5: 86891 (81.9)
Number (Pct) of terms occuring <= 10: 93905 (88.5)
Number (Pct) of terms occuring <= 100: 103810 (97.9)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
            Phrase  Characters  Terms  Count  Count Pct   Docs  Docs Pct
said          said           4      1  53019   0.026081  15375  0.777340
mln            mln           3      1  25716   0.012650   8341  0.421710
dlrs          dlrs           4      1  20760   0.010212   7730  0.390819
reuter      reuter           6      1  19139   0.009415  19076  0.964457
pct            pct           3      1  17213   0.008468   6010  0.303858
1                1           1      1  16849   0.008288   7170  0.362506
v                v           1      1  14830   0.007295   3192  0.161383
year          year           4      1  13658   0.006719   6450  0.326103
000            000           3      1  13427   0.006605   4761  0.240710
company    company           7      1  10984   0.005403   6022  0.304464
2                2           1      1  10415   0.005123   5483  0.277213
billion    billion           7      1  10317   0.005075   3264  0.165024
u                u           1      1

In [13]:
do_it(file_base="JoeBidenTweets", id_col="id", text_col="tweet")

Number of documents: 4694
Number of word forms (terms): 12584
Number of words (tokens): 72235
Mean words per document: 15.4
Mean term occurance: 5.7
Number (Pct) of terms occuring <= 1: 6967 (55.4)
Number (Pct) of terms occuring <= 5: 10549 (83.8)
Number (Pct) of terms occuring <= 10: 11400 (90.6)
Number (Pct) of terms occuring <= 100: 12497 (99.3)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
                    Phrase  Characters  Terms  Count  Count Pct  Docs  \
president        president           9      1   1093   0.017811   986   
trump                trump           5      1    981   0.015986   916   
biden                biden           5      1    564   0.009191   550   
american          american           8      1    560   0.009126   520   
donald              donald           6      1    555   0.009044   549   
donald trump  donald trump          12      2    546   0.008898   540   
need                  need           4      1    526   0.008572   468   
country            country           7      1    462   0.007529   450   
vp                      vp           2      1    451   0.007349   450   
today                today           5      1    426   0.006942   416   
day                    day           3      1    418   0.006812   384   
nation              nation           6      1    409   0.006665   394   
people              people           

In [14]:
do_it(file_base="amazon_food_reviews_10", id_col="reviewID", text_col="reviewText", target_cols=['overall'])

Number of documents: 4999
Number of word forms (terms): 32278
Number of words (tokens): 219633
Mean words per document: 43.9
Mean term occurance: 6.8
Number (Pct) of terms occuring <= 1: 20622 (63.9)
Number (Pct) of terms occuring <= 5: 27813 (86.2)
Number (Pct) of terms occuring <= 10: 29511 (91.4)
Number (Pct) of terms occuring <= 100: 31928 (98.9)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
              Phrase  Characters  Terms  Count  Count Pct  Docs  Docs Pct
like            like           4      1   3146   0.015443  1955  0.391078
taste          taste           5      1   2819   0.013838  1889  0.377876
good            good           4      1   2453   0.012041  1785  0.357071
flavor        flavor           6      1   2378   0.011673  1556  0.311262
coffee        coffee           6      1   1987   0.009754   737  0.147429
just            just           4      1   1854   0.009101  1383  0.276655
tea              tea           3      1   1585   0.007780   508  0.101620
product      product           7      1   1492   0.007324  1018  0.203641
great          great           5      1   1463   0.007181  1198  0.239648
really        really           6      1   1301   0.006386   985  0.197039
make            make           4      1   1277   0.006268  1004  0.200840
love            love           4      1   1190   0.005841   945  0.189038
little        little    

In [15]:
do_it(file_base="ISKON_IMB767-XLS-ENG", id_col="ID", text_col="text")

Number of documents: 4637
Number of word forms (terms): 14332
Number of words (tokens): 91674
Mean words per document: 19.8
Mean term occurance: 6.4
Number (Pct) of terms occuring <= 1: 8957 (62.5)
Number (Pct) of terms occuring <= 5: 12367 (86.3)
Number (Pct) of terms occuring <= 10: 13150 (91.8)
Number (Pct) of terms occuring <= 100: 14201 (99.1)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
                    Phrase  Characters  Terms  Count  Count Pct  Docs  \
temple              temple           6      1   4692   0.050000  2588   
place                place           5      1   2751   0.029316  2032   
krishna            krishna           7      1   1884   0.020077  1197   
visit                visit           5      1   1725   0.018382  1434   
hare                  hare           4      1   1350   0.014386   459   
good                  good           4      1   1208   0.012873  1009   
bangalore        bangalore           9      1   1024   0.010912   886   
iskcon              iskcon           6      1    824   0.008781   670   
nice                  nice           4      1    800   0.008525   710   
beautiful        beautiful           9      1    779   0.008301   708   
food                  food           4      1    768   0.008184   650   
time                  time           4      1    709   0.007555   589   
lord                  lord           

In [16]:
do_it(file_base="reviews_Grocery_and_Gourmet_Food_5_50000", id_col="reviewID", text_col="reviewText")

Number of documents: 49994
Number of word forms (terms): 152091
Number of words (tokens): 2257681
Mean words per document: 45.2
Mean term occurance: 14.8
Number (Pct) of terms occuring <= 1: 101591 (66.8)
Number (Pct) of terms occuring <= 5: 131962 (86.8)
Number (Pct) of terms occuring <= 10: 138894 (91.3)
Number (Pct) of terms occuring <= 100: 149196 (98.1)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
              Phrase  Characters  Terms  Count  Count Pct   Docs  Docs Pct
like            like           4      1  32445   0.013332  20044  0.400928
taste          taste           5      1  29344   0.012058  19212  0.384286
good            good           4      1  24510   0.010072  17643  0.352902
flavor        flavor           6      1  23837   0.009795  15379  0.307617
just            just           4      1  18716   0.007691  13825  0.276533
coffee        coffee           6      1  18671   0.007672   7024  0.140497
tea              tea           3      1  15787   0.006487   4776  0.095531
great          great           5      1  15199   0.006246  12132  0.242669
product      product           7      1  14536   0.005973   9903  0.198084
make            make           4      1  12771   0.005248   9976  0.199544
love            love           4      1  12119   0.004980   9765  0.195323
really        really           6      1  11950   0.004911   9279  0.185602
use        

In [17]:
do_it(file_base="vaers2", id_col="VAERS_ID", text_col="SYMPTOM_TEXT", target_cols=['DIED'])

  if (await self.run_code(code, result,  async_=asy)):


Number of documents: 75897
Number of word forms (terms): 139306
Number of words (tokens): 2653604
Mean words per document: 35.0
Mean term occurance: 19.0
Number (Pct) of terms occuring <= 1: 85157 (61.1)
Number (Pct) of terms occuring <= 5: 119316 (85.7)
Number (Pct) of terms occuring <= 10: 127157 (91.3)
Number (Pct) of terms occuring <= 100: 136693 (98.1)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
                  Phrase  Characters  Terms  Count  Count Pct   Docs  Docs Pct
pt                    pt           2      1  39184   0.009299  16767  0.220918
vaccine          vaccine           7      1  38370   0.009106  23539  0.310144
patient          patient           7      1  37727   0.008954  14754  0.194395
received        received           8      1  36182   0.008587  23416  0.308523
arm                  arm           3      1  25812   0.006126  17464  0.230101
information  information          11      1  24000   0.005696  12506  0.164776
day                  day           3      1  23232   0.005514  17099  0.225292
reported        reported           8      1  20709   0.004915  12869  0.169559
left                left           4      1  20622   0.004894  15154  0.199665
injection      injection           9      1  20318   0.004822  14333  0.188848
2                      2           1      1  20087   0.004767  14837  0.195489
site                site           4  

In [18]:
do_it(file_base="Hillary_Emails", id_col="Id", text_col="ExtractedBodyText")

  markup
  markup


Number of documents: 6742
Number of word forms (terms): 52659
Number of words (tokens): 334681
Mean words per document: 49.6
Mean term occurance: 6.4
Number (Pct) of terms occuring <= 1: 30458 (57.8)
Number (Pct) of terms occuring <= 5: 44698 (84.9)
Number (Pct) of terms occuring <= 10: 47886 (90.9)
Number (Pct) of terms occuring <= 100: 52154 (99.0)


  ' Beautiful Soup.' % self._decode_markup(markup)


Top 25 words:
                Phrase  Characters  Terms  Count  Count Pct  Docs  Docs Pct
state            state           5      1   2272   0.007100   628  0.093147
pm                  pm           2      1   2119   0.006622   713  0.105755
secretary    secretary           9      1   1387   0.004334   483  0.071640
1                    1           1      1   1318   0.004119   525  0.077870
u                    u           1      1   1254   0.003919   456  0.067636
obama            obama           5      1   1244   0.003887   208  0.030851
said              said           4      1   1233   0.003853   336  0.049837
time              time           4      1   1228   0.003837   590  0.087511
president    president           9      1   1149   0.003590   347  0.051468
office          office           6      1   1140   0.003562   335  0.049689
new                new           3      1   1096   0.003425   420  0.062296
w                    w           1      1    971   0.003034   474  0.07030