In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import  CountVectorizer
import re
import numpy as np
import scipy.stats as stats
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Simple preprocessor
def removeURLs(x):
    # Remove URLs
    x = re.sub(r'http\S+', '', x)
    
    #Lowercase
    x = x.lower()
    return x

def find_counts(data, text_col = "tweet_text", min_df=1, ngrams=(1,1)):   
    
    # We override the token_pattern in order to keep @signs and #hashtags
    vec = CountVectorizer(      preprocessor=removeURLs,
                                token_pattern = '[a-zA-Z0-9@#]+',
                                stop_words='english',
                                lowercase=True,
                                min_df=min_df,
                                ngram_range=ngrams,
                                max_features=50000)
    
    bow = vec.fit_transform(data)
    vocab = vec.get_feature_names()
    tdm = pd.DataFrame(bow.toarray(), columns=vocab)
        
    
    n_tokens = sum(tdm.sum())
    n_docs = tdm.shape[0]
    phrases = list(tdm.columns)
    counts = pd.DataFrame(data={'Phrase': phrases, 
                                'Characters': [len(x) for x in phrases],
                                'Terms': [x.count(' ')+1 for x in phrases],
                                'isStopword': [x in stop_words for x in phrases],
                                'Stem': [stemmer.stem(x) for x in phrases],
                                'Count': tdm.sum(),
                                'Count Pct': tdm.sum() / n_tokens,
                                'Docs': tdm.astype(bool).sum(),
                                'Docs Pct': tdm.astype(bool).sum() / n_docs,
                          })
    
    counts = counts.sort_values(by=['Count'], ascending=False)
    
    print('Number of documents: {}'.format(tdm.shape[0]))
    print('Number of word forms (terms): {}'.format(tdm.shape[1]))
    print('Number of words (tokens): {}'.format(sum(counts['Count'])))
    print('Mean words per document: {:.1f}'.format(sum(counts['Count']) / tdm.shape[0]))
    print('Mean term occurance: {:.1f}'.format(np.mean(counts['Count'])))
    print('Number of terms occuring once: {}'.format(sum(counts['Count']==1)))
    print('Number of terms occuring <= 5 times: {}'.format(sum(counts['Count']<=5)))
    print('Number of terms occuring <= 10 times: {}'.format(sum(counts['Count']<=10)))
    print('Top {} words:'.format(num_words_to_print))
    print(counts.head(num_words_to_print))
    print('\nBottom {} words:'.format(num_words_to_print))
    print(counts.tail(num_words_to_print))
    
    
    return tdm, vocab, counts 


In [3]:
num_words_to_print=25

def do_it(file_base, id_col="id", text_col="tweet_text", target_cols=[]):
    df = pd.read_csv('data/'+file_base+'.csv')
    
    # Drop rows without any text
    df = df.dropna(subset=[text_col])
    
    print("\nUnigrams")
    tdm, vocab, counts = find_counts(df[text_col])
    tdm.to_csv('out/'+file_base+'_unigram_tdm.csv', index=False)
    counts.to_csv('out/'+file_base+'_unigram_counts.csv', index=False)
    
    print("\nTrigrams")
    tdm, vocab, counts = find_counts(df[text_col], min_df=10, ngrams=(1,3))
    tdm.to_csv('out/'+file_base+'_ngram_tdm.csv', index=False)
    counts.to_csv('out/'+file_base+'_ngram_counts.csv', index=False)
    
    # Now, lets compute the counts and percentages per target level
    for target_col in target_cols:
        tdm_tmp = tdm
        col_name = 'target_'+target_col
        tdm_tmp[col_name] = df[target_col]
        s = tdm_tmp.groupby(by=[col_name]).sum().T
        n_levels=s.shape[1]
        s['Total'] = s.sum(axis=1)
        for i in range(0, n_levels):
            new_col_name = str(s.columns[i]) + ' Pct'
            s[new_col_name] = s.iloc[:,i] / s['Total']
        #for i in range(0, n_levels):
        #    new_col_name = str(s.columns[i]) + ' Pct'
        #    delta = 0.0001 # To avoid log2 errors
        #    s[str(s.columns[i]) + ' Odds'] = np.log2((delta +s[new_col_name])/ (delta + (1-s[new_col_name])))
        s.to_csv('out/'+file_base+'_counts_'+target_col+'.csv', index=True)
                

In [4]:
do_it(file_base="amazon_food_reviews_10", id_col="reviewID", text_col="reviewText", target_cols=['overall'])


Unigrams
Number of documents: 4999
Number of word forms (terms): 14621
Number of words (tokens): 221753
Mean words per document: 44.4
Mean term occurance: 15.2
Number of terms occuring once: 6524
Number of terms occuring <= 5 times: 10764
Number of terms occuring <= 10 times: 12067
Top 25 words:
              Phrase  Characters  Terms  isStopword     Stem  Count  \
t                  t           1      1        True        t   3207   
s                  s           1      1        True        s   3199   
like            like           4      1       False     like   3057   
good            good           4      1       False     good   2441   
taste          taste           5      1       False     tast   2326   
flavor        flavor           6      1       False   flavor   2085   
coffee        coffee           6      1       False    coffe   1913   
just            just           4      1        True     just   1854   
tea              tea           3      1       False      tea   

In [5]:
do_it(file_base="kiva_cleaned", id_col="loan_id", text_col="en_clean", target_cols=['status','country', 'gender', 'nonpayment'])


Unigrams
Number of documents: 6802
Number of word forms (terms): 17542
Number of words (tokens): 510044
Mean words per document: 75.0
Mean term occurance: 29.1
Number of terms occuring once: 6445
Number of terms occuring <= 5 times: 11889
Number of terms occuring <= 10 times: 13610
Top 25 words:
              Phrase  Characters  Terms  isStopword      Stem  Count  \
business    business           8      1       False      busi  12645   
loan            loan           4      1       False      loan  10648   
children    children           8      1       False  children   7683   
years          years           5      1       False      year   6151   
school        school           6      1       False    school   4647   
old              old           3      1       False       old   4129   
buy              buy           3      1       False       buy   4062   
able            able           4      1       False       abl   3811   
group          group           5      1       False   

In [6]:
do_it(file_base="obama_tweets", id_col="id", text_col="tweet_text")


Unigrams
Number of documents: 3230
Number of word forms (terms): 4145
Number of words (tokens): 29708
Mean words per document: 9.2
Mean term occurance: 7.2
Number of terms occuring once: 2001
Number of terms occuring <= 5 times: 3342
Number of terms occuring <= 10 times: 3675
Top 25 words:
                      Phrase  Characters  Terms  isStopword        Stem  \
president          president           9      1       False      presid   
obama                  obama           5      1       False       obama   
s                          s           1      1        True           s   
rt                        rt           2      1       False          rt   
#actonclimate  #actonclimate          13      1       False  #actonclim   
change                change           6      1       False       chang   
t                          t           1      1        True           t   
watch                  watch           5      1       False       watch   
climate              climate     

In [7]:
do_it(file_base="elonmusk_tweets", id_col="id", text_col="text")


Unigrams
Number of documents: 2819
Number of word forms (terms): 7217
Number of words (tokens): 26120
Mean words per document: 9.3
Mean term occurance: 3.6
Number of terms occuring once: 4148
Number of terms occuring <= 5 times: 6326
Number of terms occuring <= 10 times: 6819
Top 25 words:
                    Phrase  Characters  Terms  isStopword         Stem  Count  \
rt                      rt           2      1       False           rt    492   
s                        s           1      1        True            s    419   
tesla                tesla           5      1       False        tesla    335   
model                model           5      1       False        model    244   
t                        t           1      1        True            t    188   
amp                    amp           3      1       False          amp    185   
@spacex            @spacex           7      1       False      @spacex    172   
just                  just           4      1        True   

In [8]:
do_it(file_base="2017_trump_tweets", id_col="id", text_col="tweet")


Unigrams
Number of documents: 30385
Number of word forms (terms): 31791
Number of words (tokens): 288450
Mean words per document: 9.5
Mean term occurance: 9.1
Number of terms occuring once: 18656
Number of terms occuring <= 5 times: 26539
Number of terms occuring <= 10 times: 28438
Top 25 words:
                            Phrase  Characters  Terms  isStopword  \
@realdonaldtrump  @realdonaldtrump          16      1       False   
s                                s           1      1        True   
trump                        trump           5      1       False   
great                        great           5      1       False   
t                                t           1      1        True   
thanks                      thanks           6      1       False   
thank                        thank           5      1       False   
president                president           9      1       False   
donald                      donald           6      1       False   
just        

In [9]:
do_it(file_base="pence_tweets", id_col="id", text_col="tweet_text")


Unigrams
Number of documents: 3339
Number of word forms (terms): 6988
Number of words (tokens): 40603
Mean words per document: 12.2
Mean term occurance: 5.8
Number of terms occuring once: 3316
Number of terms occuring <= 5 times: 5661
Number of terms occuring <= 10 times: 6285
Top 25 words:
                    Phrase  Characters  Terms  isStopword          Stem  \
rt                      rt           2      1       False            rt   
amp                    amp           3      1       False           amp   
s                        s           1      1        True             s   
@indiana          @indiana           8      1       False      @indiana   
indiana            indiana           7      1       False       indiana   
edc                    edc           3      1       False           edc   
today                today           5      1       False         today   
@firstladyin  @firstladyin          12      1       False  @firstladyin   
state                state      

In [10]:
do_it(file_base="reutersCSV", id_col="pid", text_col="doc.text")


Unigrams
Number of documents: 19779
Number of word forms (terms): 43819
Number of words (tokens): 1754721
Mean words per document: 88.7
Mean term occurance: 40.0
Number of terms occuring once: 16029
Number of terms occuring <= 5 times: 29333
Number of terms occuring <= 10 times: 33429
Top 25 words:
          Phrase  Characters  Terms  isStopword     Stem  Count  Count Pct  \
said        said           4      1       False     said  53019   0.030215   
s              s           1      1        True        s  30166   0.017191   
mln          mln           3      1       False      mln  25716   0.014655   
dlrs        dlrs           4      1       False      dlr  20760   0.011831   
reuter    reuter           6      1       False   reuter  19139   0.010907   
pct          pct           3      1       False      pct  17212   0.009809   
1              1           1      1       False        1  16849   0.009602   
vs            vs           2      1       False       vs  14737   0.008398 

In [11]:
do_it(file_base="JoeBidenTweets", id_col="id", text_col="tweet")


Unigrams
Number of documents: 4694
Number of word forms (terms): 9712
Number of words (tokens): 79351
Mean words per document: 16.9
Mean term occurance: 8.2
Number of terms occuring once: 5180
Number of terms occuring <= 5 times: 7783
Number of terms occuring <= 10 times: 8531
Top 25 words:
              Phrase  Characters  Terms  isStopword      Stem  Count  \
s                  s           1      1        True         s   2211   
twitter      twitter           7      1       False   twitter   1423   
com              com           3      1       False       com   1419   
pic              pic           3      1       False       pic   1261   
president  president           9      1       False    presid   1099   
trump          trump           5      1       False     trump    993   
@                  @           1      1       False         @    813   
#                  #           1      1       False         #    777   
t                  t           1      1        True        

In [12]:
do_it(file_base="amazon_food_reviews_10", id_col="reviewID", text_col="reviewText", target_cols=['overall'])


Unigrams
Number of documents: 4999
Number of word forms (terms): 14621
Number of words (tokens): 221753
Mean words per document: 44.4
Mean term occurance: 15.2
Number of terms occuring once: 6524
Number of terms occuring <= 5 times: 10764
Number of terms occuring <= 10 times: 12067
Top 25 words:
              Phrase  Characters  Terms  isStopword     Stem  Count  \
t                  t           1      1        True        t   3207   
s                  s           1      1        True        s   3199   
like            like           4      1       False     like   3057   
good            good           4      1       False     good   2441   
taste          taste           5      1       False     tast   2326   
flavor        flavor           6      1       False   flavor   2085   
coffee        coffee           6      1       False    coffe   1913   
just            just           4      1        True     just   1854   
tea              tea           3      1       False      tea   

In [13]:
do_it(file_base="ISKON_IMB767-XLS-ENG", id_col="ID", text_col="text")


Unigrams
Number of documents: 4637
Number of word forms (terms): 8433
Number of words (tokens): 89624
Mean words per document: 19.3
Mean term occurance: 10.6
Number of terms occuring once: 4194
Number of terms occuring <= 5 times: 6636
Number of terms occuring <= 10 times: 7296
Top 25 words:
                Phrase  Characters  Terms  isStopword      Stem  Count  \
temple          temple           6      1       False     templ   4410   
place            place           5      1       False     place   2636   
krishna        krishna           7      1       False   krishna   1884   
visit            visit           5      1       False     visit   1711   
hare              hare           4      1       False      hare   1350   
good              good           4      1       False      good   1195   
bangalore    bangalore           9      1       False  bangalor   1024   
iskcon          iskcon           6      1       False    iskcon    822   
nice              nice           4      

In [14]:
do_it(file_base="imdb.small", id_col="id", text_col="en", target_cols=['rating', 'score'])


Unigrams
Number of documents: 10000
Number of word forms (terms): 50000
Number of words (tokens): 1164195
Mean words per document: 116.4
Mean term occurance: 23.3
Number of terms occuring once: 17580
Number of terms occuring <= 5 times: 33642
Number of terms occuring <= 10 times: 39047
Top 25 words:
                Phrase  Characters  Terms  isStopword     Stem  Count  \
br                  br           2      1       False       br  40210   
s                    s           1      1        True        s  25425   
movie            movie           5      1       False     movi  17448   
film              film           4      1       False     film  16063   
t                    t           1      1        True        t  13355   
like              like           4      1       False     like   7993   
just              just           4      1        True     just   6862   
good              good           4      1       False     good   5925   
time              time           4      1

In [15]:
do_it(file_base="reviews_Grocery_and_Gourmet_Food_5_50000", id_col="reviewID", text_col="reviewText")


Unigrams
Number of documents: 49994
Number of word forms (terms): 44433
Number of words (tokens): nan
Mean words per document: nan
Mean term occurance: nan
Number of terms occuring once: 0
Number of terms occuring <= 5 times: 0
Number of terms occuring <= 10 times: 0
Top 25 words:
          Phrase  Characters  Terms  isStopword     Stem  Count  Count Pct  \
#              #           1      1       False        #    NaN        NaN   
#1            #1           2      1       False       #1    NaN        NaN   
#10          #10           3      1       False      #10    NaN        NaN   
#10003    #10003           6      1       False   #10003    NaN        NaN   
#10084    #10084           6      1       False   #10084    NaN        NaN   
#1076      #1076           5      1       False    #1076    NaN        NaN   
#1082      #1082           5      1       False    #1082    NaN        NaN   
#1086      #1086           5      1       False    #1086    NaN        NaN   
#1087      #108

In [16]:
do_it(file_base="vaers2", id_col="VAERS_ID", text_col="SYMPTOM_TEXT", target_cols=['DIED'])

  if (await self.run_code(code, result,  async_=asy)):



Unigrams


MemoryError: Unable to allocate 27.7 GiB for an array with shape (3720774528,) and data type int64

In [None]:
do_it(file_base="Hillary_Emails", id_col="Id", text_col="ExtractedBodyText")