In [27]:
from sklearn.feature_extraction.text import CountVectorizer
import os

os.chdir('/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/wiki-IDF')

import numpy as np
import pandas as pd
import nltk
#nltk.download('punkt')
#nltk.download('maxent_treebank_pos_tagger')
#nltk.download('averaged_perceptron_tagger')

stemmer = nltk.stem.snowball.EnglishStemmer()

wikiPath = './wiki-i15-30k300-test'


## WHERE I GOT STUFF

#### the custom function
http://slendermeans.org/ml4h-ch4.html

#### the documentation for CountVectorizer
http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html


In [28]:
def sklearn_tdm_df(docs, **kwargs):
    '''
    Create a term-document matrix (TDM) in the form of a pandas DataFrame
    Uses sklearn's CountVectorizer function.

    Parameters
    ----------
    docs: a sequence of documents (files, filenames, or the content) to be
        included in the TDM. See the `input` argument to CountVectorizer.
    **kwargs: keyword arguments for CountVectorizer options.

    Returns
    -------
    tdm_df: A pandas DataFrame with the term-document matrix. Columns are terms,
        rows are documents.
    '''
    # Initialize the vectorizer and get term counts in each document.
    vectorizer = CountVectorizer(**kwargs)
    word_counts = vectorizer.fit_transform(docs)

    # .vocabulary_ is a Dict whose keys are the terms in the documents,
    # and whose entries are the columns in the matrix returned by fit_transform()
    vocab = vectorizer.vocabulary_

    # Make a dictionary of Series for each term; convert to DataFrame
    count_dict = {w: pd.Series(word_counts.getcol(vocab[w]).data) for w in vocab}
    tdm_df = pd.DataFrame(count_dict).fillna(0)
    #return tdm_df
    return count_dict

In [29]:
# Call the function on e-mail messages. The token_pattern is set so that terms are only
# words with two or more letters (no numbers or punctuation)

# message_tdm = sklearn_tdm_df(train_df['message'],
#                             stop_words = 'english',
#                             charset_error = 'ignore',
#                             token_pattern = '[a-zA-Z]{2,}')

In [30]:
wikiFiles = os.listdir('./'+wikiPath)
print(wikiFiles[:5])

['-Weird_Al-_Yankovic.txt', '.DS_Store', '2004_Indian_Ocean_earthquake.txt', '2005_Atlantic_hurricane_season.txt', 'Abkhazia.txt']


## the non-tokenized version

In [31]:
#VEC = CountVectorizer(input='filename')
#os.chdir('/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/wiki-IDF/wiki-i15-30k300-test')
#tdm = VEC.fit_transform(wikiFiles)
#tdm.shape

## THE REAL THING

In [32]:
def tokeNstem(files):
    import re
    import string
    import time
    #
    print('%%%%%%\nRUNNING tokeNstem\n%%%%%%')
    punctuation = set(string.punctuation)
    start=time.time()
    tokens = {}
    count = 0
    totalFiles = len(files)
    #
    for fileName in files:
        # increment count for updates
        count += 1
        if (count % 200 == 0):
            print('$$$$ FINISHED ' + str(count) + ' of ' + str(totalFiles) + ' docs in ' + str(time.time()-start) + ' seconds')
        #Extract raw text and update for encoding issues            
        rawData=unicode(open(fileName).read(), "utf-8", errors="ignore")
        textList=nltk.word_tokenize(rawData)
        tokenList=[]
        for token in textList:
            try:
                tokenList.append(str(token))
            except:
                tokenList.append('**CODEC_ERROR**')
        
        #Convert all text to lower case
        textList=[word.lower() for word in tokenList]

        #Remove punctuation
        punctuation = set(string.punctuation)
        textList=[word for word in textList if word not in punctuation]
        textList=["".join(c for c in word if c not in punctuation) for word in textList ]

        #convert digits into NUM
        textList=[re.sub("\d+", "NUM", word) for word in textList]  

        #Stem words
        textList=[stemmer.stem(word) for word in textList]

        #Remove blanks
        textList=[word for word in textList if word!= ' ']
            
        #Extract tokens
        tokens[fileName]=textList
    #
    end=time.time()
    print('*** finished with ' + str(len(tokens.keys())) + ' documents in ' + str(end-start) + ' seconds')
    #
    return tokens

In [41]:
os.chdir('/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/wiki-IDF/wiki-i15-30k300-test')
testTokens = tokeNstem(wikiFiles)

$$$$ FINISHED 200 of 836 docs in 2.34880590439 seconds
$$$$ FINISHED 400 of 836 docs in 4.67841696739 seconds
$$$$ FINISHED 600 of 836 docs in 6.8709859848 seconds
$$$$ FINISHED 800 of 836 docs in 9.11950707436 seconds
*** finished with 836 documents in 9.83084511757 seconds


In [42]:
#testTokens['Bulgaria.txt']

#### got this below from
http://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer/38986703

In [43]:
#VEC = CountVectorizer(input='filename')

VEC = CountVectorizer(
      # so we can pass it strings
      input='content',
      # turn off preprocessing of strings to avoid corrupting our keys
      lowercase=False,
      preprocessor=lambda x: x,
      # use our token dictionary
      tokenizer=lambda key: testTokens[key])



In [44]:
#tdm = VEC.fit_transform(wikiFiles)
tdm = VEC.fit_transform(testTokens.keys())

In [45]:
tdm.shape

(836, 18148)

In [46]:
vocab = VEC.vocabulary_
# Make a dictionary of Series for each term;
count_dict = {w: pd.Series(tdm.getcol(vocab[w]).data) for w in vocab}
count_dict.keys()

['',
 u'ioann',
 u'starokonstantinov',
 u'fawk',
 u'\x00\x00\x00\x00\x00\x00\x01',
 u'canalnum',
 u'mps',
 u'foun',
 u'yellow',
 u'interchang',
 u'four',
 u'fortythird',
 u'realmnum',
 u'thirst',
 u'seifer',
 u'francesca',
 u'melodrama',
 u'cyprus',
 u'repetit',
 u'skillnum',
 u'lenca',
 u'nonitalian',
 u'accret',
 u'sunlik',
 u'lorn',
 u'discnum',
 u'crossbar',
 u'lord',
 u'incnum',
 u'kwashiorkor',
 u'olentangi',
 u'digit',
 u'kemet',
 u'saskatchewan',
 u'trojan',
 u'figh',
 u'bratislava',
 u'gravitinum',
 u'lumin',
 u'delv',
 u'fur',
 u'ironi',
 u'elvi',
 u'disturb',
 u'thannum',
 u'prize',
 u'fevernum',
 u'wooden',
 u'carthag',
 u'predecessornum',
 u'wednesday',
 u'elementari',
 u'solid',
 u'penguinnum',
 u'sakartvelo',
 u'ninetyfoot',
 u'wwwmillenniumaznummcaccounthtm',
 u'hussit',
 u'brantford',
 u'religiouslymotiv',
 u'charter',
 u'mission',
 u'sturm',
 u'thrace',
 u'popularis',
 u'dramasom',
 u'miller',
 u'bacon',
 u'shorthaul',
 u'histor',
 u'second',
 u'summer',
 u'politicale

In [50]:
#####
def makeFreqDict(count_dict):
    import string
    import time
    #
    print('%%%%%%\nRUNNING makeFreqDict\n%%%%%%')
    start=time.time()
    freqDict = {}
    count = 0
    totalTerms = len(count_dict.keys())
    #
    for key in count_dict.keys():
        # increment count for updates
        count += 1
        if (count % 1000 == 0):
            print('$$$$ FINISHED ' + str(count) + ' of ' + str(totalTerms) + ' docs in ' + str(time.time()-start) + ' seconds')
        freqDict[key] = len(count_dict[key])
    #
    end=time.time()
    print('*** finished with ' + str(totalTerms) + ' terms in ' + str(end-start) + ' seconds')
    #
    return freqDict

freqDict = makeFreqDict(count_dict)

## THE OLD THING
#freqDict = {}
#for key in count_dict.keys():
#    freqDict[key] = len(count_dict[key])
#
#freqDict ## I double checked this by searching a few keys in the finder and it looked good

%%%%%%
RUNNING makeFreqDict
%%%%%%
$$$$ FINISHED 1000 of 18148 docs in 0.00609612464905 seconds
$$$$ FINISHED 2000 of 18148 docs in 0.0132689476013 seconds
$$$$ FINISHED 3000 of 18148 docs in 0.0187590122223 seconds
$$$$ FINISHED 4000 of 18148 docs in 0.0247349739075 seconds
$$$$ FINISHED 5000 of 18148 docs in 0.0304040908813 seconds
$$$$ FINISHED 6000 of 18148 docs in 0.0353710651398 seconds
$$$$ FINISHED 7000 of 18148 docs in 0.0410079956055 seconds
$$$$ FINISHED 8000 of 18148 docs in 0.0462470054626 seconds
$$$$ FINISHED 9000 of 18148 docs in 0.0522980690002 seconds
$$$$ FINISHED 10000 of 18148 docs in 0.0581500530243 seconds
$$$$ FINISHED 11000 of 18148 docs in 0.0644819736481 seconds
$$$$ FINISHED 12000 of 18148 docs in 0.0715520381927 seconds
$$$$ FINISHED 13000 of 18148 docs in 0.0765740871429 seconds
$$$$ FINISHED 14000 of 18148 docs in 0.0815050601959 seconds
$$$$ FINISHED 15000 of 18148 docs in 0.0866451263428 seconds
$$$$ FINISHED 16000 of 18148 docs in 0.0906760692596 secon

In [14]:
countDF = pd.DataFrame(freqDict.items(), columns=['term', 'freq'])
countDF = countDF.set_index('term')
countDF.head()

Unnamed: 0_level_0,freq
term,Unnamed: 1_level_1
ioann,1
conapo,1
fawk,1
longer-run,1
pendulum,1


In [15]:
print(countDF[countDF['freq'] > 1].shape)
DF = countDF[countDF['freq'] > 1]

(6325, 1)


In [16]:
DF.shape

(6325, 1)

In [17]:
M = len(wikiFiles)
DF['idf'] = M / DF['freq']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [18]:
import math

#### use this approach and hard code the log base is you want something other than e
def getLog(num):
    return math.log(num, 2) # put in log base here (where the 2 is)

#countDF['logidf'] = countDF['idf'].apply(getLog) 

#### or just use this if you just want natural log
DF['logidf'] = DF['idf'].apply(math.log) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
DF.sort_values(by='idf').head(20)
#idf.sort_values(by='idf', ascending=False).head(10)

Unnamed: 0_level_0,freq,idf,logidf
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
of,834,1.0,0.0
",",834,1.0,0.0
linknum:num,834,1.0,0.0
the,834,1.0,0.0
.,834,1.0,0.0
:,834,1.0,0.0
and,833,1.0012,0.0012
in,831,1.00361,0.003604
a,815,1.023313,0.023045
to,804,1.037313,0.036634


In [21]:
DF.sort_values(by='idf', ascending=False).head(20)

Unnamed: 0_level_0,freq,idf,logidf
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reliabl,2,417.0,6.033086
bactria,2,417.0,6.033086
katrina,2,417.0,6.033086
fourth-most,2,417.0,6.033086
fiji,2,417.0,6.033086
methan,2,417.0,6.033086
stalinist,2,417.0,6.033086
vladimir,2,417.0,6.033086
multi-lay,2,417.0,6.033086
robbin,2,417.0,6.033086


In [23]:
DF.to_csv('/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/wiki-IDF/wiki-IDF-test.csv', encoding='utf-8')

In [27]:
wps = wikiPath.split('-')
wps[len(wps)-1]

'test'

In [30]:
DF.shape[0]

6325