# Advanced: Text Processing in Matrices

## Load Natural Language Toolkit for Parsing

In [1]:
! pip install nltk
import nltk

# Enter 'd' for Download, then 'punkt', and then 'q' for quit
nltk.download()


NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt
    Downloading package punkt to /home/jovyan/nltk_data...
      Package punkt is already up-to-date!

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

## Import text files into dictionary

As a "corpus" we fetched some data from Wikipedia, based on currently
trendy (2/18/2017) topics.  Each topic had multiple interpretations, some of which 
we suspected would "intersect" in interesting ways (e.g., Trump/Putin, Cloud/Google, 
Cloud/Climate).  Others had various interpretations (e.g., there are many types of 
Football).  See _Wikipedia.ipynb_ for the original download code.

Selected topics (for which the top-10 matches were returned by Wikipedia) were:

 * Pennsylvania
 * Trump
 * Apple
 * Google
 * Farm
 * Climate
 * Cloud
 * Football
 * Government
 * Putin

*docs* is a dictionary that maps each file name to the full text of that file.

In [4]:
import os

# Maps each file name found under text/ to that file's complete contents.
docs = {}

for filename in os.listdir('text'):
    # Jupyter's checkpoint directory lives next to the data and is not a document.
    if filename == '.ipynb_checkpoints':
        continue
    # The context manager closes each handle as soon as the file has been
    # read, instead of leaving one open descriptor per document.
    with open(os.path.join('text', filename)) as file:
        docs[filename] = file.read()
    print ('Loaded',filename)

print ("All files loaded")

Loaded Cloud computing.txt
Loaded United States farm bill.txt
Loaded Mediterranean climate.txt
Loaded Oort cloud.txt
Loaded Trump family.txt
Loaded Eric Trump.txt
Loaded Animal Farm.txt
Loaded Government of Malaysia.txt
Loaded Apple I.txt
Loaded Wind farm.txt
Loaded Alpine climate.txt
Loaded Putin khuilo!.txt
Loaded Trump University.txt
Loaded Donald Trump.txt
Loaded Apple.txt
Loaded Legal affairs of Donald Trump.txt
Loaded Climate.txt
Loaded Google Account.txt
Loaded Flag football.txt
Loaded Farm.txt
Loaded History of Pennsylvania.txt
Loaded Google Talk.txt
Loaded Apple Corps.txt
Loaded Apple III.txt
Loaded Football in England.txt
Loaded Climate model.txt
Loaded Arrest of Vladimir Putin viral video.txt
Loaded Government of the United Kingdom.txt
Loaded Putin Must Go.txt
Loaded Apple Inc..txt
Loaded Calumet Farm.txt
Loaded Cumulus cloud.txt
Loaded Arcus cloud.txt
Loaded Vladimir Putin.txt
Loaded Google Developers.txt
Loaded Government in exile.txt
Loaded Province of Pennsylvania.txt
Lo

## Other preliminaries to get you started.

The function *has_letter* should be used to filter words based on the presence of a letter.

The set *stopwords* includes words to ignore.

In [37]:
import nltk
from nltk.stem.porter import *
import re
import numpy as np

"""
# Returns True if the input (string) parameter has
# any sort of letter in it, else returns False.
"""
def has_letter(x):
    """Return True if the string ``x`` contains at least one ASCII letter.

    ``re.search`` scans the whole string, so a letter is found wherever it
    occurs -- including after a newline, which a ``.*``-prefixed ``re.match``
    pattern cannot cross.

    Parameters
    ----------
    x : str
        The token to inspect.

    Returns
    -------
    bool
        True when any character of ``x`` is in a-z or A-Z, else False.
    """
    return re.search('[a-zA-Z]', x) is not None

# Stopwords are words we will ignore for search
# purposes, because they are too common to be useful
stopwords = set()

# One stopword per line; surrounding whitespace (including the newline) is
# stripped. The context manager closes the file once it has been read.
with open('stopwords.txt') as stop_file:
    for line in stop_file:
        stopwords.add(line.strip())

# The NLTK parser breaks apostrophe-s into a separate "word"
# so we'll want to add it to the list... Though it's technically
# not a stop word in the traditional sense.
stopwords.add("'s")

# Use this as the maximum number of words we will index
MAX_WORDS = 18102

# Create the word stemmer
stemmer = PorterStemmer()

# Your Code Goes Here!

Note that you may want to read more about TF*IDF scoring at:

* http://nlp.stanford.edu/IR-book/html/htmledition/term-frequency-and-weighting-1.html
* https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [38]:
#######################################################################
################         Create the doc_vector function   #############
#######################################################################

def doc_vector(content, vector, lexicon, inverse_lexicon, stopwords, word_count):
    """Accumulate stemmed term counts for one piece of text into ``vector``.

    The text is tokenized with ``nltk.word_tokenize``; each token is
    lower-cased, dropped if it has no letter or is a stopword, and then
    stemmed. Stems already in ``lexicon`` increment their existing slot.
    New stems are assigned the next free index (``word_count``) until
    ``MAX_WORDS`` indices are in use; after that, unseen stems are ignored.

    Parameters
    ----------
    content : str
        The text to index.
    vector : indexable of numbers
        Term-count vector for this text; modified in place.
    lexicon : dict
        Maps stem -> vector index; extended in place with new stems.
    inverse_lexicon : dict
        Maps vector index -> stem; extended in place alongside ``lexicon``.
    stopwords : container of str
        Lower-case tokens to ignore (checked before stemming).
    word_count : int
        Number of lexicon indices already assigned.

    Returns
    -------
    int
        The updated number of assigned lexicon indices.
    """
    for token in nltk.word_tokenize(content):
        token = token.lower()

        # Ignore tokens without any letter (punctuation, bare numbers) and
        # stopwords; the stopword test is applied to the unstemmed token.
        if not has_letter(token) or token in stopwords:
            continue

        stem = stemmer.stem(token)

        # Known stem: bump its existing slot.
        if stem in lexicon:
            vector[lexicon[stem]] += 1
            continue

        # Vocabulary is full: unseen stems are not indexed.
        if word_count >= MAX_WORDS:
            continue

        # New stem: register it in both directions. The inverse mapping is
        # written to the dictionary passed in as a parameter, not to a
        # notebook-level global.
        lexicon[stem] = word_count
        inverse_lexicon[word_count] = stem
        vector[word_count] += 1
        word_count += 1

    return word_count

In [39]:
#######################################################################
################           Call the doc_vector function   #############
#######################################################################


# One row per document, one column per lexicon index; cells hold raw term counts.
matrix = np.zeros((len(docs),MAX_WORDS))

# lexicon: stem -> column index; inv_lexicon: column index -> stem.
lexicon = {}
inv_lexicon = {}

word_count = 0
doc_index = 0

for doc in docs:
    # matrix[doc_index,:] is a view, so doc_vector fills the row in place.
    word_count = doc_vector(docs[doc], matrix[doc_index,:], lexicon, inv_lexicon, stopwords, word_count)
    doc_index += 1

# Document frequency: in how many documents each column is non-zero.
doc_freq = np.count_nonzero(matrix,axis=0)

# idf = log10(N / df). Columns that no document populated (df == 0, which
# happens when fewer than MAX_WORDS stems were seen) keep an idf of 0
# instead of dividing by zero and producing inf.
idf = np.zeros(MAX_WORDS)
observed = doc_freq > 0
idf[observed] = np.log10(len(docs)/doc_freq[observed])
idf

array([ 0.6946052 ,  0.61542395,  0.3723859 , ...,  1.99563519,
        1.99563519,  1.99563519])

In [40]:
def create_query_vector(query):
    """Turn a free-text query into a term-count vector over the corpus lexicon.

    The query goes through the same tokenize / filter / stem pipeline as the
    documents (``doc_vector``), using the notebook-level ``lexicon``,
    ``inv_lexicon`` and ``stopwords``.

    Parameters
    ----------
    query : str
        The search text.

    Returns
    -------
    numpy.ndarray
        Vector of length ``MAX_WORDS`` holding the count of each known stem.
    """
    return_vector = np.zeros(MAX_WORDS)
    # Passing MAX_WORDS as the word count tells doc_vector that no free
    # index remains, so query words absent from the corpus are skipped
    # rather than being added to the shared lexicon.
    doc_vector(query, return_vector, lexicon, inv_lexicon, stopwords, MAX_WORDS)
    return return_vector

In [57]:
import csv
import pandas as pd
import numpy as np
import re
from numpy.linalg import norm



#doc_name = []
#for doc in docs:
#    doc_name.append(doc)
    
    
def search(vectors, idf, query, num_results, doc_names=None):
    """Rank documents by TF*IDF cosine similarity to a query vector.

    Every document row and the query are weighted element-wise by ``idf``;
    the score is the cosine of the angle between the weighted vectors.
    A document (or query) whose weighted vector has zero length scores 0.0
    rather than NaN.

    Parameters
    ----------
    vectors : numpy.ndarray
        2-D term-count matrix, one row per document.
    idf : numpy.ndarray
        1-D inverse-document-frequency weights, one per column of ``vectors``.
    query : numpy.ndarray
        1-D term-count vector for the query, same length as ``idf``.
    num_results : int
        Maximum number of rows to return.
    doc_names : sequence of str, optional
        Name of the document in each row of ``vectors``. Defaults to the keys
        of the notebook-level ``docs`` dictionary, in iteration order.

    Returns
    -------
    pandas.DataFrame
        Columns ``docid`` (integer row number in ``vectors``), ``docname`` and
        ``score``, sorted by descending score and limited to ``num_results``
        rows. The index is the row number as well.
    """
    if doc_names is None:
        doc_names = list(docs.keys())

    num_docs = vectors.shape[0]

    # Weight everything once; the weighted query does not depend on the row.
    weighted_docs = vectors * idf
    weighted_query = query * idf

    dot_products = np.dot(weighted_docs, weighted_query)
    denominators = norm(weighted_docs, axis=1) * norm(weighted_query)

    # Only divide where both vectors have non-zero length; others stay at 0.
    scores = np.zeros(num_docs)
    comparable = denominators > 0
    scores[comparable] = dot_products[comparable] / denominators[comparable]

    # Build the frame in one go instead of growing it row by row.
    final_df = pd.DataFrame(
        {
            'docid': np.arange(num_docs),
            'docname': list(doc_names)[:num_docs],
            'score': scores,
        },
        columns=['docid', 'docname', 'score'],
    )

    # Stable sort keeps tied documents in their original row order.
    final_df = final_df.sort_values(by='score', ascending=False, kind='mergesort')
    return final_df[:num_results]





"""
        new_matrix = np.multiply(vectors,idf)
        new_query = np.multiply(query,idf)
        result = np.dot(new_matrix,new_query)/(norm(new_matrix)*norm(new_query))

        result_df = pd.DataFrame(result).reset_index()
        result_df.columns = ["docid", "score"]
        names_df = pd.DataFrame(doc_name).reset_index()
        names_df.columns = ["docid", "docname"]

        final_df = result_df.merge(names_df, left_on=['docid'], right_on=['docid'])
        final_df = final_df[['docid','docname','score']]

        final_df = final_df.sort_values(by='score', ascending=False)
        final_df = final_df[:num_results]
"""

'\n        new_matrix = np.multiply(vectors,idf)\n        new_query = np.multiply(query,idf)\n        result = np.dot(new_matrix,new_query)/(norm(new_matrix)*norm(new_query))\n\n        result_df = pd.DataFrame(result).reset_index()\n        result_df.columns = ["docid", "score"]\n        names_df = pd.DataFrame(doc_name).reset_index()\n        names_df.columns = ["docid", "docname"]\n\n        final_df = result_df.merge(names_df, left_on=[\'docid\'], right_on=[\'docid\'])\n        final_df = final_df[[\'docid\',\'docname\',\'score\']]\n\n        final_df = final_df.sort_values(by=\'score\', ascending=False)\n        final_df = final_df[:num_results]\n'

## Step 5

In [58]:
# Vectorize the query text, score every row of `matrix` against it, and
# display the 10 highest-scoring documents.
search_vector = create_query_vector('Apple Steve jobs')
df = search(matrix,idf,search_vector,10)
df

Unnamed: 0,docid,docname,score
29,29.0,Apple Inc..txt,0.490086
8,8.0,Apple I.txt,0.442409
23,23.0,Apple III.txt,0.401514
87,87.0,Apple II series.txt,0.349863
88,88.0,Apple Store.txt,0.342035
14,14.0,Apple.txt,0.330999
81,81.0,Cooking apple.txt,0.305086
59,59.0,Apple TV.txt,0.300357
22,22.0,Apple Corps.txt,0.27613
45,45.0,Home Farm F.C..txt,0.019465


In [59]:
# Two-topic query: display the 5 highest-scoring documents.
# Note that `search_vector` and `df` are rebound here, replacing the
# values from the previous query cell.
search_vector = create_query_vector('Trump Putin')
df = search(matrix,idf,search_vector,5)
df

Unnamed: 0,docid,docname,score
13,13.0,Donald Trump.txt,0.661491
15,15.0,Legal affairs of Donald Trump.txt,0.636261
65,65.0,The Trump Organization.txt,0.627995
12,12.0,Trump University.txt,0.593636
66,66.0,Public image of Vladimir Putin.txt,0.591413


In [60]:
# Query combining two of the corpus topics (Google and Cloud): display the
# 5 highest-scoring documents. Rebinds `search_vector` and `df` again.
search_vector = create_query_vector('Google Cloud')
df = search(matrix,idf,search_vector,5)
df

Unnamed: 0,docid,docname,score
62,62.0,Google.txt,0.641453
34,34.0,Google Developers.txt,0.509939
17,17.0,Google Account.txt,0.504314
0,0.0,Cloud computing.txt,0.494979
21,21.0,Google Talk.txt,0.473183
