# Inverse indexing, index search, and signal page rank¶

## PART I: Preparing the documents/webpages

In [4]:
import nltk

In [5]:
nltk.download('reuters')
  

[nltk_data] Downloading package reuters to
[nltk_data]     /home/dhanendra/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [6]:
# Load libraries

import pandas as pd
import numpy as np 
import string
import random

import nltk
from nltk.corpus import brown
from nltk.corpus import reuters

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer


from nltk.corpus import reuters

In [7]:
reuters.fileids()

['test/14826',
 'test/14828',
 'test/14829',
 'test/14832',
 'test/14833',
 'test/14839',
 'test/14840',
 'test/14841',
 'test/14842',
 'test/14843',
 'test/14844',
 'test/14849',
 'test/14852',
 'test/14854',
 'test/14858',
 'test/14859',
 'test/14860',
 'test/14861',
 'test/14862',
 'test/14863',
 'test/14865',
 'test/14867',
 'test/14872',
 'test/14873',
 'test/14875',
 'test/14876',
 'test/14877',
 'test/14881',
 'test/14882',
 'test/14885',
 'test/14886',
 'test/14888',
 'test/14890',
 'test/14891',
 'test/14892',
 'test/14899',
 'test/14900',
 'test/14903',
 'test/14904',
 'test/14907',
 'test/14909',
 'test/14911',
 'test/14912',
 'test/14913',
 'test/14918',
 'test/14919',
 'test/14921',
 'test/14922',
 'test/14923',
 'test/14926',
 'test/14928',
 'test/14930',
 'test/14931',
 'test/14932',
 'test/14933',
 'test/14934',
 'test/14941',
 'test/14943',
 'test/14949',
 'test/14951',
 'test/14954',
 'test/14957',
 'test/14958',
 'test/14959',
 'test/14960',
 'test/14962',
 'test/149

In [8]:
#load 10k reuters news documents 
len(reuters.fileids())

10788

In [9]:
#view text from one document 
reuters.raw(fileids=['test/14826'])[0:201]

"ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT\n  Mounting trade friction between the\n  U.S. And Japan has raised fears among many of Asia's exporting\n  nations that the row could inflict far-reaching"

In [10]:
# remove punctuation from all DOCs 
exclude = set(string.punctuation)
alldocslist = []

for index, i in  enumerate(reuters.fileids()):
    text = reuters.raw(fileids=[i])
    text = ''.join(ch for ch in text if ch not in exclude)
    alldocslist.append(text)
    
print(alldocslist[1])

CHINA DAILY SAYS VERMIN EAT 712 PCT GRAIN STOCKS
  A survey of 19 provinces and seven cities
  showed vermin consume between seven and 12 pct of Chinas grain
  stocks the China Daily said
      It also said that each year 1575 mln tonnes or 25 pct of
  Chinas fruit output are left to rot and 21 mln tonnes or up
  to 30 pct of its vegetables The paper blamed the waste on
  inadequate storage and bad preservation methods
      It said the government had launched a national programme to
  reduce waste calling for improved technology in storage and
  preservation and greater production of additives The paper
  gave no further details
  




In [11]:
#tokenize words in all DOCS 
plot_data = [[]] * len(alldocslist)

for doc in alldocslist:
    text = doc
    tokentext = word_tokenize(text)
    plot_data[index].append(tokentext)
    
print(plot_data[0][1])

['CHINA', 'DAILY', 'SAYS', 'VERMIN', 'EAT', '712', 'PCT', 'GRAIN', 'STOCKS', 'A', 'survey', 'of', '19', 'provinces', 'and', 'seven', 'cities', 'showed', 'vermin', 'consume', 'between', 'seven', 'and', '12', 'pct', 'of', 'Chinas', 'grain', 'stocks', 'the', 'China', 'Daily', 'said', 'It', 'also', 'said', 'that', 'each', 'year', '1575', 'mln', 'tonnes', 'or', '25', 'pct', 'of', 'Chinas', 'fruit', 'output', 'are', 'left', 'to', 'rot', 'and', '21', 'mln', 'tonnes', 'or', 'up', 'to', '30', 'pct', 'of', 'its', 'vegetables', 'The', 'paper', 'blamed', 'the', 'waste', 'on', 'inadequate', 'storage', 'and', 'bad', 'preservation', 'methods', 'It', 'said', 'the', 'government', 'had', 'launched', 'a', 'national', 'programme', 'to', 'reduce', 'waste', 'calling', 'for', 'improved', 'technology', 'in', 'storage', 'and', 'preservation', 'and', 'greater', 'production', 'of', 'additives', 'The', 'paper', 'gave', 'no', 'further', 'details']


In [13]:
# Navigation: first index gives all documents, second index gives specific document, third index gives words of that doc
plot_data[0][2][0:10]

['JAPAN',
 'TO',
 'REVISE',
 'LONGTERM',
 'ENERGY',
 'DEMAND',
 'DOWNWARDS',
 'The',
 'Ministry',
 'of']

In [15]:

#make all words lower case for all docs 
for x in range(len(reuters.fileids())):
    lowers = [word.lower() for word in plot_data[0][x]]
    plot_data[0][x] = lowers

plot_data[0][2][0:10]

['japan',
 'to',
 'revise',
 'longterm',
 'energy',
 'demand',
 'downwards',
 'the',
 'ministry',
 'of']

In [16]:
 nltk.download('stopwords')
    

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dhanendra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# remove stop words from all docs 
stop_words = set(stopwords.words('english'))

for x in range(len(reuters.fileids())):
    filtered_sentence = [w for w in plot_data[0][x] if not w in stop_words]
    plot_data[0][x] = filtered_sentence

plot_data[0][1][0:10]

['china',
 'daily',
 'says',
 'vermin',
 'eat',
 '712',
 'pct',
 'grain',
 'stocks',
 'survey']

In [18]:
#stem words EXAMPLE (could try others/lemmers )

snowball_stemmer = SnowballStemmer("english")
stemmed_sentence = [snowball_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]

porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")
stemmed_sentence = [ porter_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]

['ltaha',
 'automot',
 'technolog',
 'corp',
 'year',
 'net',
 'shr',
 '43',
 'ct',
 'vs']

# PART II: CREATING THE INVERSE-INDEX

In [19]:
# Create inverse index which gives document number for each document and where word appears

#first we need to create a list of all words 
l = plot_data[0]
flatten = [item for sublist in l for item in sublist]
words = flatten
wordsunique = set(words)
wordsunique = list(wordsunique)

In [20]:
# create functions for TD-IDF / BM25
import math
from textblob import TextBlob as tb

def tf(word, doc):
    return doc.count(word) / len(doc)

def n_containing(word, doclist):
    return sum(1 for doc in doclist if word in doc)

def idf(word, doclist):
    return math.log(len(doclist) / (0.01 + n_containing(word, doclist)))

def tfidf(word, doc, doclist):
    return (tf(word, doc) * idf(word, doclist))

In [21]:
# Create dictonary of words
# THIS ONE-TIME INDEXING IS THE MOST PROCESSOR-INTENSIVE STEP AND WILL TAKE TIME TO RUN (BUT ONLY NEEDS TO BE RUN ONCE)
import re
import numpy as np

plottest = plot_data[0][0:1000]

worddic = {}

for doc in plottest:
    for word in wordsunique:
        if word in doc:
            word = str(word)
            index = plottest.index(doc)
            positions = list(np.where(np.array(plottest[index]) == word)[0])
            idfs = tfidf(word,doc,plottest)
            try:
                worddic[word].append([index,positions,idfs])
            except:
                worddic[word] = []
                worddic[word].append([index,positions,idfs])

In [22]:
# the index creates a dic with each word as a KEY and a list of doc indexs, word positions, and td-idf score as VALUES
worddic['china']

[[1, [0, 23], 0.1131500878815288],
 [13, [0], 0.06694713532990454],
 [14, [160], 0.013213250394060107],
 [28, [51], 0.05821490028687352],
 [40, [3, 15, 59, 79], 0.14740653650621185],
 [236, [86], 0.04414096834938761],
 [281, [70], 0.0565750439407644],
 [293, [13, 21], 0.11642980057374704],
 [302, [33], 0.059952658504392124],
 [342, [55, 146], 0.05391715597039292],
 [567, [2], 0.06925565723783228],
 [569, [1014, 1072, 1221], 0.009248261212112677],
 [612, [20], 0.01998421950146404],
 [710, [0, 7, 34], 0.17464470086062053],
 [720, [0, 16], 0.23628400704672192],
 [721, [0, 6, 27, 78, 82], 0.2028701070603168],
 [733, [179], 0.021595850106420823],
 [736, [0, 5, 21, 83], 0.13732745708698368]]

In [23]:
# pickel (save) the dictonary to avoid re-calculating
np.save('worddic_1000.npy', worddic)

# PART III: The Search Engine

In [24]:
# create word search which takes multiple words and finds documents that contain both along with metrics for ranking:

    ## (1) Number of occruances of search words 
    ## (2) TD-IDF score for search words 
    ## (3) Percentage of search terms
    ## (4) Word ordering score 
    ## (5) Exact match bonus 


from collections import Counter

def search(searchsentence):
    try:
        # split sentence into individual words 
        searchsentence = searchsentence.lower()
        try:
            words = searchsentence.split(' ')
        except:
            words = list(words)
        enddic = {}
        idfdic = {}
        closedic = {}
        
        # remove words if not in worddic 
        realwords = []
        for word in words:
            if word in list(worddic.keys()):
                realwords.append(word)  
        words = realwords
        numwords = len(words)
        
        # make metric of number of occurances of all words in each doc & largest total IDF 
        for word in words:
            for indpos in worddic[word]:
                index = indpos[0]
                amount = len(indpos[1])
                idfscore = indpos[2]
                enddic[index] = amount
                idfdic[index] = idfscore
                fullcount_order = sorted(enddic.items(), key=lambda x:x[1], reverse=True)
                fullidf_order = sorted(idfdic.items(), key=lambda x:x[1], reverse=True)

                
        # make metric of what percentage of words appear in each doc
        combo = []
        alloptions = {k: worddic.get(k, None) for k in (words)}
        for worddex in list(alloptions.values()):
            for indexpos in worddex:
                for indexz in indexpos:
                    combo.append(indexz)
        comboindex = combo[::3]
        combocount = Counter(comboindex)
        for key in combocount:
            combocount[key] = combocount[key] / numwords
        combocount_order = sorted(combocount.items(), key=lambda x:x[1], reverse=True)
        
        # make metric for if words appear in same order as in search
        if len(words) > 1:
            x = []
            y = []
            for record in [worddic[z] for z in words]:
                for index in record:
                     x.append(index[0])
            for i in x:
                if x.count(i) > 1:
                    y.append(i)
            y = list(set(y))

            closedic = {}
            for wordbig in [worddic[x] for x in words]:
                for record in wordbig:
                    if record[0] in y:
                        index = record[0]
                        positions = record[1]
                        try:
                            closedic[index].append(positions)
                        except:
                            closedic[index] = []
                            closedic[index].append(positions)

            x = 0
            fdic = {}
            for index in y:
                csum = []
                for seqlist in closedic[index]:
                    while x > 0:
                        secondlist = seqlist
                        x = 0
                        sol = [1 for i in firstlist if i + 1 in secondlist]
                        csum.append(sol)
                        fsum = [item for sublist in csum for item in sublist]
                        fsum = sum(fsum)
                        fdic[index] = fsum
                        fdic_order = sorted(fdic.items(), key=lambda x:x[1], reverse=True)
                    while x == 0:
                        firstlist = seqlist
                        x = x + 1
        else:
            fdic_order = 0
                    
        # also the one above should be given a big boost if ALL found together 
           
        
        #could make another metric for if they are not next to each other but still close 
        
        
        return(searchsentence,words,fullcount_order,combocount_order,fullidf_order,fdic_order)
    
    except:
        return("")


search('indonesia crude palm oil')[1]

['indonesia', 'crude', 'palm', 'oil']

In [25]:
# 0 return will give back the search term, the rest will give back metrics (see above)

search('indonesia crude palm oil')[1][1:10]

['crude', 'palm', 'oil']

In [26]:
# save metrics to dataframe for use in ranking and machine learning 
result1 = search('china daily says what')
result2 = search('indonesia crude palm oil')
result3 = search('price of nickel')
result4 = search('north yemen sugar')
result5 = search('nippon steel')
result6 = search('China')
result7 = search('Gold')
result8 = search('trade')
df = pd.DataFrame([result1,result2,result3,result4,result5,result6,result7,result8])
df.columns = ['search term', 'actual_words_searched','num_occur','percentage_of_terms','td-idf','word_order']
df

Unnamed: 0,search term,actual_words_searched,num_occur,percentage_of_terms,td-idf,word_order
0,china daily says what,"[china, daily, says]","[(183, 5), (40, 4), (569, 3), (710, 3), (342, ...","[(1, 1.0), (13, 0.6666666666666666), (14, 0.66...","[(675, 0.5095658223243495), (135, 0.4367707048...","[(1, 3), (293, 1), (720, 1), (721, 1), (736, 0..."
1,indonesia crude palm oil,"[indonesia, crude, palm, oil]","[(33, 13), (621, 12), (34, 11), (209, 8), (123...","[(4, 1.0), (6, 1.0), (209, 0.5), (281, 0.5), (...","[(762, 0.48707909813666866), (266, 0.434203698...","[(34, 6), (4, 5), (660, 5), (6, 4), (268, 2), ..."
2,price of nickel,"[price, nickel]","[(572, 19), (639, 8), (108, 7), (148, 7), (736...","[(724, 1.0), (4, 0.5), (7, 0.5), (20, 0.5), (2...","[(50, 0.24460301234499893), (537, 0.2066299280...","[(724, 0)]"
3,north yemen sugar,"[north, yemen, sugar]","[(700, 12), (96, 8), (494, 7), (296, 6), (525,...","[(30, 1.0), (758, 1.0), (47, 0.666666666666666...","[(494, 0.3808351739278394), (30, 0.35115970582...","[(758, 2), (30, 2), (851, 0), (47, 0)]"
4,nippon steel,"[nippon, steel]","[(40, 9), (253, 8), (444, 7), (223, 2), (435, ...","[(40, 1.0), (123, 0.5), (223, 0.5), (253, 0.5)...","[(223, 0.5682589478261134), (40, 0.42228417223...","[(40, 5)]"
5,china,[china],"[(721, 5), (40, 4), (736, 4), (569, 3), (710, ...","[(1, 1.0), (13, 1.0), (14, 1.0), (28, 1.0), (4...","[(720, 0.23628400704672192), (721, 0.202870107...",0
6,gold,[gold],"[(997, 6), (20, 5), (797, 5), (341, 4), (347, ...","[(8, 1.0), (12, 1.0), (20, 1.0), (32, 1.0), (2...","[(304, 0.30902054113001826), (20, 0.2575171176...",0
7,trade,[trade],"[(0, 15), (169, 10), (544, 10), (761, 8), (273...","[(285, 2.0), (701, 2.0), (713, 2.0), (923, 2.0...","[(223, 0.24728127372797265), (449, 0.247281273...",0


In [27]:
# look to see if the top documents seem to make sense

alldocslist[1]

'CHINA DAILY SAYS VERMIN EAT 712 PCT GRAIN STOCKS\n  A survey of 19 provinces and seven cities\n  showed vermin consume between seven and 12 pct of Chinas grain\n  stocks the China Daily said\n      It also said that each year 1575 mln tonnes or 25 pct of\n  Chinas fruit output are left to rot and 21 mln tonnes or up\n  to 30 pct of its vegetables The paper blamed the waste on\n  inadequate storage and bad preservation methods\n      It said the government had launched a national programme to\n  reduce waste calling for improved technology in storage and\n  preservation and greater production of additives The paper\n  gave no further details\n  \n\n'

# PART IV: Rank and return (rules based)

In [28]:
# create a simple (non-machine learning) rank and return function

def rank(term):
    results = search(term)

    # get metrics 
    num_score = results[2]
    per_score = results[3]
    tfscore = results[4]
    order_score = results[5]
    
    final_candidates = []

    # rule1: if high word order score & 100% percentage terms then put at top position
    try:
        first_candidates = []

        for candidates in order_score:
            if candidates[1] > 1:
                first_candidates.append(candidates[0])

        second_candidates = []

        for match_candidates in per_score:
            if match_candidates[1] == 1:
                second_candidates.append(match_candidates[0])
            if match_candidates[1] == 1 and match_candidates[0] in first_candidates:
                final_candidates.append(match_candidates[0])

    # rule2: next add other word order score which are greater than 1 

        t3_order = first_candidates[0:3]
        for each in t3_order:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates),each)

    # rule3: next add top td-idf results
        final_candidates.insert(len(final_candidates),tfscore[0][0])
        final_candidates.insert(len(final_candidates),tfscore[1][0])

    # rule4: next add other high percentage score 
        t3_per = second_candidates[0:3]
        for each in t3_per:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates),each)

    #rule5: next add any other top results for metrics
        othertops = [num_score[0][0],per_score[0][0],tfscore[0][0],order_score[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates),top)
                
    # unless single term searched, in which case just return 
    except:
        othertops = [num_score[0][0],num_score[1][0],num_score[2][0],per_score[0][0],tfscore[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates),top)

    for index, results in enumerate(final_candidates):
        if index < 5:
            print("RESULT", index + 1, ":", alldocslist[results][0:100],"...")

In [29]:
# example of output 
rank('indonesia palm oil')

RESULT 1 : INDONESIA SEES CPO PRICE RISING SHARPLY
  Indonesia expects crude palm oil CPO
  prices to rise shar ...
RESULT 2 : INDONESIAN COMMODITY EXCHANGE MAY EXPAND
  The Indonesian Commodity Exchange is
  likely to start tr ...
RESULT 3 : MALAYSIA MAY NOT MEET 1987 OIL PALM TARGET
  Malaysia is unlikely to meet its
  targeted output of f ...
RESULT 4 : SAUDI ARABIA SEEKING RBD PALM OLEIN
  Saudi Arabia is in the market for 4000
  tonnes of refined ble ...
RESULT 5 : SAUDI ARABIA BUYS RBD PALM OLEIN
  Saudi Arabia bought 4000 tonnes of
  Malaysian refined bleached d ...


In [30]:
# example of output 
rank('china')

RESULT 1 : CHINA CHILE TO BUILD COPPER TUBE PLANT IN CHINA
  Chinas stateowned Beijing NonFerrous
  Metals Indu ...
RESULT 2 : NIPPON STEEL DENIES CHINA SEEKING JAPANESE PLANTS
  Nippon Steel Corp ltNSTCT denied local
  newspap ...
RESULT 3 : CHINA RAISES GRAIN PURCHASE PRICES
  China has raised the state purchase
  prices of corn rice cotto ...
RESULT 4 : CHINA DAILY SAYS VERMIN EAT 712 PCT GRAIN STOCKS
  A survey of 19 provinces and seven cities
  showe ...
RESULT 5 : CHINA SULPHURIRON MINE STARTS PRODUCTION
  Chinas largest sulphuriron mine has
  started trial produ ...


# PART V: Rank and return (machine learning)

In [None]:
# Create pseudo-truth set using first 5 words 
# Because I don't have a turth set I will generate a pseudo one by pulling terms from the documents - this is far from perfect
     # as it may not approximate well peoples actual queries but it will serve well to build the ML architecture 

df_truth = pd.DataFrame()

for doc in plottest:
    first_five = doc[0:5]
    test_sentence = ' '.join(first_five)
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth= pd.concat([df_truth, df_temp])

df_truth['truth'] = range(0,len(plottest))

In [None]:
# create another psuedo-truth set using random 3 word sequence from docs

df_truth1 = pd.DataFrame()
seqlen = 3

for doc in plottest:
    try:
        start = random.randint(0,(len(doc)-seqlen))
        random_seq = doc[start:start+seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth1= pd.concat([df_truth1, df_temp])

df_truth1['truth'] = range(0,len(plottest))

In [None]:
# create another psuedo-truth set using different random 4 word sequence from docs

df_truth2 = pd.DataFrame()
seqlen = 4

for doc in plottest:
    try:
        start = random.randint(0,(len(doc)-seqlen))
        random_seq = doc[start:start+seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth2= pd.concat([df_truth2, df_temp])

df_truth2['truth'] = range(0,len(plottest))

In [None]:
# create another psuedo-truth set using different random 2 word sequence from docs

df_truth3 = pd.DataFrame()
seqlen = 2

for doc in plottest:
    try:
        start = random.randint(0,(len(doc)-seqlen))
        random_seq = doc[start:start+seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth3= pd.concat([df_truth3, df_temp])

df_truth3['truth'] = range(0,len(plottest))

In [None]:
# combine the truth sets and save to disk 
truth_set = pd.concat([df_truth,df_truth1,df_truth2,df_truth3])
truth_set.columns = ['search term', 'actual_words_searched','num_occur','percentage_of_terms','td-idf','word_order','truth']
truth_set.to_csv("truth_set_final.csv")

In [None]:
truth_set[0:11]

In [None]:
truth_set
test_set = truth_set[0:3]
test_set

In [None]:
# convert to long format for ML 
# WARNING AGAIN THIS IS A SLOW PROCESS DUE TO RAM ILOC - COULD BE OPTIMISED FOR FASTER PERFORMANCE 
# BUG When min(maxnum, len(truth_set) <- is a int not a list because of very short variable length)

# row is row
# column is variable
# i is the result 

final_set =  pd.DataFrame()
test_set = truth_set[1:100]
maxnum = 5

for row in range(0,len(test_set.index)):
    test_set = truth_set[1:100]
    for col in range(2,6):
        for i in range(0,min(maxnum,len(truth_set.iloc[row][col]))):
            x = pd.DataFrame([truth_set.iloc[row][col][i]])
            x['truth'] = truth_set.iloc[row]['truth']
            x.columns = [(str(truth_set.columns[col]),"index",i),(str(truth_set.columns[col]),"score",i),'truth']
            test_set = test_set.merge(x,on='truth')
    final_set = pd.concat([final_set,test_set])
        
final_set.head()

In [None]:
final_set.to_csv("ML_set_100.csv")

In [None]:
final_set2 = final_set.drop(['actual_words_searched','num_occur','percentage_of_terms','search term','td-idf','word_order'], 1)
final_set2.to_csv("ML_set_100_3.csv")
final_set2.head()

In [None]:
final_set3 = final_set2
final_set3[0:10]

In [None]:
# Load libraries 
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as sm
import statsmodels.api as sma
from statsmodels.tools.eval_measures import mse
from statsmodels.tools.tools import add_constant

from sklearn import linear_model, feature_selection,preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [None]:
final_set3['y'] = final_set3['truth']
final_set3 = final_set3.drop(['truth'], 1)
final_set3

In [None]:
data = final_set3
data.corr()['y']

In [None]:
data['a'] = data[data.columns[0]]
data['b'] = data[data.columns[10]]
data['c'] = data[data.columns[20]]
data['d'] = data[data.columns[30]]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = data

train,test = train_test_split(X,train_size=0.80)

model = sm.ols(formula='y ~ 1 + a + b + c + d', 
               data=train).fit()

modelforout = model 

model.summary()

In [None]:
fig, ax = plt.subplots(figsize=(12,8))
fig = sma.graphics.influence_plot(modelforout, ax=ax, criterion="cooks")

In [None]:
res = model.resid # residuals
fig = sma.qqplot(res)
plt.show()

# PART VI: FINAL GUI

In [None]:
term = input("search: ")
rank(term)

In [None]:
term = input("search: ")
try:
    result = rank(term)
    result
    feedback = input("were these articles helpful?, (Y/N): ")
    if feedback == "Y":
        np.save('correct_search.npy', worddic) 
    elif feedback == "exit":
    else:
        print("sorry it was not helpful, try again")


# TO-DO / Improvements

Indexer:
- Improve stem/lemm
- Add new metrics (e.g. bonus for exact matches / closeness metric)
- Add BM25 (and variants)

Search Engine:
- Add query expansion / synonyms
- Add spellchecker functions
- Add confidence level 

Data sources:
- Find another source with a proper truth set
- Download wikipedia and try it with this 

Machine Learning:
- Fix ML example compiler (crashes if len(col) is so short it is an int and so no len function)
- Try different algorithms 

GUI:
- Build GUI interface
- Add feedback mechanism