In [1]:
import json
import regex as re

In [2]:
# Load the search results. Every non-blank line of the file is parsed as a
# complete JSON document and rebinds `results`, so if the file holds several
# lines only the last one is kept.
results = []
# JSON text is UTF-8 by specification; passing the encoding explicitly avoids
# decoding with the platform's locale encoding.
with open('lionel.json', encoding='utf-8') as user_file:
    for line in user_file:
        # A blank line is not valid JSON and would abort the load with a
        # JSONDecodeError, so skip it.
        if line.strip():
            results = json.loads(line)


In [3]:
results

[{'URL': 'http://www.lionel.com/',
  'Title': "Lionel Trains: World's Best Model Trains & Railroad",
  'Summary': 'LionChief Plus. Lionel Legacy. American Flyer. VisionLine.'},
 {'URL': 'https://www.lionelstore.com/',
  'Title': 'Lionel Model Trains - Electric Trains & more at the Lionel Train Store',
  'Summary': 'Lionel Model Trains - Electric Trains & more at the Lionel Train Store.'},
 {'URL': 'http://www.lionel.com/catalogs',
  'Title': 'Train Gifts & Collectibles: The Lionel Trains Catalog',
  'Summary': "Lionel's catalogs are the ultimate destination for model train collectors. Browse the history of Lionel Trains catalogs, with beautiful high-res images."},
 {'URL': 'https://www.lionelracing.com/',
  'Title': 'Lionel Racing NASCAR Store: Diecast, Collectibles & Apparel',
  'Summary': 'Dale Earnhardt, Jr. ... Martin Truex, Jr. ... Ricky Stenhouse, Jr. Ross Chastain · Ryan Blaney · Ryan Ellis · Ryan Preece · Ryan Truex\xa0...'},
 {'URL': 'https://lionelrichie.com/',
  'Title': 'Li

In [4]:
# Tokenizer: curly-apostrophe contractions, runs of letters or runs of digits
# (each optionally preceded by a single space), or runs of whitespace.
# The pattern has to be a single-line literal: it is compiled without
# re.VERBOSE, so the line breaks of a triple-quoted string become part of the
# regex. Previously the first alternative required a preceding newline and the
# last one required a trailing '”' plus newline.
# NOTE(review): `pattern` is assigned again in a later cell before it is used.
pattern = re.compile(r"’s|’t|’re|’ve|’m|’ll|’d| ?\p{L}+| ?\p{N}+|\s+")
# def get_words(string):

    

In [5]:
# Read the stop-word list: one entry per line, surrounding whitespace removed.
stop_words = set()
with open('../stop_words.txt', 'r') as stop_words_file:
    stop_words.update(entry.strip() for entry in stop_words_file)

In [6]:
import regex as re
# Tokenizer used by extract_words: curly-apostrophe contractions, runs of
# letters, or runs of digits (each optionally preceded by a single space).
# The pattern has to be a single-line literal: it is compiled without
# re.VERBOSE, so the line breaks of a triple-quoted string become part of the
# regex. Previously "’s" only matched directly after a newline, and digit runs
# only matched when followed by '”' and a newline.
pattern = re.compile(r"’s|’t|’re|’ve|’m|’ll|’d| ?\p{L}+| ?\p{N}+")

def _tokenize(text):
    """Lower-case ``text``, split it with ``pattern`` and drop stop words.

    ``None`` and the empty string both yield an empty list, so results whose
    'Title' or 'Summary' is missing or null do not raise.
    """
    if not text:
        return []
    tokens = (match.strip() for match in pattern.findall(text.lower()))
    # Skip tokens that are empty after stripping as well as stop words.
    return [token for token in tokens if token and token not in stop_words]


def extract_words(result_list):
    """Tokenize the title and summary of every search result.

    Parameters
    ----------
    result_list : iterable of dict
        Each dict may carry a 'Title' and a 'Summary' entry; other keys are
        ignored. A missing or ``None`` value is treated as empty text.

    Returns
    -------
    word_lists : list of dict
        One ``{'title': [...], 'summary': [...]}`` dict per result, in input
        order, holding the lower-cased tokens that are not stop words.
    vocab : set
        Every token that occurs in any title or summary.

    Relies on the module-level ``pattern`` and ``stop_words``.
    """
    word_lists = []
    vocab = set()
    for result in result_list:
        title_words = _tokenize(result.get('Title'))
        snippet_words = _tokenize(result.get('Summary'))
        vocab.update(title_words, snippet_words)
        word_lists.append({'title': title_words, 'summary': snippet_words})
    return word_lists, vocab

In [7]:
# wl: one {'title': [...], 'summary': [...]} dict of tokens per search result;
# vocab: set of every token that appears in any title or summary.
wl, vocab = extract_words(results)

In [8]:
wl[0]

{'title': ['lionel', 'trains', 'world', 'best', 'model', 'trains', 'railroad'],
 'summary': ['lionchief',
  'plus',
  'lionel',
  'legacy',
  'american',
  'flyer',
  'visionline']}

In [9]:
# Relevance judgement for each result, in result order. Later cells sum these
# values to count relevant documents, so 1 marks a relevant result and 0 an
# irrelevant one. feedback[i] belongs to document id i + 1.
feedback = [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]

In [10]:
# Inverted index: token -> set of 1-based ids of the documents whose title or
# summary contains that token.
inverse_list = {token: set() for token in vocab}

for doc_id, entry in enumerate(wl, start=1):
    for token in (*entry['title'], *entry['summary']):
        inverse_list[token].add(doc_id)
        

In [11]:
# Ids of all documents, 1-based to match the ids stored in inverse_list.
# Derived from the number of parsed results rather than a hard-coded 10, so
# the "documents without the word" complement stays correct when a search
# returns a different number of results.
all_documents = set(range(1, len(wl) + 1))

In [12]:
def _gini_of_partition(docs, feedback):
    """Return the two-class Gini impurity of one group of documents.

    An empty group has no impurity, so 0.0 is returned instead of dividing
    by zero.
    """
    if not docs:
        return 0.0
    prob_relevant = sum(feedback[doc] for doc in docs) / len(docs)
    prob_irrelevant = 1 - prob_relevant
    return 1 - (prob_relevant**2 + prob_irrelevant**2)


def gini_impurity(word, docs_with_word, docs_without_word, feedback):
    """Gini impurity of the two groups obtained by splitting documents on a word.

    Parameters
    ----------
    word
        The word the split is based on. It is not used in the computation and
        is kept so existing callers keep working.
    docs_with_word, docs_without_word
        Collections of document ids on each side of the split. Either may be
        empty.
    feedback
        Object indexed by document id (``feedback[doc]``) whose values are
        summed to count the relevant documents of a group.

    Returns
    -------
    tuple
        ``(impurity_with_word, impurity_without_word)``. Each is
        ``1 - (p**2 + (1 - p)**2)`` where ``p`` is the fraction of relevant
        documents in the group, or 0.0 for an empty group.
    """
    return (
        _gini_of_partition(docs_with_word, feedback),
        _gini_of_partition(docs_without_word, feedback),
    )

In [13]:
# For each token: the fraction (0.0-1.0, despite the name) of the documents
# containing it that are marked relevant. Document ids are 1-based while
# `feedback` is indexed from 0, hence the `- 1`.
percentage_of_relevant_docs = {
    token: sum(feedback[doc_id - 1] for doc_id in doc_ids) / len(doc_ids)
    for token, doc_ids in inverse_list.items()
}

In [14]:
# Threshold on the relevant-document fraction: a token is kept as a candidate
# only when its fraction is strictly greater than k.
k = 0.6

words_to_search = []
for candidate in vocab:
    if percentage_of_relevant_docs[candidate] > k:
        words_to_search.append(candidate)

In [15]:
len(words_to_search)

15

In [16]:
# Score every candidate word with the weighted Gini impurity of the split
# "documents containing the word" vs. "documents not containing it".
ranking = {}

# gini_impurity looks feedback up by 1-based document id. Build that mapping
# once here instead of rebuilding it on every loop iteration.
feedback_by_doc = dict(enumerate(feedback, start=1))
total_docs = len(all_documents)

for word in words_to_search:
    docs_with_word = inverse_list[word]
    impurity_with, impurity_without = gini_impurity(
        word,
        docs_with_word,
        all_documents - docs_with_word,
        feedback=feedback_by_doc,
    )
    # Each side's impurity is weighted by the share of documents on that side.
    w1 = len(docs_with_word) / total_docs
    w2 = 1.0 - w1
    ranking[word] = w1 * impurity_with + w2 * impurity_without

In [17]:

# Scale a candidate's score by 0.95 once for every time it occurs in a title.
# `ranking` is updated in place, so running this cell again applies the
# factor again.
for title_word in (w for entry in wl for w in entry['title']):
    if title_word in words_to_search:
        ranking[title_word] *= 0.95


In [18]:
# Candidate words ordered by ascending score. sorted() is stable, so words
# with equal scores keep their order from words_to_search.
res = sorted(words_to_search, key=ranking.__getitem__)

In [19]:
{word: ranking[word] for word in res}

{'captaincy': 0.0,
 'signed': 0.0,
 'messi': 0.0,
 'french': 0.0,
 'wikipedia': 0.0,
 'assumed': 0.0,
 'germain': 0.0,
 'barcelona': 0.0,
 'club': 0.0,
 'paris': 0.0,
 'sixth': 0.0,
 'ballon': 0.0,
 'saint': 0.0,
 'record': 0.0,
 'contract': 0.0}

In [20]:
res

['captaincy',
 'signed',
 'messi',
 'french',
 'wikipedia',
 'assumed',
 'germain',
 'barcelona',
 'club',
 'paris',
 'sixth',
 'ballon',
 'saint',
 'record',
 'contract']