In [13]:
import os
import re
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

path = 'data'
files = []


def get_text_list():
    df = pd.read_csv('1.csv', nrows=100, encoding='utf-8')
#     files = files = df['title'].values.tolist()
    return df['text'].values.tolist()


def get_bag(texts):
    bag = CountVectorizer(token_pattern='\\b[A-Za-z]+\\b')
    count = bag.fit_transform(texts)
    return bag, count


def generate_inverse_index(text_list, bag, array):
    result = defaultdict(list)
    words = bag.get_feature_names()
    for index, value in enumerate(text_list):
        for i, word in enumerate(words):
            if array[index][i] != 0:
                position_list = [m.span() for m in re.finditer(
                    r'\b' + word + r'\b', value)]
                result[word].append((index, array[index][i], position_list))
    return result


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
import math


class ResultItem:
    def __init__(self, index, name, text):
        self.index = index
        self.name = name
        self.head = text.split('\n')[0]
        self.text = text
        self.rank = 0.0
        self.freq = 0
        self.count = 0
        self.occurrence = []
        self.similarity = 0.0

    def __str__(self):
        s = "file: " + self.name + \
            "\nhead: " + self.head + \
            "\nfreq: " + str(self.freq) + \
            "\nrank: " + str(self.rank) + \
            "\nsimilarity: " + str(self.similarity) + \
            "\n"
        for j in self.occurrence:
            s += "> ..." + self.text[max(0, j[0] - 50):j[0] + 50] + "...\n"
        return s


def get_similarity(a, b):
    dot = 0
    len_a = 0
    len_b = 0
    for i in range(len(a)):
        dot += a[i] * b[i]
        len_a += a[i] * a[i]
        len_b += b[i] * b[i]
    len_a = math.sqrt(len_a)
    len_b = math.sqrt(len_b)
    return dot / (len_a * len_b)


def run_search(search_str, inverse_index, file_names, texts, bag, array):
    temp = []
    freq = []
    s_list = search_str.split(' ')
    for s in s_list:
        temp.append(inverse_index[s].copy())
        freq.append(0)
        if inverse_index[s]:
            for i in inverse_index[s]:
                freq[-1] += i[1]
    result_dict = {}
    for index, i in enumerate(temp):
        if not i:
            continue
        for j in i:
            if j[0] not in result_dict:
                item = ResultItem(j[0], file_names[j[0]], texts[j[0]])
                item.count += 1
                item.freq += j[1]
                item.rank += j[1] * 100 / freq[index]
                item.occurrence.extend(j[2])
                result_dict[j[0]] = item
            else:
                result_dict[j[0]].count += 1
                result_dict[j[0]].freq += j[1]
                result_dict[j[0]].rank += j[1] * 100 / freq[index]
                result_dict[j[0]].occurrence.extend(j[2])
    result_list = [i for i in result_dict.values()]

    search_vec = CountVectorizer(vocabulary=bag.get_feature_names()).fit_transform([search_str]).toarray()
    for i in result_list:
        i.similarity = get_similarity(search_vec[0], array[i.index].A[0])

    result_list.sort(key=lambda x: -x.rank * x.count)
    return result_list


In [None]:

print("Loading data...")
text_list = get_text_list()
print("Vectorizing...")
bag, count = get_bag(text_list)
print("Generating index...")
inverse_index = generate_inverse_index(text_list, bag, count.toarray())
print("Done. Type to search now.")
df = pd.read_csv('1.csv', nrows=100, encoding='utf-8')
files = files = df['title'].values.tolist()
while True:
    search_str = input("> ")

    if search_str == 'q':
        print('Bye :)')
        exit(0)

    result = run_search(search_str, inverse_index, files, text_list, bag, count)
    for i in result:
        print(i)
        s = input("n to next, q to quit, g to recommend, d to not recommend\n> ")
        if s == 'q':
            break
        else:
            continue

Loading data...
Vectorizing...
Generating index...
Done. Type to search now.
> health
file: Sunlight — The Natural Supplement For Our Mental Health
head: Sunlight — The Natural Supplement For Our Mental Health
freq: 12
rank: 20.689655172413794
similarity: 0.0969414129169833
> ...of these are our attempts at improving our mental health. More than ever, this generation has spoken...
> ...ever, this generation has spoken out about mental health issues and have taken considerable steps in...
> ...able steps in order to try and improve our mental health.

Yet, amongst the many techniques that see...
> ...otonin. But what does that have to do with mental health?

Serotonin (also known as 5-hydroxytryptam...
> ...ysical well being is tied closely to one’s mental health, getting a healthy dose of vitamin D helps ...
> ...your mental wellness by maintaining your physical health.

Besides physical health, vitamin D also h...
> ...intaining your physical health.

Besides physical health, vitamin 

In [None]:
COVID-19