# Dataset and preprocessing
我们使用的数据集来自 medium.com 的文章，共有 190000 多篇，内容涵盖科技、社会、健康、环境保护和生活方式等主题；我们取其中的前 265 篇（与代码中的 nrows=265 一致）作为这次实验的数据集。该数据集是一个 CSV 文件，每一行包括文章标题、文章文本、文章 URL、作者、发布时间和文章标签等字段。在这次实验中，我们只对文章文本建立索引并进行检索，标题、URL 和发布时间仅用于展示搜索结果。

在预处理部分，我们要把文本中的标点符号、数字、特殊符号等去掉，将单词转换为小写，并把文本切分成一个个单词。sklearn 中的 CountVectorizer 类可以帮助我们实现这些功能。接下来我们要构建文本的向量空间表示（词袋模型），并在此基础上生成倒排索引。


In [1]:
import os
import re
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

# path = 'data'
# files = []


def get_text_list(path='1.csv', nrows=265):
    """Load the article bodies from the dataset CSV.

    Args:
        path: Location of the CSV file. It must contain a ``text`` column.
            Defaults to the file name the notebook has always used.
        nrows: Number of data rows to read from the top of the file;
            ``None`` reads every row.

    Returns:
        A list holding the ``text`` column value of each row that was read,
        in file order.
    """
    df = pd.read_csv(path, nrows=nrows, encoding='utf-8')
    return df['text'].values.tolist()


def get_bag(texts):
    """Fit a bag-of-words vectorizer on *texts* and count the terms.

    The token pattern only accepts runs of ASCII letters, so digits,
    punctuation and other symbols never become vocabulary entries.

    Args:
        texts: Iterable of document strings.

    Returns:
        A ``(vectorizer, counts)`` pair: the fitted ``CountVectorizer`` and
        the document-term matrix returned by its ``fit_transform``.
    """
    vectorizer = CountVectorizer(token_pattern='\\b[A-Za-z]+\\b')
    term_counts = vectorizer.fit_transform(texts)
    return vectorizer, term_counts


def generate_inverse_index(text_list, bag, array):
    """Build an inverted index from vocabulary words to their postings.

    Args:
        text_list: Raw document strings, in the same order as the rows of
            ``array``.
        bag: Fitted vectorizer; only ``get_feature_names_out()`` is used.
        array: Dense document-term count matrix, indexable as
            ``array[doc_index][word_index]``.

    Returns:
        A ``defaultdict(list)`` mapping each word to a list of
        ``(doc_index, count, spans)`` tuples, one per document containing
        the word. ``spans`` lists the ``(start, end)`` offsets of the word
        in the raw document text.
    """
    result = defaultdict(list)
    words = bag.get_feature_names_out()
    # The vocabulary is lowercase while the documents keep their original
    # casing, so positions must be located case-insensitively; otherwise
    # capitalized occurrences are counted but never located.
    # Each pattern is compiled once instead of once per (document, word).
    patterns = [
        re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
        for word in words
    ]
    for index, value in enumerate(text_list):
        row = array[index]
        for i, word in enumerate(words):
            if row[i] != 0:
                position_list = [m.span() for m in patterns[i].finditer(value)]
                result[word].append((index, row[i], position_list))
    return result


# Get scores and search


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import math


class ResultItem:
    """One document matched by a search, with metadata and ranking stats.

    Attributes are plain and public; the search routine fills in the
    statistics after construction.
    """

    def __init__(self, index, title, text, url, timestamp):
        """Store the document's identity and metadata; statistics start empty.

        Args:
            index: Position of the document in the corpus.
            title: Document title.
            text: Full document text, used to cut context snippets.
            url: Document URL.
            timestamp: Publication time as read from the dataset.
        """
        self.index = index
        self.title = title
        self.text = text
        self.url = url
        self.timestamp = timestamp
        self.rank = 0.0        # accumulated ranking score
        self.freq = 0          # accumulated occurrence count of the query terms
        self.count = 0         # number of query terms that hit this document
        self.occurrence = []   # (start, end) spans of the hits inside text
        self.similarity = 0.0  # similarity score assigned by the search routine

    def __str__(self):
        """Return a multi-line report followed by one snippet per occurrence."""
        # f-strings convert every field with str(), so metadata that is not
        # a string (e.g. NaN produced by an empty CSV cell) no longer raises
        # TypeError as plain concatenation did.
        parts = [
            f"file_index: {self.index}\n"
            f"title: {self.title}\n"
            f"url: {self.url}\n"
            f"timestamp: {self.timestamp}\n"
            f"freq: {self.freq}\n"
            f"rank: {self.rank}\n"
            f"similarity: {self.similarity}\n"
        ]
        for span in self.occurrence:
            start = span[0]
            # Up to 50 characters of context on each side of the hit start.
            snippet = self.text[max(0, start - 50):start + 50]
            parts.append(f"> ...{snippet}...\n")
        return "".join(parts)


def get_similarity(a, b):
    """Return the cosine similarity of two numeric vectors.

    Args:
        a: First vector (any sequence of numbers).
        b: Second vector, expected to have the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        length or zero magnitude. In that case the cosine is undefined and
        the plain formula would divide by zero.
    """
    dot = 0
    sq_a = 0
    sq_b = 0
    for x, y in zip(a, b):
        dot += x * y
        sq_a += x * x
        sq_b += y * y
    if sq_a == 0 or sq_b == 0:
        return 0.0
    return dot / (math.sqrt(sq_a) * math.sqrt(sq_b))


def run_search(search_str, inverse_index, metadata, texts, bag, array):
    """Search the inverted index and return the matching documents, ranked.

    Args:
        search_str: Whitespace-separated query terms.
        inverse_index: Mapping word -> list of ``(doc_index, count, spans)``
            as produced by ``generate_inverse_index``.
        metadata: Per-document ``[title, url, timestamp]`` rows.
        texts: Raw document strings, indexed by ``doc_index``.
        bag: The fitted vectorizer that produced ``array``.
        array: Sparse document-term count matrix; each row is densified
            with ``toarray()``.

    Returns:
        A list of ``ResultItem`` objects sorted by ``rank * count``,
        highest first. Each term contributes to a document's rank the
        percentage of that term's total occurrences found in the document.
    """
    # Index keys are lowercase, so the query is lowercased too. split()
    # without an argument drops the empty tokens that repeated spaces
    # would otherwise produce.
    terms = search_str.lower().split()

    result_dict = {}
    for term in terms:
        # .get() keeps a defaultdict index from growing an empty entry for
        # every unknown term that is searched.
        postings = inverse_index.get(term, [])
        if not postings:
            continue
        total = sum(posting[1] for posting in postings)
        for doc_index, term_count, spans in postings:
            item = result_dict.get(doc_index)
            if item is None:
                row = metadata[doc_index]
                item = ResultItem(doc_index, row[0], texts[doc_index], row[1], row[2])
                result_dict[doc_index] = item
            item.count += 1
            item.freq += term_count
            item.rank += term_count * 100 / total
            item.occurrence.extend(spans)
    result_list = list(result_dict.values())

    if result_list:
        # Reuse the fitted vectorizer so the query is tokenized exactly like
        # the corpus. A fresh CountVectorizer with the default token pattern
        # would silently drop one-letter words.
        search_vec = bag.transform([search_str]).toarray()[0]
        for item in result_list:
            # toarray() replaces the deprecated `.A` shorthand.
            item.similarity = get_similarity(search_vec, array[item.index].toarray()[0])

    result_list.sort(key=lambda x: -x.rank * x.count)
    return result_list


In [4]:
import IPython.display as display
# Driver cell: build the index once, then answer queries interactively.
print("Loading data...")
text_list = get_text_list()
print("Vectorizing...")
bag, count = get_bag(text_list)
print("Generating index...")
# The index builder receives the count matrix converted with toarray();
# run_search below receives `count` itself.
inverse_index = generate_inverse_index(text_list, bag, count.toarray())
print("Done.")
# Re-read the same rows to collect the columns shown with each result.
df = pd.read_csv('1.csv', nrows=265, encoding='utf-8')
metadata = df[['title', 'url', 'timestamp']].values.tolist()
# Prompt until the user types 'q'; the previous output is cleared after every input.
while True:
    search_str = input("Type your keyword to search now. Type q to exit.\n> ")
    display.clear_output()
    if search_str == 'q':
        print('Bye :)')
        break
    
    result = run_search(search_str, inverse_index, metadata, text_list, bag, count)
    # Each result prints through ResultItem.__str__.
    for i in result:
        print(i)

Bye :)
