In [4]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib


if os.path.exists(os.path.join(".", "all_rows.pqt")):
    all_rows = pd.read_parquet("all_rows.pqt")
else:
    all_rows = pd.DataFrame(
        columns=["id", "link", "name", "rate", "description", "reviews"]
    )

    for name in os.listdir():
        if name.endswith(".csv") and name.startswith("rows"):
            temp_df = pd.read_csv(name)
            all_rows = pd.concat([all_rows, temp_df])

    all_rows.to_parquet("all_rows.pqt", index=False)

if os.path.exists(os.path.join(".", "vectorizer.pkl")) and os.path.exists(os.path.join('.', 'tfidf_vectors.npy')):
    print('Vectorizer loaded')
    vectorizer = joblib.load('vectorizer.pkl')

    tfidf_vectors = np.load("tfidf_vectors.npy", allow_pickle=True).item()
else:
    print('Vectorizer created')
    vectorizer = TfidfVectorizer()

    tfidf_vectors = vectorizer.fit_transform(all_rows.name)

    joblib.dump(vectorizer, 'vectorizer.pkl')
    np.save('tfidf_vectors', tfidf_vectors)


def cosine_search(new_sentence):
    new_tfidf_vector = vectorizer.transform([new_sentence])

    new_tfidf_vector_array = new_tfidf_vector.toarray()

    euclidean_dist = cosine_similarity(
        new_tfidf_vector_array.reshape(1, -1), tfidf_vectors
    )

    return all_rows.iloc[np.argmax(euclidean_dist)]

Vectorizer created


In [5]:
cosine_search("артур конан дойл")

id                                                        438397
link                       https://www.labirint.ru/books/438397/
name                                  Артур Дойл: Затерянный мир
rate                                                        9.19
description    Большинство читателей знают сэра Артура Конан ...
reviews        Хорошая научная фантастика.\r\nХороший перевод...
Name: 665, dtype: object

In [6]:
import pandas as pd

import os

In [7]:
all_rows = pd.DataFrame(columns = ['id', 'link', 'name', 'rate', 'description', 'reviews'])
all_rows

Unnamed: 0,id,link,name,rate,description,reviews


In [8]:
for name in os.listdir():
    if name.endswith('.csv') and name.startswith('rows'):
        temp_df = pd.read_csv(name)
        all_rows = pd.concat([all_rows, temp_df])

In [9]:
all_rows.shape

(8685, 6)

In [10]:
all_rows.to_parquet('all_rows.pqt', index=False)

In [11]:
os.path.exists(os.path.join('.', 'tfidf_vectors.npy'))

True

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer()

tfidf_vectors = vectorizer.fit_transform(all_rows.name)

In [17]:
tfidf_vectors.shape

(8685, 20283)

In [18]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity

import numpy as np

In [22]:
# Новое предложение
new_sentence = '''Сенека
Наставник императора'''

new_tfidf_vector = vectorizer.transform([new_sentence])

new_tfidf_vector_array = new_tfidf_vector.toarray()

euclidean_dist = cosine_similarity(new_tfidf_vector_array.reshape(1, -1), tfidf_vectors)

all_rows.iloc[np.argmax(euclidean_dist)]

id                                                       1010629
link                      https://www.labirint.ru/books/1010629/
name              Анатолий Ильяхов: Сенека. Наставник императора
rate                                                         0.0
description    Когда-то в молодости известный римский философ...
reviews                                                      NaN
Name: 21, dtype: object

In [88]:
tfidf_vectors.shape

(7229, 17140)

In [90]:
euclidean_dist = euclidean_distances(new_tfidf_vector_array.reshape(1, -1), tfidf_vectors)

In [91]:
import numpy as np

np.argmin(euclidean_dist)

3491

In [93]:
all_rows.iloc[3491]

id                                                        493711
link                       https://www.labirint.ru/books/493711/
name                                      Олег Михайлов: Кутузов
rate                                                         0.0
description    Исторический роман посвящен Михаилу Илларионов...
reviews                                                      NaN
Name: 908, dtype: object

In [72]:
import requests


def ocr_space_file(filename, overlay=False, api_key='helloworld', language='eng'):
    """ OCR.space API request with local file.
        Python3.5 - not tested on 2.7
    :param filename: Your file path & name.
    :param overlay: Is OCR.space overlay required in your response.
                    Defaults to False.
    :param api_key: OCR.space API key.
                    Defaults to 'helloworld'.
    :param language: Language code to be used in OCR.
                    List of available language codes can be found on https://ocr.space/OCRAPI
                    Defaults to 'en'.
    :return: Result in JSON format.
    """

    payload = {'isOverlayRequired': overlay,
               'apikey': api_key,
               'language': language,
               }
    with open(filename, 'rb') as f:
        r = requests.post('https://api.ocr.space/parse/image',
                          files={filename: f},
                          data=payload,
                          )
    return r.json()

In [73]:
res = ocr_space_file('test.png', api_key='K81646083088957', language='rus')

In [80]:
res["ParsedResults"][0]["ParsedText"].strip()

'с.михплков\r\nи ск.зни'

In [70]:
eval(res)

NameError: name 'false' is not defined

In [None]:
["ParsedText']