In [214]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import pymorphy2
morph = pymorphy2.MorphAnalyzer()


# Парсинг

In [83]:
companies = []

for i in range(1, 10):
    url1 = 'https://www.yell.ru/moscow/top/lechebnye-kosmetologii/?page=' + str(i)
    response1 = requests.get(url1)
    soup1 = BeautifulSoup(response1.text, 'lxml')
    webs = soup1.find_all('a', class_='companies__item-title-text')
    for j in range(len(webs)):
        companies.append('https://www.yell.ru'+str(webs[j].get('href'))+'reviews/')


In [102]:
companies[:5]


['https://www.yell.ru/moscow/com/centr-krasoty-zazerkale_6522021/reviews/',
 'https://www.yell.ru/moscow/com/centr-kosmetologii-virsaviya_11971120/reviews/',
 'https://www.yell.ru/moscow/com/sm-stomatologiya-na-ulitse-kosmonavta-volkova_11893311/reviews/',
 'https://www.yell.ru/moscow/com/institut-zdorovya-i-krasoty-niarmedik_12464505/reviews/',
 'https://www.yell.ru/moscow/com/beauty-center-through-the-looking-glass_2687945/reviews/']

In [None]:
# пусть хорошие отзывы имеют оценки 4-5, а плохие 0-3

good_reviews = []
bad_reviews = []

for url in companies:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    quotes = soup.find_all('div', class_='reviews__item')
    for quote in quotes:
        text_pattern = 'data-review=\'\{([^}]+)'
        mark_pattern = 'rating__value\">\n        ([^ ]+)'
        text = re.findall(text_pattern, str(quote))
        txt = re.findall('\"text\":\"([^\"]+)', text[0])
        mark = re.findall(mark_pattern, str(quote))
        try:
            if int(mark[0]) >= 4:
                good_reviews.append(txt[0])
            else:
                bad_reviews.append(txt[0])
        except:
            
            break


In [387]:
print('Хороших отзывов:', len(good_reviews), '\nПлохих отзывов:', len(bad_reviews))


Хороших отзывов: 3635 
Плохих отзывов: 593


In [394]:
main_bad_reviews = bad_reviews[:500]
main_good_reviews = good_reviews[:500]

bad_rev_check = bad_reviews[500:593]
good_rev_check = good_reviews[100:700]

# Кажется, то, что количество хороших и плохих отзывов отличается, никак не влияет на подсчет accuracy.

check_reviews = []
check_reviews.extend(bad_rev_check)
check_reviews.extend(good_rev_check)

print('Хороших отзывов:', len(main_bad_reviews), '\nПлохих отзывов:', len(main_good_reviews), '\nОтзывов для проверки:', len(check_reviews))


Хороших отзывов: 500 
Плохих отзывов: 500 
Отзывов для проверки: 693


# Препроцессинг

In [299]:
def get_list_of_lemmatized_words(reviews):
    
    reviews_words = []
    
    for review in reviews:
        review = re.sub(r'([^\w\s]|\d|_)', '', review).lower()
        review = re.sub('n|u|rn', ' ', review) # почему-то между слов периодически встречается такая пара символов
        
        tokenized_review = word_tokenize(review, 'russian')
        for word in tokenized_review:
            w = morph.parse(word)[0].normal_form
            reviews_words.append(w)
            
    return reviews_words


In [302]:
bad_reviews_words = get_list_of_lemmatized_words(main_bad_reviews)
good_reviews_words = get_list_of_lemmatized_words(main_good_reviews)


In [237]:
print('Слов в плохих отзывах:', len(bad_reviews_words), '\nСлов в хороших отзывах:', len(good_reviews_words))


Слов в плохих отзывах: 36255 
Слов в хороших отзывах: 24272


In [330]:
def find_freq(words):
    
    freq_reviews_words = {}
    
    for word in words:
        try:
            freq_reviews_words[word] += 1
        except:
            freq_reviews_words[word] = 1
            
    return freq_reviews_words
        

In [332]:
freq_bad_reviews_words = find_freq(bad_reviews_words)
freq_good_reviews_words = find_freq(good_reviews_words)


In [336]:
words_bad_review_only = list(set(bad_reviews_words) - set(good_reviews_words))
words_good_review_only = list(set(good_reviews_words) - set(bad_reviews_words))


In [338]:
print('Уникальных слов в плохих отзывах:', len(words_bad_review_only), '\nУникальных слов в хороших отзывах:', len(words_good_review_only))


Уникальных слов в плохих отзывах: 3594 
Уникальных слов в хороших отзывах: 1481


In [366]:
def check_frequency(all_words, unique_words, freqq_words):
    
    needed_words = []
    
    for k, v in freqq_words.items():
        if str(k) in unique_words and int(v) > 2:
            needed_words.append(k)
            
    return needed_words
            

In [367]:
print(len(bad_reviews_words), len(words_bad_review_only))


36255 3594


In [368]:
needed_words_in_bad_reviews = check_frequency(bad_reviews_words, words_bad_review_only, freq_bad_reviews_words)
needed_words_in_good_reviews = check_frequency(good_reviews_words, words_good_review_only, freq_good_reviews_words)


# Оценка

In [396]:
def check_unknown_comments(needed_words_bad, needed_words_good, comments):
    
    part_of_matches = {}
    
    for comment in comments:
        match_words_good = 0
        match_words_bad = 0
        for word_good in needed_words_good:
            if word_good in comment:
                match_words_good += 1
        for word_bad in needed_words_bad:
            if word_bad in comment:
                match_words_bad += 1
                
        if match_words_bad > match_words_good:
            part_of_matches[comment] = 'bad'
        if match_words_bad < match_words_good:
            part_of_matches[comment] = 'good'
        else:
            part_of_matches[comment] = 'unknown'
            
    return part_of_matches


In [407]:
first_part = check_unknown_comments(needed_words_in_bad_reviews, needed_words_in_good_reviews, comment)

true_bad = 0
false_bad = 0

true_good = 0
false_good = 0

for bad_com in bad_rev_check:
    try:
        if first_part[bad_com]:
            true_bad += 1
    except:
        false_bad += 1
    
for good_com in good_rev_check:
    try:
        if first_part[good_com]:
            true_good += 1
    except:
        false_good += 1
        
accuracy = (true_bad+true_good)/(true_bad+true_good+false_bad+false_good)*100

print('Accuracy:', accuracy)


Accuracy: 0.0
