In [None]:
import re
from random import randint
from time import sleep

import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Download the small Russian spaCy pipeline that Parser.__init__ loads with
# spacy.load('ru_core_news_sm'); the model must be installed before Parser() is created.
!python -m spacy download ru_core_news_sm

In [None]:
class Parser:
    """Scraper for smartphone listings and customer reviews on citilink.ru.

    Typical use: call :meth:`get_mobiles` once to fill ``self.mobile_list``
    from the catalogue pages, then call :meth:`get_reviews` with index ranges
    into that list to download and tokenise the reviews.

    Attributes:
        spacy_nlp: spaCy pipeline (``ru_core_news_sm``) used for tokenisation.
        mobile_list: list of ``{'url', 'name', 'id'}`` dicts, one per product.
        reviews_list: list of ``{'text', 'score'}`` dicts produced by the most
            recent :meth:`get_reviews` call.
    """

    BASE_URL = 'https://www.citilink.ru'
    CATALOG_URL = BASE_URL + '/catalog/smartfony/'
    CATALOG_USER_AGENT = 'BlackBerry9700/5.0.0.862 Profile/MIDP-2.1 Configuration/CLDC-1.1 VendorID/331 UNTRUSTED/1.0 3gpp-gba'
    REVIEWS_USER_AGENT = 'Mozilla/5.0'
    # Exact ``class`` attribute value of a product card on a catalogue page.
    PRODUCT_CARD_CLASS = 'product_data__gtm-js product_data__pageevents-js ProductCardHorizontal js--ProductCardInListing js--ProductCardInWishlist'
    # Seconds to wait for a response, so one stalled request cannot hang the crawl.
    REQUEST_TIMEOUT = 30
    # Token texts that are always dropped from the cleaned review text.
    SKIPPED_TOKENS = ('\xa0', '\n', '\t', '-')

    def __init__(self):
        self.spacy_nlp = spacy.load('ru_core_news_sm')
        # Created up front so get_reviews() can be called before get_mobiles()
        # without raising AttributeError.
        self.mobile_list = []
        self.reviews_list = []

    def has_endl(self, s):
        """Return True if the string ``s`` contains a newline character."""
        return '\n' in s

    def check_token(self, token):
        """Return True if ``token`` should be kept in the cleaned review text.

        A token is dropped when it is a stop word, punctuation, number-like,
        one of ``SKIPPED_TOKENS``, or when its text contains a newline.
        """
        text = token.text
        return not (
            token.is_stop
            or token.is_punct
            or token.like_num
            or text in self.SKIPPED_TOKENS
            or self.has_endl(text)
        )

    def get_score(self, soup):
        """Extract the review scores found in ``soup`` as a list of ints.

        Every ``<span class="UserAvatar__number">`` contributes one score, made
        of the digits in its text. Raises ``ValueError`` if such a span
        contains no digits.
        """
        badges = soup.find_all('span', class_='UserAvatar__number')
        # Raw string: '\D' in a normal literal is an invalid escape sequence.
        return [int(re.sub(r'\D', '', badge.text)) for badge in badges]

    def get_mobile_data(self, mobile):
        """Return ``(name, url_parts)`` for one product card element.

        ``name`` is the ``title`` of the card's second ``<a>``; ``url_parts``
        is the ``href`` of its first ``<a>`` split on ``'?'``, so
        ``url_parts[0]`` is the link without its query string.
        """
        mobile_link = mobile.find('a').get('href')
        name = mobile.find_all('a')[1].get('title')
        return name, mobile_link.split('?')

    def get_mobiles(self, n_pages=16):
        """Crawl the catalogue and rebuild ``self.mobile_list``.

        Args:
            n_pages: number of catalogue pages to request, starting at page 1
                (defaults to the 16 pages that were previously hard-coded).

        Each distinct product URL is stored once. After every page the running
        number of collected products is printed.
        """
        self.mobile_list = []
        # mobile_list holds dicts, so a URL can never be "in" it; keep a
        # separate set of URLs to actually detect duplicates.
        seen_urls = set()
        headers = {'User-Agent': self.CATALOG_USER_AGENT}

        for page in tqdm(range(1, n_pages + 1)):
            response = requests.get(
                self.CATALOG_URL,
                params={'p': page},
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            # Same success criterion as get_reviews(): only parse 200 responses.
            if response.status_code == 200 and response.text:
                soup = BeautifulSoup(response.text, 'html.parser')
                cards = soup.find_all(['div'], class_=[self.PRODUCT_CARD_CLASS])

                for card in cards:
                    name, url_parts = self.get_mobile_data(card)
                    product_url = url_parts[0]
                    if product_url in seen_urls:
                        continue
                    seen_urls.add(product_url)
                    self.mobile_list.append({
                        'url': product_url,
                        'name': name,
                        # Third '/'-separated piece of the link, e.g.
                        # '/product/<slug>/' -> '<slug>'.
                        'id': product_url.split('/')[2],
                    })

            sleep(randint(0, 1))
            print(len(self.mobile_list))

    def get_reviews(self, start, end):
        """Download and clean reviews for ``self.mobile_list[start:end]``.

        Args:
            start: index of the first product to process.
            end: index one past the last product; values beyond the end of
                ``self.mobile_list`` are clamped instead of raising IndexError.

        Returns:
            ``self.reviews_list``: a fresh list of ``{'text', 'score'}`` dicts.
            Reviews scored exactly 3, or without exactly three ``<p>``
            elements, are skipped. For scores above 3 the first paragraph is
            used, for scores below 3 the second one (presumably the "pros" and
            "cons" sections - confirm against the page markup).
        """
        self.reviews_list = []
        headers = {'User-Agent': self.REVIEWS_USER_AGENT}
        stop = min(end, len(self.mobile_list))

        for idx in tqdm(range(start, stop)):
            url = self.BASE_URL + self.mobile_list[idx]['url'] + 'otzyvy/'
            response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            # NOTE(review): reviews are gathered as "regular, then best" while
            # scores come in document order; this assumes both orders agree -
            # verify against the page markup.
            reviews = (
                soup.find_all('div', class_='js--Opinion Opinion')
                + soup.find_all('div', class_='js--Opinion Opinion Opinion_best')
            )
            scores = self.get_score(soup)

            # zip() stops at the shorter sequence, so a page with fewer scores
            # than reviews no longer raises IndexError.
            for review, score in zip(reviews, scores):
                paragraphs = review.select('p')
                if len(paragraphs) != 3 or score == 3:
                    continue
                raw_text = paragraphs[0].text if score > 3 else paragraphs[1].text
                tokens = self.spacy_nlp(raw_text)
                text = ' '.join(tok.text.lower() for tok in tokens if self.check_token(tok))
                self.reviews_list.append({
                    'text': text,
                    'score': score,
                })

            sleep(randint(0, 1))

        return self.reviews_list

    

In [None]:
# Crawl the catalogue once, then download reviews in fixed-size batches so
# progress is reported (and partial results are kept in all_data) per batch.
BATCH_SIZE = 35  # products per get_reviews() call (the former 710 // 20)

all_data = []
p = Parser()
p.get_mobiles()

# Derive the batch bounds from the number of products actually collected;
# a hard-coded product count can index past the end of p.mobile_list.
n_mobiles = len(p.mobile_list)
for i, start in enumerate(range(0, n_mobiles, BATCH_SIZE)):
    stop = min(start + BATCH_SIZE, n_mobiles)
    all_data.append(p.get_reviews(start, stop))
    print(f'i: {i}, len: {len(all_data[i])}')

In [None]:
# Flatten the per-batch review lists into a single list of
# {'text', 'score'} records.
squeezed_data = [sample for batch in all_data for sample in batch]

# A list of records goes straight into the DataFrame constructor;
# DataFrame.from_dict is documented for dict input.
df = pd.DataFrame(squeezed_data)

In [None]:
# Persist the scraped reviews as a tab-separated file named 'parsed_dataset'
# (no extension). `index` is left at its default, so the row index is
# written as well.
df.to_csv(path_or_buf='parsed_dataset', sep='\t')