In [15]:
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument


# Toy corpus: two already-tokenized sentences used by both models below.
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
# min_count=1 so that words occurring only once in this tiny corpus are kept.
# NOTE(review): no seed is passed, so the printed vectors are presumably not
# reproducible from run to run — confirm before relying on the output.
word2vec_model = Word2Vec(sentences, min_count=1)


# Look up the trained word vector for "dog" and print it.
vector = word2vec_model.wv['dog']
print("Word2Vec model:\n", vector)


# Tag every sentence with its position in `sentences` (0, 1, ...).
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
doc2vec_model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)


# Infer a vector for a token list (here the same tokens as the first sentence).
# Note that `vector` is rebound: the Word2Vec vector above is no longer reachable.
vector = doc2vec_model.infer_vector(["cat", "say", "meow"])
print("\nDoc2Vec model:\n", vector)

Word2Vec model:
 [ 9.4563962e-05  3.0773198e-03 -6.8126451e-03 -1.3754654e-03
  7.6685809e-03  7.3464094e-03 -3.6732971e-03  2.6427018e-03
 -8.3171297e-03  6.2054861e-03 -4.6373224e-03 -3.1641065e-03
  9.3113566e-03  8.7338570e-04  7.4907029e-03 -6.0740625e-03
  5.1605068e-03  9.9228229e-03 -8.4573915e-03 -5.1356913e-03
 -7.0648370e-03 -4.8626517e-03 -3.7785638e-03 -8.5361991e-03
  7.9556061e-03 -4.8439382e-03  8.4236134e-03  5.2625705e-03
 -6.5500261e-03  3.9578713e-03  5.4701497e-03 -7.4265362e-03
 -7.4057197e-03 -2.4752307e-03 -8.6257253e-03 -1.5815723e-03
 -4.0343284e-04  3.2996845e-03  1.4418805e-03 -8.8142155e-04
 -5.5940580e-03  1.7303658e-03 -8.9737179e-04  6.7936908e-03
  3.9735902e-03  4.5294715e-03  1.4343059e-03 -2.6998555e-03
 -4.3668128e-03 -1.0320747e-03  1.4370275e-03 -2.6460087e-03
 -7.0737829e-03 -7.8053069e-03 -9.1217868e-03 -5.9351693e-03
 -1.8474245e-03 -4.3238713e-03 -6.4606704e-03 -3.7173224e-03
  4.2891586e-03 -3.7390434e-03  8.3781751e-03  1.5339935e-03
 -7.242

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from enum import Enum
from typing import List, Iterable, Optional, Dict
import demoji
# from transliterate import translit

In [2]:
df = pd.read_csv("data.csv")

In [61]:
class ExtraData(Enum):
    """Selects which piece of the structured review summary to extract.

    Members are the keys of the result dict built by
    ``parce_short_structured_description``; each member's ``value`` is the
    human-readable label used as the DataFrame column name for that piece.
    ("autor" is kept as spelled because callers reference the member by name.)
    """
    # Text of the summary entry marked with the "plus" icon.
    plus = "–ü–ª—é—Å—ã, –æ—Ç–º–µ—á–µ–Ω–Ω—ã–µ –∞–≤—Ç–æ—Ä–æ–º"
    # Text of the summary entry marked with the "minus" icon.
    minus = "–ú–∏–Ω—É—Å—ã, –æ—Ç–º–µ—á–µ–Ω–Ω—ã–µ –∞–≤—Ç–æ—Ä–æ–º"
    # Dict {category: mark} taken from the first summary-content block.
    autor_review = "–æ—Ü–µ–Ω–∫–∞ –∞–≤—Ç–æ—Ä–∞"
    # Dict {category: mark} taken from the second summary-content block.
    all_review = "–æ—Ü–µ–Ω–∫–∞ —Å—Ä–µ–¥–Ω–µ–≤–∑–≤–µ—à–µ–Ω–Ω–∞—è –ø–æ –≤—Å–µ–º —Ä–µ–≤—å—é"

def _summary_marks(soup, index):
    """Collect ``{category name: mark text}`` from the ``index``-th summary block.

    Returns an empty dict when the document contains fewer than ``index + 1``
    ``ReviewProsAndCons__summary__content`` blocks. List items that lack either
    the name span or the value span are skipped.
    """
    blocks = soup.find_all("div", class_="ReviewProsAndCons__summary__content")
    if len(blocks) <= index:
        return {}
    marks = {}
    for item in blocks[index].find_all('li', class_="ReviewProsAndCons__summary__item"):
        name = item.find("span", class_="ReviewProsAndCons__summary__name")
        value = item.find("span", class_="ReviewProsAndCons__summary__value")
        if name is None or value is None:
            continue
        marks[name.text] = value.text
    return marks


def parce_short_structured_description(data, get: ExtraData=ExtraData.plus):
    """Extract one piece of a review's structured pros/cons summary.

    Args:
        data: HTML markup of the structured summary.
        get: which piece to return (defaults to ``ExtraData.plus``).

    Returns:
        For ``ExtraData.plus`` / ``ExtraData.minus``: the text of the entry
        carrying the matching icon, or ``None`` when there is none.
        For ``ExtraData.autor_review`` / ``ExtraData.all_review``: a dict
        mapping category name to mark text, taken from the first / second
        summary-content block (empty when that block is absent).

    Sections missing from the markup yield these defaults instead of raising,
    so one malformed row does not abort a whole ``Series.apply``.
    """
    result = {ExtraData.plus: None, ExtraData.minus: None, ExtraData.autor_review: {}, ExtraData.all_review: {}}
    soup = BeautifulSoup(data, "html.parser")

    # The pros/cons texts are read from the second "features" block.
    feature_blocks = soup.find_all('div', class_="ReviewProsAndCons__features")
    if len(feature_blocks) > 1:
        for feature in feature_blocks[1].find_all('div', class_="ReviewProsAndCons__summary__text"):
            # When several entries carry the same icon, the last one wins.
            if feature.find('svg', class_="ReviewProsAndCons__summary__icon_plus"):
                result[ExtraData.plus] = feature.text
            elif feature.find('svg', class_="ReviewProsAndCons__summary__icon_minus"):
                result[ExtraData.minus] = feature.text

    result[ExtraData.autor_review] = _summary_marks(soup, 0)
    result[ExtraData.all_review] = _summary_marks(soup, 1)
    return result[get]


df["short_structured_description"].apply(parce_short_structured_description, get=ExtraData.all_review)

0      {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5,0', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,6', '–ë–µ–∑–æ...
1      {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5,0', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,6', '–ë–µ–∑–æ...
2      {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5,0', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,6', '–ë–µ–∑–æ...
3      {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5,0', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,6', '–ë–µ–∑–æ...
4      {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5,0', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,6', '–ë–µ–∑–æ...
                             ...                        
480    {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,5', '–ë–µ–∑–æ–ø–∞...
481    {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,5', '–ë–µ–∑–æ–ø–∞...
482    {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,5', '–ë–µ–∑–æ–ø–∞...
483    {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,5', '–ë–µ–∑–æ–ø–∞...
484    {'–í–Ω–µ—à–Ω–∏–π –≤–∏–¥': '5', '–ö–æ–º—Ñ–æ—Ä—Ç': '4,5', '–ë–µ–∑–æ–ø–∞...
Name: short_structured_description, Length: 485, dtype: object

In [62]:
def avg(x: List[float]) -> Optional[float]:
    """Return the arithmetic mean of ``x`` rounded to two decimals.

    ``x`` may be any iterable of numbers; it is materialised once.
    Returns ``None`` for an empty input.
    """
    values = list(x)
    if not values:
        return None
    mean = sum(values) / len(values)
    return round(mean, 2)

def to_int(x: Iterable[str]) -> List[float]:
    """Parse numeric strings into floats, dropping entries that do not parse.

    A decimal comma is accepted ("4,6" -> 4.6). Despite the name, the
    returned values are floats.
    """
    parsed = []
    for raw in x:
        try:
            number = float(raw.replace(",", '.') if "," in raw else raw)
        except ValueError:
            # Not a number (e.g. a placeholder string): leave it out.
            continue
        parsed.append(number)
    return parsed

# One derived column per ExtraData member, named by the member's label.
df[ExtraData.plus.value] = df["short_structured_description"].apply(parce_short_structured_description, get=ExtraData.plus)
df[ExtraData.minus.value] = df["short_structured_description"].apply(parce_short_structured_description, get=ExtraData.minus)
df[ExtraData.autor_review.value] = df["short_structured_description"].apply(parce_short_structured_description, get=ExtraData.autor_review)
df[ExtraData.all_review.value] = df["short_structured_description"].apply(parce_short_structured_description, get=ExtraData.all_review)
# Average the per-category marks from the dict columns created just above
# instead of parsing every row's HTML a second time.
df["—Å—Ä–µ–ª–Ω—è—è –ø–æ –≤—Å–µ–º –∫–∞—Ç–µ–≥–æ—Ä–∏—è–º –æ—Ü–µ–Ω–∏–≤–∞–Ω–∏—è –æ—Ü–µ–Ω–∫–∞ –∞–≤—Ç–æ—Ä–∞"] = df[ExtraData.autor_review.value].apply(lambda marks: avg(to_int(marks.values())))
df["—Å—Ä–µ–ª–Ω—è—è –ø–æ –≤—Å–µ–º –∫–∞—Ç–µ–≥–æ—Ä–∏—è–º –æ—Ü–µ–Ω–∏–≤–∞–Ω–∏—è –æ—Ü–µ–Ω–∫–∞ —Å—Ä–µ–¥–Ω–µ–≤–∑–≤–µ—à–µ–Ω–Ω–∞—è –ø–æ –≤—Å–µ–º —Ä–µ–≤—å—é"] = df[ExtraData.all_review.value].apply(lambda marks: avg(to_int(marks.values())))

In [63]:
class UserInfo(Enum):
    """Selects which piece of the reviewer block to extract.

    Members are the keys of the result dict built by ``parce_user_info``;
    each member's ``value`` is the label used as the DataFrame column name.
    """
    # Text of the "Review__userInfoYear" element.
    user_info_year = "–≥–æ–¥, –∫–æ–≥–¥–∞ —Ä–µ–≤—å—é–µ—Ä —Å–æ–∑–¥–∞–ª –∞–∫–∫–∞—É–Ω—Ç"
    # Text of the reviewer's profile link. The backslash is written as "\\"
    # so the literal no longer relies on an invalid escape sequence; the
    # runtime value still contains a single backslash.
    nick = "–Ω–∏–∫\\–∏–º—è —Ä–µ–≤—å—é–µ—Ä–∞"
    # href of the reviewer's profile link.
    link = "—Å—Å—ã–ª–∫–∞ –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü—É –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è"

def parce_user_info(data, get: UserInfo=UserInfo.user_info_year):
    """Extract one piece of the reviewer block.

    Args:
        data: HTML markup of the reviewer block.
        get: which piece to return. Defaults to ``UserInfo.user_info_year``;
            the previous default was the ``UserInfo`` class itself, which is
            not a key of the result dict and therefore always raised KeyError.

    Returns:
        The requested text (year, nick) or href (link), or ``None`` when the
        corresponding element is absent from the markup.
    """
    result = {UserInfo.user_info_year: None, UserInfo.nick: None, UserInfo.link: None}
    soup = BeautifulSoup(data, "html.parser")

    # Every field is best-effort: a missing element leaves its default None.
    year_block = soup.find("div", class_="Review__userInfoYear")
    if year_block is not None:
        result[UserInfo.user_info_year] = year_block.text

    profile_link = soup.find("a", class_="Link")
    if profile_link is not None:
        result[UserInfo.nick] = profile_link.text
        result[UserInfo.link] = profile_link.get("href")
    return result[get]


df["user_info"].apply(parce_user_info, get=UserInfo.nick)

0                  –ê–ª–µ–∫—Å–µ–π
1                  –î–º–∏—Ç—Ä–∏–π
2      –ö–æ—Ä–∏—á–Ω–µ–≤—ã–π –≥—Ä—É–∑–æ–≤–∏–∫
3            KobayashiMaru
4                  –î–º–∏—Ç—Ä–∏–π
              ...         
480                 –ù–∏–∫–æ–ª—å
481        –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –®–æ—à–∏–Ω
482                –ê–Ω–∑–æ—Ä–∏–∫
483     –í–∞—Å–∏–ª–µ–≤—Å–∫–∏–π –ù–∏–∫–∏—Ç–∞
484                  –û–ª—å–≥–∞
Name: user_info, Length: 485, dtype: object

In [64]:
# One derived column per UserInfo member, named by the member's label.
# The selector is handed to parce_user_info positionally through `args`.
df[UserInfo.user_info_year.value] = df["user_info"].apply(
    parce_user_info, args=(UserInfo.user_info_year,)
)
df[UserInfo.nick.value] = df["user_info"].apply(
    parce_user_info, args=(UserInfo.nick,)
)
df[UserInfo.link.value] = df["user_info"].apply(
    parce_user_info, args=(UserInfo.link,)
)

In [65]:
# The raw HTML columns are no longer needed once their fields are extracted.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df.drop(columns=['short_structured_description', 'user_info'], inplace=True, errors="ignore")
# Give the unnamed column from the CSV a label; rename already ignores a
# missing key, so it is safe to repeat.
df.rename(columns={"Unnamed: 0":"‚Ññ"}, inplace=True)

In [75]:
all_emijis = demoji.findall(" ".join(df["main_text"]))

In [76]:
all_emijis

{'üö´': 'prohibited',
 'üöÄ': 'rocket',
 'ü§£': 'rolling on the floor laughing',
 'üëé': 'thumbs down',
 'üòè': 'smirking face',
 'üíØ': 'hundred points',
 'üòÄ': 'grinning face',
 'üí£': 'bomb',
 'üëç': 'thumbs up',
 '‚úåÔ∏è': 'victory hand',
 '‚ò∫Ô∏è': 'smiling face',
 'üòÇ': 'face with tears of joy',
 'üçã': 'lemon',
 'üò≠': 'loudly crying face',
 'üòÑ': 'grinning face with smiling eyes',
 'üî•': 'fire',
 'üòé': 'smiling face with sunglasses',
 'üòÜ': 'grinning squinting face',
 'üòú': 'winking face with tongue',
 'üòÅ': 'beaming face with smiling eyes',
 'üò≥': 'flushed face',
 'üëè': 'clapping hands',
 'üòâ': 'winking face',
 'üëçüèª': 'thumbs up: light skin tone',
 'ü§≠': 'face with hand over mouth',
 'üò¢': 'crying face',
 'üöï': 'taxi',
 'üëå': 'OK hand',
 'üòá': 'smiling face with halo',
 'ü§¶üèΩ\u200d‚ôÄÔ∏è': 'woman facepalming: medium skin tone',
 'üòÖ': 'grinning face with sweat',
 'ü§î': 'thinking face'}

In [82]:
emijis_change = {
 'üö´': "–∑–∞–ø—Ä–µ—â–µ–Ω–Ω–æ",
 'üöÄ': "—Ä–∞–∫–µ—Ç–∞",
 'ü§£': "—Å–º–µ—à–Ω–æ –¥–æ —Å–ª–µ–∑",
 'üëé': "–æ—Ç—Å—Ç–æ–π",
 'üòè': "—É—Ö–º—ã–ª–∫–∞",
 'üíØ': "—Å—Ç–æ –±–∞–ª–ª–æ–≤",
 'üòÄ': "—É—Ö–º—ã–ª–∫–∞",
 'üí£': "–±–æ–º–±–∞",
 'üëç': "–∫—Ä—É—Ç–æ",
 '‚úåÔ∏è': "–∫–ª–∞—Å",
 '‚ò∫Ô∏è': "—É–ª—ã–±–∫–∞",
 'üòÇ': "—Ä–∞–¥–æ—Å—Ç–Ω–æ",
 'üçã': "–ª–µ–º–æ–Ω",
 'üò≠': "–≥—Ä—É—Å—Ç–Ω–æ",
 'üòÑ': "—Å–º–µ—à–Ω–æ",
 'üî•': "–∫–ª–∞—Å",
 'üòé': "–∫—Ä—É—Ç–æ",
 'üòÜ': "—Å–º–µ—à–Ω–æ",
 'üòú': '',
 'üòÅ': "—É–ª—ã–±–∫–∞",
 'üò≥': "—Å—Ç—Ä–∞–Ω–Ω–æ",
 'üëè': "–±—Ä–∞–≤–æ",
 'üòâ': '',
 'üëçüèª': "—Ö–æ—Ä–æ—à–æ",
 'ü§≠': "–∑–∞–±–∞–≤–Ω–æ",
 'üò¢': "–≥—Ä—É—Å—Ç–Ω–æ",
 'üöï':  "—Ç–∞–∫—Å–∏",
 'üëå': "—Ö–æ—Ä–æ—à–æ",
 'üòá': "—É–ª—ã–±–∫–∞",
 'ü§¶üèΩ\u200d‚ôÄÔ∏è': "–≥–ª—É–ø–æ—Å—Ç—å",
 'üòÖ': 'grinning face with sweat',
 'ü§î': 'thinking face'}

In [97]:
def replace_emoji(s:str, replace_map: Dict[str, str]) -> str:
    """Replace every emoji listed in ``replace_map`` with its text substitute.

    Args:
        s: text to process.
        replace_map: maps an emoji (one or several code points, e.g. with a
            skin-tone modifier or variation selector) to its replacement;
            an empty replacement removes the emoji.

    Returns:
        ``s`` with each mapped emoji replaced by the substitute padded with a
        space on both sides, and with runs of spaces collapsed to one space.

    Keys are matched as whole strings, longest first. Walking the text one
    code point at a time can never match a multi-code-point key and leaves
    stray modifier characters behind.
    """
    import re

    # Empty keys would make the pattern match at every position.
    keys = sorted((key for key in replace_map if key), key=len, reverse=True)
    if keys:
        pattern = re.compile("|".join(re.escape(key) for key in keys))
        s = pattern.sub(lambda found: " " + replace_map[found.group(0)] + " ", s)
    # Collapse whole runs of spaces; a single replace("  ", " ") pass leaves
    # doubles behind whenever three or more spaces meet.
    return re.sub(r" {2,}", " ", s)

# df["main_text"].apply(replace_emoji, replace_map=emijis_change).loc[12]

In [96]:
df["main_text_without_emoji"] =  df["main_text"].apply(replace_emoji, replace_map=emijis_change)

In [91]:
df["main_text"].loc[12]

'–¢–∞—á–∫–∞ –æ–≥–æ–Ω—å üöÄüí£. –î–∏–∑–∞–π–Ω–µ—Ä –ø–æ—Å—Ç–∞—Ä–∞–ª—Å—è. –ó–∞ 7 –¥–Ω–µ–π –æ–¥–Ω–∏ –≤–æ—Å—Ö–∏—â–µ–Ω–∏—è, —Å –∫–∞–∂–¥—ã–º –¥–Ω—ë–º –≤—Å–µ –Ω–æ–≤—ã–µ –∏ –Ω–æ–≤—ã–µ. üëèüëèüëè–û–¥–Ω–æ —Ç–æ–ª—å–∫–æ —Ä–∞–∑–¥—Ä–∞–∂–∞–µ—Ç, –≤ –∑–∞–¥–Ω—é—é –ø–æ–ª–∫—É –ø–æ—Å—Ç–∞–≤–ª–µ–Ω —Å–∞–±–≤—É—Ñ–µ—Ä Bose (–≤ –∂–µ–ª–µ–∑–æ), –∞ —Å–≤–µ—Ä—Ö—É –Ω–∞–∫—Ä—ã—Ç –ø–ª–∞—Å—Ç–∏–∫–æ–≤–æ–π –ø–æ–ª–∫–æ–π –∫–æ—Ç–æ—Ä–∞—è –∫–∞–ø–µ—Ü –∫–∞–∫ –≥—Ä–µ–º–∏—Ç. –ó–∞—á–µ–º –ø–æ—Å—Ç–∞–≤–∏–ª–∏ —Å–∏—Å—Ç–µ–º—É –±–æ—Å—Å, –µ—Å–ª–∏ —Å–ª—É—à–∞—Ç—å —Å —Ç–∞–∫–æ–π –ø–æ–ª–∫–æ–π –Ω–µ–≤–æ–∑–º–æ–∂–Ω–æüòÜüòÜüòÜ. –≠—Ç–æ –∑–∞ 2,15‚ÇΩ ü§≠ –≤—Ä–µ–º–µ–Ω–Ω–æ –ø–æ–¥–ª–µ—á–∏–ª –≤—Å—Ç–∞–≤–∏–≤ –º–µ–∂–¥—É –∑–∞–¥–Ω–∏–º —Å—Ç–µ–∫–ª–æ–º –∏ –ø–æ–ª–∫–æ–π —Å–≤–µ—Ä–Ω—É—Ç–æ–µ –º–∞–ª–µ–Ω—å–∫–æ–µ –ø–æ–ª–æ—Ç–µ–Ω—Ü–µ üòÇü§£üòÇ–ü–æ—Å–ª–µ –º–æ–π–∫–∏ –∏ –ø—Ä–æ—Ç–∏—Ä–∫–∏, –ø—Ä–∏ –µ–∑–¥–µ –∏–∑ —â–µ–ª–µ–π –ø–æ–ª—å—ë—Ç—Å—è –≤–æ–¥–∞. –ú–Ω–æ–≥–æ —â–µ–ª–µ–π –∏–∑ –∑–∞ –¥–µ–∫–æ—Ä–∞—Ü–∏–π. –î–ª—è –¥–∏–ª–µ—Ç–∞–Ω—Ç–æ–≤ –∏–∑ –∫–æ–º–µ–Ω—Ç–æ–≤. 1.  –†–∞—Å—Ö–æ–¥ –ø–æ –ø—

In [99]:
df.to_excel('data2.xlsx', index = False)

In [11]:
df[["mark", "model"]].drop_duplicates().values.tolist()

[['Kia', 'K5'],
 ['Kia', 'Stinger'],
 ['Toyota', 'Corolla'],
 ['Toyota', 'Camry'],
 ['Skoda', 'Octavia'],
 ['Skoda', 'Superb'],
 ['Volkswagen', 'Passat (North America and China)'],
 ['Honda', 'Accord'],
 ['Audi', 'A3'],
 ['Hyundai', 'Elantra'],
 ['Mazda', '6'],
 ['JAC', 'J7'],
 ['OMODA', 'S5'],
 ['Chery', 'Arrizo 8'],
 ['Changan', 'UNI-V']]

In [23]:
df3 = pd.read_csv("data3.csv")

In [22]:
df2 = pd.read_csv("data2.csv")

In [25]:
df1 = pd.read_excel("data2.xlsx")

In [29]:
# xlwriter = pd.ExcelWriter('output.xlsx')
# df1.to_excel(xlwriter, sheet_name='–∞–≤—Ç–æ—Ä—É') 
# df2.to_excel(xlwriter,sheet_name='–¥—Ä–æ–º')
# df3.to_excel(xlwriter,sheet_name='–∞–≤–∏—Ç–æ')
# xlwriter.close()