In [1]:
!pip install pymorphy3

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import pymorphy3
import warnings

from dateutil import parser
from torch.utils.data import DataLoader, random_split, Dataset
from sklearn.metrics import f1_score, roc_auc_score
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from collections import defaultdict, OrderedDict

In [4]:
def preprocess_triplets(res_df, date_df, triplets_columns="triplets", date_column="timestamp", morph=True):
    """Flatten per-row triplet annotations into one DataFrame row per triplet.

    Every row selected by ``res_df[triplets_columns]`` must unpack into three
    values ``(text, label_ix, label)``. ``label_ix`` and ``label`` are strings
    that are evaluated with ``eval``; the two resulting sequences are zipped.
    Each ``label`` item is expected to unpack into
    ``(aspect, opinion, sentiment)`` and the first element of the matching
    ``label_ix`` item is stored as ``aspect_ix``.

    Args:
        res_df: DataFrame holding the text and annotation columns.
        date_df: DataFrame holding the date column; its rows are matched to
            the rows of ``res_df`` by position.
        triplets_columns: Selector passed to ``res_df[...]``.
        date_column: Name of the column in ``date_df`` whose values are parsed
            with ``dateutil.parser.isoparse``.
        morph: If true, aspects are lower-cased and replaced by the normal
            form of their first pymorphy3 parse; otherwise they are only
            lower-cased. Opinions are always only lower-cased.

    Returns:
        DataFrame with the columns ``date``, ``text``, ``aspect``,
        ``aspect_ix``, ``opinion``, ``sentiment``, ``triplet``
        (``(aspect, opinion, sentiment)``), ``aspect_opinion``
        (``(aspect, opinion)``) and ``timestamp`` (POSIX timestamp of
        ``date``). An input without any triplets yields an empty frame with
        the same columns.
    """
    columns = ["date", "text", "aspect", "aspect_ix", "opinion", "sentiment", "triplet", "aspect_opinion"]

    annotation_rows = res_df[triplets_columns].values.tolist()
    dates = date_df[date_column].values.tolist()

    all_triplets = []
    for i, (text, label_ix, label) in enumerate(annotation_rows):
        # SECURITY NOTE(review): eval() executes whatever expression the cell
        # contains. Only feed this function files you trust, or switch to
        # ast.literal_eval once it is confirmed the cells hold plain literals.
        for item_ix, item in zip(eval(label_ix), eval(label)):
            all_triplets.append([text] + list(item) + [list(item_ix)[0]] + [dates[i]])

    if morph:
        morph_parser = pymorphy3.MorphAnalyzer()
        lemma_cache = {}

        def normalize_aspect(aspect):
            # The same aspect word recurs many times; analyse each distinct
            # lower-cased form only once.
            key = aspect.lower()
            if key not in lemma_cache:
                lemma_cache[key] = morph_parser.parse(key)[0].normal_form
            return lemma_cache[key]
    else:
        def normalize_aspect(aspect):
            return aspect.lower()

    data = []
    for txt, at, ot, sp, at_ix, ts in tqdm(all_triplets):
        at = normalize_aspect(at)
        ot = ot.lower()
        data.append([ts, txt, at, at_ix, ot, sp, (at, ot, sp), (at, ot)])

    # Passing the column names to the constructor (rather than assigning
    # ``.columns`` afterwards) keeps this working when there are no rows.
    data = pd.DataFrame(data, columns=columns)

    data["timestamp"] = data["date"].apply(lambda x: parser.isoparse(x).timestamp())

    return data

In [57]:
# Load the raw results and flatten them into one row per triplet; the same
# frame supplies both the annotation columns and the "time" column.
df = pd.read_csv("data/results_sravni (2).csv")
data = preprocess_triplets(
    res_df=df,
    date_df=df,
    triplets_columns=["text", "pred", "pred_text"],
    date_column="time",
)

100%|██████████| 48214/48214 [00:07<00:00, 6649.92it/s]


In [58]:
# Rebinds `df` to a second CSV; whatever `df` referred to before this cell is
# no longer reachable through that name.
df = pd.read_csv("data/1k_selling_pandas.csv")

In [59]:
# Frequency of each distinct raw value in the "json_response" column.
df["json_response"].value_counts()

Информативно, Обслуживание                                                                                                                                                             551
Информативно, Общее впечатление о банке                                                                                                                                                133
Информативно, Банковские условия                                                                                                                                                        97
"Информативно, Обслуживание"                                                                                                                                                            70
Информативно, Интерфейс                                                                                                                                                                 40
"Информативно, Общее впечатление о банке"                        

In [60]:
def sep_0(s):
    """Return the first ", "-separated field of a raw response string.

    Quote characters and the known answer prefixes are removed first; the
    text before the first ", " is then returned (the whole cleaned string
    when it contains no ", ").
    """
    cleaned = s
    for fragment in ("'", '"', "Ответ: ", "аспект: ответ\nотзыв: ", "Твой ответ: "):
        cleaned = cleaned.replace(fragment, "")
    head, _, _ = cleaned.partition(", ")
    return head

def sep_1(s):
    """Return the second ", "-separated field (the category) of a response.

    Quote characters and the "Ответ: " / "Твой ответ: " prefixes are removed
    before splitting. Returns "-1" when the cleaned string has no ", "
    separator, "-2" when the second field is not a key of the module-level
    ``label2id`` mapping, and the field itself otherwise.
    """
    cleaned = s
    for fragment in ("'", '"', "Ответ: ", "Твой ответ: "):
        cleaned = cleaned.replace(fragment, "")
    fields = cleaned.split(", ")

    # No separator at all: the response carries no category.
    if len(fields) == 1:
        return "-1"

    category = fields[1]
    return category if category in label2id else "-2"
    
# Category name -> integer id, numbered in listing order. "-1" and "-2" are
# the sentinel strings sep_1 returns for a missing and an unrecognised
# category respectively.
label2id = {
    name: idx
    for idx, name in enumerate(
        (
            "Обслуживание",
            "Банковские условия",
            "Общее впечатление о банке",
            "Функционал",
            "Интерфейс",
            "Конкуренты",
            "-1",
            "-2",
        )
    )
}
# Keep only the first `n` rows of `data` and attach the parsed labels.
n = 1000
# .copy() makes the subset an independent frame, so the column assignments
# below do not write into a slice of the larger frame
# (avoids pandas' SettingWithCopyWarning).
data = data[:n].copy()
# Column assignment aligns on index labels.
# NOTE(review): assumes `data` and `df` share the same default index so that
# each response lines up with its row — confirm.
data["info"] = df["json_response"].apply(sep_0).map({"Информативно" : 1, "Неинформативно" : 0})
data["cat"] = df["json_response"].apply(sep_1).map(label2id)

In [67]:
# Inverse mapping (integer id -> category name), displayed for reference.
{idx: name for name, idx in label2id.items()}

{0: 'Обслуживание',
 1: 'Банковские условия',
 2: 'Общее впечатление о банке',
 3: 'Функционал',
 4: 'Интерфейс',
 5: 'Конкуренты',
 6: '-1',
 7: '-2'}

In [61]:
# Distinct values present in the "cat" column.
data["cat"].unique()

array([0, 1, 6, 2, 3, 4, 5, 7], dtype=int64)

In [62]:
# Remove rows whose category was not recognised (sentinel "-2"). The id is
# looked up in label2id instead of hard-coding 7, and the result is bound to
# `data` rather than mutating the frame in place.
data = data.drop(index=data[data["cat"] == label2id["-2"]].index)

In [64]:
# Persist `data` as a parquet file; index=False leaves the DataFrame index
# out of the written file.
data.to_parquet("1k_all.parquet", index=False)

In [53]:
# Number of rows currently in `data`.
# NOTE(review): this cell's execution count (53) is lower than that of the
# cells above it, so the recorded output may be stale — re-run to confirm.
len(data)

997

In [120]:
# Spot-check sep_0 on the raw response stored under row label 944.
# NOTE(review): the recorded output still shows the "аспект: ответ\nотзыв: "
# prefix that the current sep_0 strips, so it appears to predate that
# definition — re-run to confirm.
sep_0(df.loc[944, "json_response"])

'аспект: ответ\nотзыв: Информативно'