# Подготовка данных

In [51]:
from sklearn.model_selection import train_test_split 
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import collections

from argparse import Namespace

In [39]:
tqdm.pandas()

  from pandas import Panel


Запишем параметры для возможной работы из командной строки для быстрого преобразования данных

In [40]:
args = Namespace(
    raw_train_dataset_csv="data/yelp/raw_train.csv",
    raw_test_dataset_csv="data/yelp/raw_test.csv",
    train_proportion=0.7,
    val_proportion=0.3,
    output_munged_csv="data/yelp/reviews_with_splits_full.csv",
    seed=1337
)

Прочтем обучающие данные:

In [41]:
train_reviews = pd.read_csv(args.raw_train_dataset_csv, 
                            header=None, 
                            names=['rating', 'review'])

In [42]:
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [43]:
train_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560000 entries, 0 to 559999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   rating  560000 non-null  int64 
 1   review  560000 non-null  object
dtypes: int64(1), object(1)
memory usage: 8.5+ MB


In [44]:
# если вдруг появятся пустые комменты
train_reviews = train_reviews[~pd.isnull(train_reviews.review)]

Прочтем тестовые данные:


In [45]:
test_reviews = pd.read_csv(args.raw_test_dataset_csv, header=None, names=['rating', 'review'])

In [46]:
test_reviews.head()

Unnamed: 0,rating,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


In [47]:
test_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38000 entries, 0 to 37999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rating  38000 non-null  int64 
 1   review  38000 non-null  object
dtypes: int64(1), object(1)
memory usage: 593.9+ KB


In [48]:
# если вдруг появятся пустые комменты
test_reviews = test_reviews[~pd.isnull(test_reviews.review)]

In [49]:
# уникальные классы
train_reviews['rating'].unique()

array([1, 2], dtype=int64)

In [13]:
list(train_reviews.iterrows())

<generator object DataFrame.iterrows at 0x00000000121BD248>

In [53]:
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())

In [71]:
# создаем разделение данных
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):
    np.random.shuffle(item_list)
    
    # создадим размеры выборок
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    
    # добавим метки для валидационной и обучающей выборки
    for item in item_list[:n_train]:
        item['split'] = 'train'
        
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
        
    final_list.extend(item_list)

In [74]:
for _, row in test_reviews.iterrows():
    row_dict = row.to_dict()
    row_dict['split'] = 'test'
    final_list.append(row_dict)

Посмотрим на разделенные на выборки данные и сохраним их в файл

In [75]:
final_reviews = pd.DataFrame(final_list)

In [76]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,1,Manuel's is better than your Serrano's or Garc...,train
1,1,"Yes folks, the chicken is good when they have ...",train
2,1,Horrible service in every area today. We waite...,train
3,1,If sweaty bodies turn you on\nAnd you enjoy be...,train
4,1,What kind of dealership says it's illegal to b...,train


In [77]:
final_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598000 entries, 0 to 597999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   rating  598000 non-null  int64 
 1   review  598000 non-null  object
 2   split   598000 non-null  object
dtypes: int64(1), object(2)
memory usage: 13.7+ MB


In [78]:
final_reviews.split.value_counts()

train    392000
val      168000
test      38000
Name: split, dtype: int64

In [None]:
# преобразуем текст
def preprocess_text(text):
    if type(text) == float:
        print(text)
    text = text.lower()
    text = re.sub(r'([.,!?])', r' \1 ', text)
    text = re.sub(r'[^a-zA-Z.,!?]+', r' ', text)
    return text

In [79]:
final_reviews.review = final_reviews.review.progress_apply(preprocess_text)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 598000/598000 [00:41<00:00, 14299.25it/s]


In [80]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,1,manuel s is better than your serrano s or garc...,train
1,1,"yes folks , the chicken is good when they have...",train
2,1,horrible service in every area today . we wait...,train
3,1,if sweaty bodies turn you on nand you enjoy be...,train
4,1,what kind of dealership says it s illegal to b...,train


In [81]:
# сохраним в файл
final_reviews.to_csv(args.output_munged_csv, index=False)

In [1]:
final_reviews

NameError: name 'final_reviews' is not defined