# Иллюстрация загрузки и подготовки данных Amazon

### $\textbf{Содержание}$:

### $\textbf{I. Загрузка данных}$
### Данные загружаются из сети в форме .gz архива.
------

### $\textbf{II. Очистка данных}$
### Из каждой записи txt-файла извлекаются атрибуты, которые преобразуются в следующие признаки:
#### - $\it{item\_id} \in \mathbb{N}$: идентификатор предложения; 
#### - $\it{user\_id} \in \mathbb{N}$: идентификатор пользователя;
#### - $\it{helpfulness} \in \mathbb{R}^+$: полезность отзыва — отношение $i/j$, вычисляемое из исходного значения вида «i/j»;
#### - $\it{score} \in [1, 5]$: оценка, поставленная пользователем предложению;
#### - $\it{timestamp} \in \mathbb{N}$: временная метка;
#### - $\it{review}$: полный текст отзыва клиента на предложение;
------

### I. Загрузка данных

In [1]:
import pandas as pd
import numpy as np

import requests
import gzip
from tqdm.notebook import tqdm
import re

In [2]:
# URL of the gzip-compressed dump that is downloaded by save_archive.
archives_urls = "http://snap.stanford.edu/data/amazon/all.txt.gz"
# Base name of the text file: the download cell stores the archive as
# file_name + ".gz", and clean_data is later called with file_name itself.
file_name = "all.txt"
# Path the cleaned DataFrame is written to with DataFrame.to_parquet.
file_name_parquet = "all.parquet"

In [3]:
def save_archive(
    url: str,
    local_filename: str,
    chunk_size: int = 1024 * 1024,
    timeout: float = 60,
) -> None:
    """
    Download ``url`` over HTTP and write the raw response body to disk.

    The body is streamed, so the whole file is never held in memory.

    :param url: url address
    :type url: str
    :param local_filename: filename for data to be saved
    :type local_filename: str
    :param chunk_size: number of bytes requested from the response per
        iteration (one progress-bar tick per chunk)
    :type chunk_size: int
    :param timeout: timeout in seconds passed to ``requests.get``
    :type timeout: float
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    print(url)
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as req:
        # Fail loudly instead of saving an HTML error page as the archive.
        req.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in tqdm(req.iter_content(chunk_size=chunk_size)):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    print("Done")

In [4]:
# Download the dump and store it under file_name + ".gz" (a relative path).
# NOTE(review): no decompression step is visible in this notebook, yet
# clean_data is later called with file_name (without ".gz") — presumably the
# archive is unpacked manually in between; confirm.
save_archive(archives_urls, file_name + r".gz")

http://snap.stanford.edu/data/amazon/all.txt.gz


0it [00:00, ?it/s]

Done


### II. Очистка данных

In [3]:
def helpfulness_procc(x):
    """
    Convert a textual "i/j" helpfulness value into the ratio i / j.

    The first ``<digits>/<digits>`` occurrence found in ``x`` is used.
    ``0.0`` is returned when no such pattern is present or when either
    number is zero (which also guards against division by zero).

    :param x: helpfulness value
    :type x: str
    :return: i / j rounded to 3 decimal places, or 0.0
    :rtype: float
    """
    match = re.search(r"(\d+)/(\d+)", x)
    if match is None:
        return 0.0

    numerator = float(match.group(1))
    denominator = float(match.group(2))
    if numerator == 0 or denominator == 0:
        return 0.0

    return round(numerator / denominator, 3)

In [4]:
def clean_data(
    file_name: str,
    chunck_size: int = 10000
) -> pd.DataFrame:
    """
    Parse a text dump made of 11-line records into a cleaned DataFrame.

    Seven lines of every record are used. From each of them a fixed-length
    leading label and the trailing newline character are cut off. Rows whose
    item or user identifier equals "unknown" are dropped, identifiers are
    replaced by integer category codes, helpfulness is converted with
    ``helpfulness_procc`` and summary + text are merged into ``review``.

    :param file_name: path to the (already unpacked) text file
    :type file_name: str
    :param chunck_size: number of records converted per step
    :type chunck_size: int
    :return: frame with columns item_id, user_id, helpfulness, score,
        timestamp, review
    :rtype: pd.DataFrame
    :raises ValueError: if ``chunck_size`` is not positive
    """
    if chunck_size <= 0:
        raise ValueError("chunck_size must be a positive integer")

    with open(file_name) as f:
        lines = f.readlines()
    print(f"Rows count: {len(lines)}")

    # column -> (position of its line inside an 11-line record,
    #            number of leading characters to drop).
    # NOTE(review): the offsets presumably equal the lengths of labels such as
    # "product/productId: " / "review/userId: " — confirm against the file.
    layout = {
        "item_id": (0, 19),
        "user_id": (3, 15),
        "helpfulness": (5, 20),
        "score": (6, 14),
        "timestamp": (7, 13),
        "summary": (8, 16),
        "text": (9, 13),
    }

    df = pd.DataFrame(
        {column: lines[position::11] for column, (position, _) in layout.items()}
    )
    # The unused lines of every record are no longer needed; release them.
    del lines

    #### Data conversion, chunk by chunk
    cleaned_chunks = []
    # max(..., 1) keeps one (empty) pass for empty input so that the result
    # still has the expected columns and dtypes.
    for start in range(0, max(len(df), 1), chunck_size):
        raw = df.iloc[start:start + chunck_size]
        # Build a fresh frame instead of assigning into a slice of ``df``;
        # [skip:-1] drops the label and the trailing newline.
        chunk = pd.DataFrame(
            {column: raw[column].str[skip:-1] for column, (_, skip) in layout.items()}
        )
        chunk["score"] = chunk["score"].astype(float)
        chunk["timestamp"] = chunk["timestamp"].astype(int)

        known = (chunk["item_id"] != "unknown") & (chunk["user_id"] != "unknown")
        cleaned_chunks.append(chunk[known])
    del df

    # A single concat replaces the removed, quadratic DataFrame.append loop.
    df_ = pd.concat(cleaned_chunks, ignore_index=True)

    #### Categorical features: identifiers -> integer codes
    df_["item_id"] = df_["item_id"].astype('category').cat.codes
    df_["user_id"] = df_["user_id"].astype('category').cat.codes

    #### helpfulness feature
    df_["helpfulness"] = df_["helpfulness"].apply(helpfulness_procc)

    #### Text features (merged into a single text)
    df_["review"] = df_["summary"] + " " + df_["text"]
    df_ = df_.drop(["summary", "text"], axis=1)

    return df_

In [9]:
# Parse the text file into the cleaned DataFrame and show its size and
# first rows.
df = clean_data(file_name)
print(f"DataFrame shape: {df.shape}")
df.head()

Rows count: 381554470
DataFrame shape: (29667966, 6)


Unnamed: 0,item_id,user_id,helpfulness,score,timestamp,review
0,1208658,4780343,1.0,4.0,1182816000,Periwinkle Dartmouth Blazer I own the Austin R...
1,1904404,5567478,0.0,5.0,1262304000,Great fun! Got these last Christmas as a gag g...
2,1904404,4662915,0.0,3.0,1224633600,more like funchuck Gave this to my dad for a g...
3,537382,6416765,1.0,4.0,940636800,Nice collection of Julie Strain images This is...
5,812832,2763196,0.0,5.0,1332288000,Great CD My lovely Pat has one of the GREAT vo...


In [None]:
# Persist the cleaned DataFrame to file_name_parquet in Parquet format.
df.to_parquet(file_name_parquet)