# Иллюстрация предобработки признаков для датасета Netflix
<blockquote>
    <p>Показанная ниже генерация признаков вынесена в <a href="https://github.com/AlgoMathITMO/sber-simulator/blob/experiments-new/experiments/Netflix/netflix.py" title="netflix.py">файл</a>, 
        и выполняется вместе с разбиением данных для последующего применения в экспериментах в 
        <a href="https://github.com/AlgoMathITMO/sber-simulator/blob/experiments-new/experiments/Netflix/train_test_split.ipynb" title="train_test_split">ноутбуке</a>.</p>
</blockquote>

### $\textbf{Содержание}$:


### $\textbf{I. Генерация признаков}$
### Из названий фильмов генерируются следующие признаки фильмов:
#### - среднее арифметическое векторных представлений названия фильма $\it{w2v}=\{\it{w2v}_i\}_{i=0}^{299} \in \mathbb{R}^{300}$;

### Из рейтингов фильмов генерируются следующие признаки:
#### - количество оцененных пользователем фильмов $\it{rating\_cnt} \in \mathbb{N} \cup \{0\}$;

### Из признаков фильмов $\{\it{w2v}, \it{rating\_avg}\}$ генерируются следующие признаки пользователей:
#### - среднее арифметическое векторных представлений названий фильмов, оцененных пользователем $\it{w2v}=\{\it{w2v}_i\}_{i=0}^{299} \in \mathbb{R}^{300}$;
#### - средняя оценка фильмов, с которыми взаимодействовал пользователь $\it{rating\_avg} \in [0, 5]$;

In [1]:
import pandas as pd
import numpy as np

import re
from datetime import datetime

import tqdm

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer
from nltk.tokenize import sent_tokenize

# Fetch the NLTK resources that the text-processing helpers below rely on.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('words')
nltk.download("wordnet")

# Lower-cased English vocabulary; procces_text keeps only tokens found here.
words = {entry.lower() for entry in nltk.corpus.words.words()}

# English stop-words; procces_text drops tokens found here.
stop_words = set(stopwords.words("english"))

In [None]:
import gensim
from gensim.downloader import load
from gensim.models import Word2Vec

# Pretrained Google News word2vec vectors, obtained through gensim's downloader.
w2v_model = load('word2vec-google-news-300')

### I. Генерация признаков

In [None]:
# Locations of the cleaned movies and ratings tables (CSV files).
movies_path = r'./data_clean/movies.csv'
ratings_path = r'./data_clean/rating.csv'

In [38]:
# Load both tables into pandas DataFrames.
df_movies = pd.read_csv(movies_path)
df_rating = pd.read_csv(ratings_path)

In [27]:
def clean_text(text: str) -> str:
    """
    Normalize raw text: keep Latin letters only, squeeze whitespace, lower-case
    :param text: raw text
    :type text: str
    :return: cleaned text
    :rtype: str
    """

    # Everything that is not a Latin letter becomes a blank.
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    # split()/join collapses runs of blanks and trims both ends in one pass.
    return " ".join(letters_only.split()).lower()


def procces_text(text):
    """
    Tokenize cleaned text, drop stop-words, lemmatize, keep vocabulary words
    :param text: cleaned text
    :type text: str
    :return: processed text, remaining lemmas joined by single spaces
    :rtype: str
    """
    wordnet = WordNetLemmatizer()

    kept = []
    for token in nltk.word_tokenize(text):
        # Stop-words are filtered before lemmatization.
        if token in stop_words:
            continue
        lemma = wordnet.lemmatize(token)
        # Only lemmas present in the module-level ``words`` set survive.
        if lemma in words:
            kept.append(lemma)

    return " ".join(kept)

def string_embedding(string: str) -> np.ndarray:
    """
    Embed a string as the average of the word2vec vectors of its words
    :param string: cleaned and processed text, words separated by single spaces
    :type string: str
    :return: mean of the vectors of the words found in ``w2v_model``;
        a 1-D zero vector of length 300 when none of the words is found
    :rtype: np.ndarray
    """
    embedding_dim = 300

    total = 0
    found = 0
    for word in string.split(' '):
        try:
            word_vector = w2v_model[word]
        except KeyError:
            # Word missing from the model: it does not contribute to the mean.
            continue
        # ``total + vector`` allocates a new array, so the model's own
        # vectors are never modified in place.
        total = total + word_vector
        found += 1

    if found == 0:
        # 1-D like the averaged case, so every result has the same layout
        # when the vectors are stacked into DataFrame rows.
        return np.zeros(embedding_dim)
    return total / found

def group_w2v(history: pd.DataFrame, movies: pd.DataFrame) -> pd.DataFrame:
    """
    Average movie feature columns over each user's watch history.
    :param history: users movies history data; its ``user_Id`` and
        ``movie_Id`` columns are read
    :type history: pd.DataFrame
    :param movies: movies data; ``movie_Id`` is used for matching and the
        columns from position 4 onwards are averaged
    :type movies: pd.DataFrame
    :return: one row per user with the 300 averaged columns plus ``user_Id``
    :rtype: pd.DataFrame
    """
    unique_users = history.user_Id.unique()

    user_ids = []
    user_vectors = np.zeros((len(unique_users), 300))

    for row in tqdm.tqdm_notebook(range(len(unique_users))):
        current_user = unique_users[row]
        # Movies that appear in this user's history.
        watched_ids = history[history.user_Id == current_user].movie_Id
        watched_movies = movies[movies.movie_Id.isin(watched_ids)]
        # The first four columns are skipped; the rest is averaged column-wise.
        mean_vector = np.asarray(watched_movies.iloc[:, 4:]).mean(axis=0)

        user_ids.append(current_user)
        user_vectors[row] = mean_vector

    result = pd.DataFrame(user_vectors)
    result['user_Id'] = user_ids

    return result

In [32]:
# Clean and process every title, then replace the raw ``title`` column with the result.
cleaned_titles = df_movies["title"].apply(lambda raw_title: procces_text(clean_text(raw_title)))
df_movies['clean_title'] = cleaned_titles
df_movies.drop(columns=["title"], inplace=True)
df_movies.head()

Unnamed: 0,movie_Id,rating_cnt,rating_avg,year,clean_title
0,1,547,3.749543,2003,dinosaur planet
1,2,145,3.558621,2004,isle man review
2,3,2012,3.641153,1997,character
3,4,142,2.739437,1994,paula get dance
4,5,1140,3.919298,2004,rise fall


Генерация признаков фильмов

In [33]:
# One column per embedding component: w2v_0 ... w2v_299.
w2v_columns = ['w2v_' + str(i) for i in range(300)]
title_vectors = pd.DataFrame(
    df_movies.clean_title.apply(string_embedding).to_list(),
    columns=w2v_columns,
)
# Swap the textual title for its embedding columns.
df_movies_clean = pd.concat([df_movies.drop(columns=["clean_title"]), title_vectors], axis=1)
df_movies_clean.head()

Unnamed: 0,movie_Id,rating_cnt,rating_avg,year,w2v_0,w2v_1,w2v_2,w2v_3,w2v_4,w2v_5,...,w2v_290,w2v_291,w2v_292,w2v_293,w2v_294,w2v_295,w2v_296,w2v_297,w2v_298,w2v_299
0,1,547,3.749543,2003,0.0791626,0.141602,0.0653076,0.144043,-0.169922,-0.00537109,...,-0.170227,0.147827,-0.256836,0.119385,0.0618286,-0.0197067,-0.0344849,0.204407,0.0576172,-0.0251923
1,2,145,3.558621,2004,-0.0749512,0.0182292,-0.084554,0.0145671,0.0375163,-0.0357259,...,0.108398,-0.0535482,-0.0673828,0.0973307,-0.023112,-0.0393066,-0.146362,-0.221191,-0.023763,0.0696615
2,3,2012,3.641153,1997,0.257812,-0.0258789,-0.00357056,0.0163574,-0.0544434,0.289062,...,-0.180664,0.208984,-0.235352,-0.283203,-0.188477,0.0142822,0.143555,-0.0393066,-0.120605,0.041748
3,4,142,2.739437,1994,0.0388997,-0.162272,-0.00537109,0.194661,-0.0142822,-0.00219727,...,-0.00626628,0.115234,-0.116211,-0.00423177,0.034078,0.0113932,0.0836182,-0.107096,0.0487976,0.0219727
4,5,1140,3.919298,2004,-0.00842285,0.0577393,-0.145508,0.188477,-0.142578,-0.101318,...,0.00756836,-0.0305786,-0.166504,-0.0996094,-0.0799561,-0.325195,-0.132568,0.020874,0.1427,0.0246277


Генерация признаков пользователей

In [35]:
# Movie features without the release year; every column except the id is cast to float.
movies_vector = df_movies_clean.drop(columns=['year'])
for col in movies_vector.columns.drop("movie_Id"):
    movies_vector[col] = movies_vector[col].astype('float')

# Accumulators filled by the chunked aggregation that follows.
agg_columns = []
df_result = pd.DataFrame()

In [39]:
# Aggregate ratings per user in fixed-size slices of df_rating, accumulating
# per-user sums in df_result.
chunksize = 10000
# Ceiling division: number of slices needed to cover every row of df_rating.
chunk_count = (df_rating.shape[0] + chunksize - 1) // chunksize
for idx in tqdm.tqdm_notebook(range(chunk_count)):
    chunk = df_rating.iloc[idx*chunksize:(idx+1)*chunksize, :]
    df_history = pd.merge(chunk[['user_Id', 'movie_Id', 'rating']], movies_vector.movie_Id, on='movie_Id', how='left')
    # Attach the movie feature columns to every rating row.
    df_history = pd.merge(df_history, movies_vector, how='left', on='movie_Id').drop('movie_Id', axis=1)
    # Each rating row counts once; summed per user this is the number of ratings.
    df_history['cnt'] = 1

    if idx == 0:
        # Everything except the user id is summed.
        agg_columns = df_history.drop(['user_Id'], axis=1).columns
    df_history_aggregated = df_history.groupby("user_Id", as_index=False)[agg_columns].sum()
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df_result = pd.concat([df_result, df_history_aggregated], ignore_index=True)

    if idx % 20 == 0:
        # Periodically collapse the partial sums so df_result keeps one row per user.
        df_result = df_result.groupby("user_Id", as_index=False)[agg_columns].sum()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/10049 [00:00<?, ?it/s]

In [40]:
# Final collapse of the per-user partial sums left by the chunk loop.
df_result = df_result.groupby("user_Id", as_index=False)[agg_columns].sum()
# Turn sums into means: divide every aggregated column except the counter by the counter.
for col in agg_columns:
    if col == "cnt":
        continue
    df_result[col] = df_result[col] / df_result["cnt"]
# NOTE(review): movies_vector still carries the movie-level rating_cnt / rating_avg
# columns, so this rename appears to leave two columns with each of those labels
# (the displayed table shows "rating_avg.1" / "rating_cnt.1") - confirm this is intended.
user_column_names = {"rating": "rating_avg", "cnt": "rating_cnt"}
df_users_clean = df_result = df_result.rename(columns=user_column_names)
df_users_clean.head()

Unnamed: 0,user_Id,rating_avg,rating_cnt,rating_avg.1,w2v_0,w2v_1,w2v_2,w2v_3,w2v_4,w2v_5,...,w2v_291,w2v_292,w2v_293,w2v_294,w2v_295,w2v_296,w2v_297,w2v_298,w2v_299,rating_cnt.1
0,6,3.41853,55935.878594,3.682024,0.047119,0.038021,-0.008658,0.061763,-0.006199,0.011819,...,0.001927,-0.110658,-0.009286,-0.01813,-0.04322,-0.039856,-0.048123,0.006335,0.016588,626
1,7,4.011351,52382.61521,3.621344,0.039846,0.035202,-0.000552,0.065487,-0.009792,0.003091,...,0.011999,-0.103366,-0.01365,-0.021188,-0.039596,-0.028995,-0.047761,0.006293,0.020549,881
2,8,4.214286,106124.357143,3.576218,0.029219,0.046266,0.028962,0.060885,-0.016917,0.019689,...,-0.008831,-0.123834,-0.009376,-0.031162,-0.042404,-0.042829,-0.04536,-0.004269,0.017736,98
3,10,3.392308,67170.223077,3.581458,0.045342,0.028131,0.005163,0.062421,-0.017837,0.006805,...,0.006446,-0.117563,-0.005086,-0.029055,-0.038691,-0.02202,-0.047071,0.000898,0.022258,260
4,25,3.481481,91843.074074,3.659802,0.050034,0.038392,0.023527,0.046617,0.021565,-0.016197,...,-0.030786,-0.158192,-0.064261,0.001065,0.007731,0.04297,-0.057525,-0.00065,-0.050495,27


In [41]:
# Per-movie mean rating and number of ratings, computed from the ratings table.
rating_stats = (
    df_rating.groupby("movie_Id", as_index=False)["rating"]
    .agg(['mean', 'count'])
    .rename(columns={"mean": "rating_avg", "count": "rating_cnt"})
)
# NOTE(review): df_movies_clean already has rating_avg / rating_cnt columns, so the
# merge yields *_x / *_y pairs (see the displayed table) - confirm this is intended.
df_movies_clean = pd.merge(rating_stats, df_movies_clean, how='left', on='movie_Id').fillna(0.0)
df_movies_clean.head()

Unnamed: 0,movie_Id,rating_avg_x,rating_cnt_x,rating_cnt_y,rating_avg_y,year,w2v_0,w2v_1,w2v_2,w2v_3,...,w2v_290,w2v_291,w2v_292,w2v_293,w2v_294,w2v_295,w2v_296,w2v_297,w2v_298,w2v_299
0,1,3.749543,547,547,3.749543,2003,0.0791626,0.141602,0.0653076,0.144043,...,-0.170227,0.147827,-0.256836,0.119385,0.0618286,-0.0197067,-0.0344849,0.204407,0.0576172,-0.0251923
1,2,3.558621,145,145,3.558621,2004,-0.0749512,0.0182292,-0.084554,0.0145671,...,0.108398,-0.0535482,-0.0673828,0.0973307,-0.023112,-0.0393066,-0.146362,-0.221191,-0.023763,0.0696615
2,3,3.641153,2012,2012,3.641153,1997,0.257812,-0.0258789,-0.00357056,0.0163574,...,-0.180664,0.208984,-0.235352,-0.283203,-0.188477,0.0142822,0.143555,-0.0393066,-0.120605,0.041748
3,4,2.739437,142,142,2.739437,1994,0.0388997,-0.162272,-0.00537109,0.194661,...,-0.00626628,0.115234,-0.116211,-0.00423177,0.034078,0.0113932,0.0836182,-0.107096,0.0487976,0.0219727
4,5,3.919298,1140,1140,3.919298,2004,-0.00842285,0.0577393,-0.145508,0.188477,...,0.00756836,-0.0305786,-0.166504,-0.0996094,-0.0799561,-0.325195,-0.132568,0.020874,0.1427,0.0246277


In [43]:
# Switch the id / rating columns to the item_idx / user_idx / relevance naming.
df_movies_clean = df_movies_clean.rename(columns={'movie_Id': 'item_idx'})
df_users_clean = df_users_clean.rename(columns={'user_Id': 'user_idx'})
rating_column_names = {'movie_Id': 'item_idx', 'user_Id': 'user_idx', 'rating': 'relevance'}
df_rating_clean = df_rating.rename(columns=rating_column_names)