# Ratings_cleaning musi zostać uruchomiony jako pierwszy
Notebook `Ratings_cleaning.ipynb` usuwa niektóre z wystawionych ocen, co zmniejsza ich liczbę u użytkowników. Następnie usuwa użytkowników, którzy mają mniej niż 20 ocen, i zapisuje nowy zbiór danych do pliku `user_features_warm_2.csv`.

In [3]:
import pandas as pd
import joblib

In [4]:
# Load the per-user feature table from CSV. According to the notebook's
# introduction this file is written by Ratings_cleaning.ipynb, so that
# notebook has to be run before this cell.
df = pd.read_csv('../data/user_features_warm_2.csv')

In [5]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() appends a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155225 entries, 0 to 155224
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   userId                 155225 non-null  int64  
 1   num_rating             155225 non-null  int64  
 2   avg_rating             155225 non-null  float64
 3   weekend_watcher        155225 non-null  int64  
 4   type_of_viewer         155225 non-null  object 
 5   genre_Action           154726 non-null  float64
 6   genre_Adventure        154874 non-null  float64
 7   genre_Animation        140806 non-null  float64
 8   genre_Comedy           155045 non-null  float64
 9   genre_Crime            154396 non-null  float64
 10  genre_Documentary      67611 non-null   float64
 11  genre_Drama            155200 non-null  float64
 12  genre_Family           148582 non-null  float64
 13  genre_Fantasy          153589 non-null  float64
 14  genre_History          141999 non-nu

# Uzupełnienie brakujących ocen gatunków średnią oceną użytkownika

In [6]:
# Per-genre rating columns of the user feature table. These are the columns
# that can contain NaN (presumably when a user has no ratings in that genre
# - confirm against Ratings_cleaning.ipynb).
genre_columns = [
    'genre_Action',
    'genre_Adventure',
    'genre_Animation',
    'genre_Comedy',
    'genre_Crime',
    'genre_Documentary',
    'genre_Drama',
    'genre_Family',
    'genre_Fantasy',
    'genre_History',
    'genre_Horror',
    'genre_Musical',
    'genre_Mystery',
    'genre_Romance',
    'genre_Science Fiction',
    'genre_TV Movie',
    'genre_Thriller',
    'genre_War',
    'genre_Western',
]

# Replace every missing genre value with the same row's avg_rating;
# fillna() with a Series aligns on the row index.
for col in genre_columns:
    df[col] = df[col].fillna(df['avg_rating'])

# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155225 entries, 0 to 155224
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   userId                 155225 non-null  int64  
 1   num_rating             155225 non-null  int64  
 2   avg_rating             155225 non-null  float64
 3   weekend_watcher        155225 non-null  int64  
 4   type_of_viewer         155225 non-null  object 
 5   genre_Action           155225 non-null  float64
 6   genre_Adventure        155225 non-null  float64
 7   genre_Animation        155225 non-null  float64
 8   genre_Comedy           155225 non-null  float64
 9   genre_Crime            155225 non-null  float64
 10  genre_Documentary      155225 non-null  float64
 11  genre_Drama            155225 non-null  float64
 12  genre_Family           155225 non-null  float64
 13  genre_Fantasy          155225 non-null  float64
 14  genre_History          155225 non-nu

# Enkodowanie kolumny 'type_of_viewer'

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Dense output so the encoded matrix can be wrapped in a DataFrame directly.
ohe = OneHotEncoder(sparse_output=False)

def encoder(df):
    """Replace the 'type_of_viewer' column with its one-hot encoded columns.

    The module-level ``ohe`` encoder is (re)fitted on ``df[['type_of_viewer']]``
    on every call. The encoded columns are named by
    ``ohe.get_feature_names_out(['type_of_viewer'])`` and share the row index
    of ``df``.

    Args:
        df: DataFrame containing a 'type_of_viewer' column.

    Returns:
        A new DataFrame without 'type_of_viewer' and with the encoded columns
        appended on the right.
    """
    viewer_matrix = ohe.fit_transform(df[['type_of_viewer']])
    viewer_dummies = pd.DataFrame(
        viewer_matrix,
        columns=ohe.get_feature_names_out(['type_of_viewer']),
        index=df.index,  # keep rows aligned with the source frame for concat
    )
    remaining = df.drop(columns=['type_of_viewer'])
    return pd.concat([remaining, viewer_dummies], axis=1)

df = encoder(df)

print(df.head(3))

   userId  num_rating  avg_rating  weekend_watcher  genre_Action  \
0       1         141    3.531915                0      4.133333   
1       2          52    4.269231                0      4.000000   
2       3         147    3.588435                0      3.583333   

   genre_Adventure  genre_Animation  genre_Comedy  genre_Crime  \
0         3.333333         3.531915      3.339623     4.285714   
1         4.142857         4.875000      4.130435     4.000000   
2         3.605263         3.961538      3.186275     3.346154   

   genre_Documentary  ...  genre_Mystery  genre_Romance  \
0           3.000000  ...       4.272727       3.487805   
1           4.269231  ...       4.750000       4.823529   
2           3.588435  ...       3.875000       3.637931   

   genre_Science Fiction  genre_TV Movie  genre_Thriller  genre_War  \
0               4.000000        3.531915        3.666667   3.555556   
1               4.269231        4.269231        4.300000   3.000000   
2           

# Dołączenie ostatnich 20 ocenionych filmów ze zbioru `ratings_clean_groupped_20.parquet`

In [8]:
# Load the per-user rating sequences and attach them to the feature table.
ratings_df = pd.read_parquet('../data/ratings_clean_groupped_20.parquet')
# Inner join on userId (the merge default, spelled out here): only users
# present in both frames are kept.
df = df.merge(ratings_df, on='userId', how='inner')

# Normalizacja danych numerycznych

In [9]:
from sklearn.preprocessing import StandardScaler

# Numeric features to standardize: the two rating aggregates plus every
# per-genre rating column.
columns_to_normalize = ['num_rating', 'avg_rating', *genre_columns]
print('Columns to normalize:', columns_to_normalize)

scaler = StandardScaler()

# Fit the scaler on the selected columns and overwrite them with the
# standardized values.
scaled_values = scaler.fit_transform(df[columns_to_normalize])
df[columns_to_normalize] = scaled_values
print(df.head(3))

Columns to normalize: ['num_rating', 'avg_rating', 'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy', 'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_History', 'genre_Horror', 'genre_Musical', 'genre_Mystery', 'genre_Romance', 'genre_Science Fiction', 'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western']
   userId  num_rating  avg_rating  weekend_watcher  genre_Action  \
0       1   -0.178909   -0.530284                0      0.877051   
1       2   -0.465772    1.136851                0      0.646248   
2       3   -0.159570   -0.402486                0     -0.075011   

   genre_Adventure  genre_Animation  genre_Comedy  genre_Crime  \
0        -0.537924        -0.286397     -0.587008     0.873865   
1         0.890741         1.583900      0.914432     0.335790   
2        -0.058017         0.311870     -0.878156    -0.895572   

   genre_Documentary  ...  genre_TV Movie  genre_Thriller  genre_War  \
0         

In [10]:
# Persist the fitted StandardScaler to disk with joblib.
# NOTE(review): presumably so the same scaling can be re-applied outside this
# notebook - confirm. The one-hot encoder `ohe` is not saved in the cells
# visible here.
joblib.dump(scaler, '../data/ratings-scaler.joblib')

['../data/ratings-scaler.joblib']

# Konwersja kolumny weekend_watcher do float64

In [11]:
# Cast the weekend_watcher flag to float64; every other column keeps its dtype.
df = df.astype({'weekend_watcher': 'float64'})

In [12]:
# Print the column names as a quoted, comma-separated list.
# Each name is quoted individually: joining the bare names with "', '" leaves
# the first name without an opening quote and the last without a closing one.
print(', '.join(f"'{name}'" for name in df.columns))

userId', 'num_rating', 'avg_rating', 'weekend_watcher', 'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy', 'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_History', 'genre_Horror', 'genre_Musical', 'genre_Mystery', 'genre_Romance', 'genre_Science Fiction', 'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western', 'type_of_viewer_negative', 'type_of_viewer_neutral', 'type_of_viewer_positive', 'movies_seq', 'ratings_seq', 'ts_seq


# Zapis do pliku

In [13]:
# Final check of column dtypes and non-null counts before the frame is
# written to disk in the next cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155225 entries, 0 to 155224
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   userId                   155225 non-null  int64  
 1   num_rating               155225 non-null  float64
 2   avg_rating               155225 non-null  float64
 3   weekend_watcher          155225 non-null  float64
 4   genre_Action             155225 non-null  float64
 5   genre_Adventure          155225 non-null  float64
 6   genre_Animation          155225 non-null  float64
 7   genre_Comedy             155225 non-null  float64
 8   genre_Crime              155225 non-null  float64
 9   genre_Documentary        155225 non-null  float64
 10  genre_Drama              155225 non-null  float64
 11  genre_Family             155225 non-null  float64
 12  genre_Fantasy            155225 non-null  float64
 13  genre_History            155225 non-null  float64
 14  genr

In [14]:
# Write the final user feature table to a Parquet file using brotli compression.
df.to_parquet('../data/user_features_clean_warm.parquet', compression='brotli')