In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
df = pd.read_csv('../data/ratings_improved.csv', parse_dates=['timestamp'])

In [3]:
users_warm = pd.read_csv('../data/user_features_warm.csv')

In [4]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.2 GB
None
   userId  movieId  rating           timestamp  day_of_week
0       1       17     4.0 1999-12-03 19:24:37            4
1       1       25     1.0 1999-12-03 19:43:48            4
2       1       29     2.0 1999-11-22 00:36:16            0
3       1       30     5.0 1999-12-03 19:24:37            4
4       1       32     5.0 1999-11-22 00:00:58            0


# Sprawdzenie czy dane zgadzają się z danymi z pozostałych zbiorów danych

In [5]:
def check_with_other_datasets(df, users):
    movies = pd.read_parquet('../data/Movies_clean_Vec_v4_25keywords.parquet')

    print('Number of users matches with user_features.csv:', df['userId'].nunique() == users['userId'].nunique())
    print('Number of movies in ratings:', df['movieId'].nunique(), 'Number of movies in Movies_clean_Vec_v4:', movies['movieId'].nunique())

    missing_movie_ids = df[~df['movieId'].isin(movies['movieId'])]
    print('Number of movies that are in ratings, but are not in Movies_clean_Vec_v4', missing_movie_ids['movieId'].nunique())
    
    print('Number of ratings to delete:', missing_movie_ids.shape[0])
    print('Current length:', df.shape)
    df = df[~df['movieId'].isin(missing_movie_ids['movieId'])]
    print('Length after deleting:', df.shape)

    user_counts = missing_movie_ids['userId'].value_counts().reset_index()
    user_counts.columns = ['userId', 'count']

    users_with_deleted_ratings = users[users['userId'].isin(missing_movie_ids['userId'])]
    print(users_with_deleted_ratings.head(3))
    print(len(users_with_deleted_ratings), len(users))
    merged = users_with_deleted_ratings.merge(user_counts, on='userId', how='left')
    merged['count'] = merged['count'].fillna(0)

    merged['remaining_ratings'] = merged['num_rating'] - merged['count']
    filtered = merged[merged['remaining_ratings'] < 20]

    print('Number of users to delete', filtered.shape)
    print('Deleting users that have less than 20 reviews...')
    users = users[~users['userId'].isin(filtered['userId'])]
    df = df[~df['userId'].isin(filtered['userId'])]
    
    print('Written to user_features_2.csv')

    return df, users


df, users_warm = check_with_other_datasets(df, users_warm)

Number of users matches with user_features.csv: True
Number of movies in ratings: 84432 Number of movies in Movies_clean_Vec_v4: 84133
Number of movies that are in ratings, but are not in Movies_clean_Vec_v4 299
Number of ratings to delete: 326
Current length: (32000204, 5)
Length after deleting: (31999878, 5)
      userId  num_rating  avg_rating  weekend_watcher type_of_viewer  \
3318    3319         297    2.501684                1       negative   
3916    3917          42    2.892857                1       negative   
5207    5208          22    3.704545                0        neutral   

      genre_Action  genre_Adventure  genre_Animation  genre_Comedy  \
3318      2.310345         2.303571          2.25000        2.3125   
3916      3.055556         2.900000          2.78125        3.0000   
5207      3.000000              NaN          4.50000        3.8750   

      genre_Crime  ...  genre_History  genre_Horror  genre_Musical  \
3318     2.368852  ...         2.5625      2.585

# Usunięcie użytkowników, którzy nie mają minimum 20 pozytywnych ocen (neg beda dobieranie z nie-ogladanych)

In [6]:
POSITIVE_THRESHOLD = 4
MIN_POS = 20

pos_counts = (
    df
    .groupby('userId')['rating']
    .apply(lambda x: (x >= POSITIVE_THRESHOLD).sum())
)

valid_users = pos_counts[pos_counts >= MIN_POS].index

In [7]:
total_users = pos_counts.shape[0]
kept_users = valid_users.size
removed_users = total_users - kept_users

print(f'Total users      : {total_users}')
print(f'Users kept       : {kept_users}')
print(f'Users removed    : {removed_users} '
      f'({removed_users/total_users*100:.2f}%)')

Total users      : 200941
Users kept       : 157023
Users removed    : 43918 (21.86%)


In [8]:
df = df[df['userId'].isin(valid_users)].copy()

print('\nAfter filtering:')
print(df.info())


After filtering:
<class 'pandas.core.frame.DataFrame'>
Index: 30594215 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.4 GB
None


# Usunięcie użytkowników, którzy nie mają minimum 1 oceny pozytywnej i negatywnej
Trening User Tower bez wykorzystania Item Tower wymaga aby użytkownik miał co najmniej 1 pozytywną ocenę i jedną negatywną ocenę. Użytkownicy z samymi pozytywnymi/negatywnymi ocenami muszą zostać usunięci.

In [9]:
# POSITIVE_THRESHOLD = 4
#
# groupped = df.groupby('userId')
#
# valid_users = set()
# for userId, group in groupped:
#     pos = group[group['rating'] >= POSITIVE_THRESHOLD]
#     neg = group[group['rating'] < POSITIVE_THRESHOLD]
#     if not pos.empty and not neg.empty:
#         valid_users.add(userId)
#
# df = df[df['userId'].isin(valid_users)]
# print(df.info())

# Usunięcie usuniętych użytkowników ze zbioru danych user_features i zapis go do pliku

In [10]:
users_Tr = users_warm[users_warm['userId'].isin(valid_users)]

# users_Tr.to_csv('../data/user_features_warm_2.csv', index=False)

# Normalizacja i przetwarzanie timestamp-ow

In [11]:
df['ts_sec'] = df['timestamp'].astype('int64') // 10**9

In [12]:
df_train_ratings = df.copy()

print(f"Number of ratings: {len(df_train_ratings):,}")

Number of ratings: 30,594,215


In [13]:
train_cutoff = df_train_ratings['ts_sec'].max()

df_train_ratings['age_days'] = (train_cutoff - df_train_ratings['ts_sec']) / 86400
df_train_ratings['age_days'] = df_train_ratings['age_days'].clip(0, 3650)

print(f"Age_days range: {df_train_ratings['age_days'].min():.1f} - {df_train_ratings['age_days'].max():.1f}")

Age_days range: 0.0 - 3650.0


In [14]:
def jitter_data(df):
    np.random.seed(42)
    jitter_std = 0.05
    jitter = np.random.normal(0, jitter_std, len(df))

    df_grouped = df.groupby(['userId', 'age_days']).size()
    duplicates_mask = df.set_index(['userId', 'age_days']).index.isin(df_grouped[df_grouped > 1].index)

    df.loc[duplicates_mask, 'age_days_jittered'] = (df.loc[duplicates_mask, 'age_days'] + jitter[duplicates_mask])
    df.loc[~duplicates_mask, 'age_days_jittered'] = df.loc[~duplicates_mask, 'age_days']

    before_duplicates = df.groupby(['userId', 'age_days']).size().max()
    after_duplicates = df.groupby(['userId', 'age_days_jittered']).size().max()

    print(f"Max duplikatów przed: {before_duplicates}")
    print(f"Max duplikatów po: {after_duplicates}")

    return df

df_train_ratings = jitter_data(df_train_ratings)

Max duplikatów przed: 8983
Max duplikatów po: 1


In [15]:
df_train_ratings.head(100)
df_train_ratings['age_days'] = df_train_ratings['age_days_jittered'].astype('float32')

In [16]:
from sklearn.preprocessing import StandardScaler

columns_to_normalize = ['age_days']

scaler = StandardScaler().fit(df_train_ratings[columns_to_normalize])

df_train_scaled = df_train_ratings.copy()

df_train_scaled[columns_to_normalize] = scaler.transform(df_train_ratings[columns_to_normalize])

train_mean = df_train_scaled['age_days'].mean()
train_std = df_train_scaled['age_days'].std()

print(f"Train po normalizacji: mean={train_mean:.3f}, std={train_std:.3f}")

Train po normalizacji: mean=0.000, std=1.000


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30594215 entries, 0 to 32000203
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
 5   ts_sec       int64         
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 1.6 GB


In [18]:
df = df_train_scaled

df = df.sort_values('userId').reset_index(drop=True)

print(f"Total records: {len(df):,}")

Total records: 30,594,215


In [19]:
df = df.drop(columns=['ts_sec', 'timestamp', 'age_days_jittered'])

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30594215 entries, 0 to 30594214
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   day_of_week  int64  
 4   age_days     float32
dtypes: float32(1), float64(1), int64(3)
memory usage: 1.0 GB


In [21]:
# dump the scaler
joblib.dump(scaler, '../data/age-days-scaler.joblib')

['../data/age-days-scaler.joblib']

# Zapis do pliku

In [21]:
df.to_parquet('../data/ratings_clean.parquet', compression='brotli')

In [22]:
print(df.head())

   userId  movieId  rating  day_of_week  age_days
0       1       17     4.0            4  0.717920
1       1     2028     3.0            0  0.717860
2       1     2025     5.0            4  0.717881
3       1     2020     5.0            4  0.717877
4       1     1968     5.0            2  0.717831


# Przygotowanie LOOCV

In [23]:
def split_loocv(df):

    df_sorted = df.sort_values(['userId', 'age_days'], ascending=[True, True])

    mask_pos = df_sorted['rating'] >= POSITIVE_THRESHOLD
    last_pos = df_sorted[mask_pos].groupby('userId', as_index=False).tail(1)
    holdout_df = last_pos[['userId', 'movieId']].rename(
        columns={'movieId': 'holdout_movieId'}
    )

    df_clean = df_sorted.drop(index=last_pos.index).reset_index(drop=True)

    return df_clean, holdout_df

df, holdout_df = split_loocv(df)

print(holdout_df.head())
print(df.shape)

     userId  holdout_movieId
59        1             2313
151       2              520
282       3              356
354       7              541
414       8              858
(30437192, 5)


In [24]:
rating_checker = pd.read_parquet('../data/ratings_clean.parquet')

check = holdout_df.merge(
    rating_checker,
    left_on=["userId", "holdout_movieId"],
    right_on=["userId", "movieId"],
    how="left",
    indicator=True
)

missing = check[check["_merge"] == "left_only"]

if not missing.empty:
    print(f"Znaleziono {len(missing)} brakujących wpisów:")
    print(missing[["userId", "holdout_movieId"]])
else:
    print("Wszystkie holdout_movieId są obecne w rating_clean.parquet")

Wszystkie holdout_movieId są obecne w rating_clean.parquet


In [25]:
holdout_df.to_parquet('../data/ratings_LOOCV.parquet', compression='brotli')

# Zapisanie zgrupowanych danych

In [26]:
groupped = df.sort_values(['userId', 'age_days'], ascending=[True, True])
groupped = groupped.groupby('userId').head(19).sort_values(['userId', 'age_days'])
groupped = groupped.groupby('userId').agg(movies_seq = ('movieId', list), ratings_seq = ('rating', list), ts_seq = ('age_days', list)).reset_index()

print(groupped.head())
print(groupped.info())

# Sprawdzenie czy wszystkie movies_seq mają minimum 19 filmów
assert groupped['movies_seq'].apply(len).eq(19).all()

groupped.to_parquet('../data/ratings_clean_groupped_20.parquet', compression='brotli')

   userId                                         movies_seq  \
0       1  [1263, 1296, 926, 2247, 223, 1150, 232, 2882, ...   
1       2  [34, 595, 186, 185, 276, 552, 296, 364, 551, 5...   
2       3  [1375, 552, 2701, 357, 367, 1090, 1957, 1721, ...   
3       7  [592, 162, 165, 296, 588, 339, 19, 586, 434, 5...   
4       8  [27773, 7361, 30707, 30749, 4226, 47, 44555, 4...   

                                         ratings_seq  \
0  [5.0, 3.0, 5.0, 5.0, 3.0, 4.0, 5.0, 1.0, 1.0, ...   
1  [5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 1.0, 5.0, 4.0, ...   
2  [4.0, 4.0, 3.0, 4.0, 2.5, 4.0, 5.0, 4.0, 3.0, ...   
3  [3.0, 5.0, 3.0, 5.0, 4.0, 2.0, 3.0, 3.0, 3.0, ...   
4  [5.0, 4.5, 4.0, 5.0, 4.5, 5.0, 4.5, 3.5, 4.0, ...   

                                              ts_seq  
0  [0.7177751660346985, 0.7178045511245728, 0.717...  
1  [0.717822253704071, 0.7178265452384949, 0.7178...  
2  [0.717746376991272, 0.7178027629852295, 0.7178...  
3  [0.7177982330322266, 0.7178294658660889, 0.717...  
4  

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na tylko pozytywne

In [27]:
pos_s = (df.loc[df['rating'] >= 4, ['userId', 'movieId']].groupby('userId')['movieId'].agg(list).rename('pos'))
seen_s = (df.groupby('userId')['movieId'].agg(lambda x: set(x)).rename('seen'))

user_ratings = (seen_s.to_frame().join(pos_s.to_frame()))

# user_ratings['pos'] = user_ratings['pos'].apply(
#     lambda v: v if isinstance(v, list) else []
# )

user_ratings = user_ratings.reset_index()
user_ratings = user_ratings.rename(columns={'index': 'userId'})

assert 'userId' in user_ratings.columns
assert user_ratings[['seen', 'pos']].applymap(type).isin({set, list}).all().all()
print(user_ratings.dtypes)
print(user_ratings.head())


userId     int64
seen      object
pos       object
dtype: object
   userId                                               seen  \
0       1  {3078, 527, 2064, 1041, 3088, 2067, 17, 25, 54...   
1       2  {783, 276, 153, 282, 539, 31, 34, 551, 552, 29...   
2       3  {2, 1031, 1544, 10, 11, 527, 17, 534, 26, 539,...   
3       7  {265, 19, 531, 21, 150, 153, 410, 539, 288, 16...   
4       8  {6016, 4226, 260, 44555, 55820, 2959, 912, 527...   

                                                 pos  
0  [1263, 926, 2247, 1150, 232, 541, 1968, 322, 2...  
1  [34, 595, 186, 185, 276, 552, 364, 551, 594, 3...  
2  [1375, 552, 357, 1090, 1957, 1721, 858, 1544, ...  
3  [162, 296, 588, 508, 208, 590, 150, 318, 349, ...  
4  [27773, 7361, 30707, 30749, 4226, 47, 44555, 3...  


  assert user_ratings[['seen', 'pos']].applymap(type).isin({set, list}).all().all()


In [28]:
user_ratings.info()
user_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157023 entries, 0 to 157022
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  157023 non-null  int64 
 1   seen    157023 non-null  object
 2   pos     157023 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.6+ MB


Unnamed: 0,userId,seen,pos
0,1,"{3078, 527, 2064, 1041, 3088, 2067, 17, 25, 54...","[1263, 926, 2247, 1150, 232, 541, 1968, 322, 2..."
1,2,"{783, 276, 153, 282, 539, 31, 34, 551, 552, 29...","[34, 595, 186, 185, 276, 552, 364, 551, 594, 3..."
2,3,"{2, 1031, 1544, 10, 11, 527, 17, 534, 26, 539,...","[1375, 552, 357, 1090, 1957, 1721, 858, 1544, ..."
3,7,"{265, 19, 531, 21, 150, 153, 410, 539, 288, 16...","[162, 296, 588, 508, 208, 590, 150, 318, 349, ..."
4,8,"{6016, 4226, 260, 44555, 55820, 2959, 912, 527...","[27773, 7361, 30707, 30749, 4226, 47, 44555, 3..."
5,9,"{3328, 3972, 33672, 3081, 8957, 1673, 2700, 50...","[3081, 2542, 38061, 2329, 3275, 5026, 1500, 38..."
6,10,"{169984, 1, 122882, 2, 122886, 45062, 10, 6964...","[168252, 122904, 122886, 112852, 33493, 527, 7..."
7,13,"{6, 2058, 2571, 1036, 1674, 11, 527, 912, 920,...","[1088, 587, 1704, 1240, 2359, 457, 1233, 593, ..."
8,14,"{33410, 5768, 4619, 7437, 4622, 7706, 4510, 87...","[4510, 4622, 8871, 8734, 5768, 3911, 4816, 776..."
9,15,"{3079, 1032, 17, 18, 5668, 74789, 39, 56367, 1...","[55247, 2918, 147, 1968, 47610, 4246, 1079, 27..."


## Zapis do pliku

In [29]:
user_ratings.to_parquet('../data/ratings_groupped_20pos.parquet', compression='brotli')

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na pozytywne i negatywne

In [30]:
# pos_df = df[df['rating'] >= 4].groupby('userId')['movieId'].apply(list).reset_index(name='pos')
# neg_df = df[df['rating'] < 4].groupby('userId')['movieId'].apply(list).reset_index(name='neg')
#
# print(pos_df.head())
#
# user_ratings = pd.merge(pos_df, neg_df, on='userId', how='outer')
#
# print(user_ratings.info())
#
# user_ratings['pos'] = user_ratings['pos'].apply(lambda x: x if isinstance(x, list) else [])
# user_ratings['neg'] = user_ratings['neg'].apply(lambda x: x if isinstance(x, list) else [])
#
# user_ratings = user_ratings.reset_index(drop=True)
#
# print(user_ratings.info())
# print(user_ratings.head())

## Zapis do pliku

In [31]:
# user_ratings.to_parquet('../data/ratings_groupped_ids.parquet', compression='brotli')

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na pozytywne i neg (z wagami)

In [32]:
# def rating_weight(r):
#     if r >= 4:
#         return 1.0
#     elif r >= 3:
#         return 0.5
#     else:
#         return 0.2
#
# df['weight'] = df['rating'].apply(rating_weight)
#
# pos_df = df[df['rating'] >= 4].groupby('userId')['movieId'].apply(list).reset_index(name='pos')
# neg_df = df[df['rating'] < 4].groupby('userId')['movieId'].apply(list).reset_index(name='neg')
#
# print(pos_df.head())
#
# user_ratings = pd.merge(pos_df, neg_df, on='userId', how='outer')
#
# print(user_ratings.info())
#
# user_ratings['pos'] = user_ratings['pos'].apply(lambda x: x if isinstance(x, list) else [])
# user_ratings['neg'] = user_ratings['neg'].apply(lambda x: x if isinstance(x, list) else [])
#
# user_ratings = user_ratings.reset_index(drop=True)
#
# def attach_weights(row):
#     uid = row['userId']
#     grp = df[df['userId'] == uid]
#     pos_w = [(mid, w) for mid, w in zip(
#         grp.loc[grp['rating'] >= 4, 'movieId'],
#         grp.loc[grp['rating'] >= 4, 'weight']
#     )]
#     neg_w = [(mid, w) for mid, w in zip(
#         grp.loc[grp['rating'] < 4, 'movieId'],
#         grp.loc[grp['rating'] < 4, 'weight']
#     )]
#     return pd.Series({'pos_w': pos_w, 'neg_w': neg_w})
#
# user_ratings = user_ratings.join(
#     user_ratings.apply(attach_weights, axis=1)
# )
#
# print(user_ratings.info())
# print(user_ratings.head())

## Zapis do pliku

In [33]:
# user_ratings.to_parquet('../data/ratings_groupped_ids_weights.parquet', compression='brotli')