In [75]:
import pandas as pd
import numpy as np

In [76]:
df = pd.read_csv('../data/ratings_improved.csv', parse_dates=['timestamp'])

In [77]:
users_train = pd.read_csv('../data/user_features_train.csv')
users_test = pd.read_csv('../data/user_features_test.csv')

In [78]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.2 GB
None
   userId  movieId  rating           timestamp  day_of_week
0       1       17     4.0 1999-12-03 19:24:37            4
1       1       25     1.0 1999-12-03 19:43:48            4
2       1       29     2.0 1999-11-22 00:36:16            0
3       1       30     5.0 1999-12-03 19:24:37            4
4       1       32     5.0 1999-11-22 00:00:58            0


# Sprawdzenie czy dane zgadzają się z danymi z pozostałych zbiorów danych

In [79]:
def check_with_other_datasets(df, users):
    movies = pd.read_parquet('../data/Movies_clean_Vec_v4_25keywords.parquet')

    print('Number of users matches with user_features.csv:', df['userId'].nunique() == users['userId'].nunique())
    print('Number of movies in ratings:', df['movieId'].nunique(), 'Number of movies in Movies_clean_Vec_v4:', movies['movieId'].nunique())

    missing_movie_ids = df[~df['movieId'].isin(movies['movieId'])]
    print('Number of movies that are in ratings, but are not in Movies_clean_Vec_v4', missing_movie_ids['movieId'].nunique())
    
    print('Number of ratings to delete:', missing_movie_ids.shape[0])
    print('Current length:', df.shape)
    df = df[~df['movieId'].isin(missing_movie_ids['movieId'])]
    print('Length after deleting:', df.shape)

    user_counts = missing_movie_ids['userId'].value_counts().reset_index()
    user_counts.columns = ['userId', 'count']

    users_with_deleted_ratings = users[users['userId'].isin(missing_movie_ids['userId'])]
    print(users_with_deleted_ratings.head(3))
    print(len(users_with_deleted_ratings), len(users))
    merged = users_with_deleted_ratings.merge(user_counts, on='userId', how='left')
    merged['count'] = merged['count'].fillna(0)

    merged['remaining_ratings'] = merged['num_rating'] - merged['count']
    filtered = merged[merged['remaining_ratings'] < 20]

    print('Number of users to delete', filtered.shape)
    print('Deleting users that have less than 20 reviews...')
    users = users[~users['userId'].isin(filtered['userId'])]
    df = df[~df['userId'].isin(filtered['userId'])]
    
    print('Written to user_features_2.csv')

    return df, users


df, users_train = check_with_other_datasets(df, users_train)
df, users_test = check_with_other_datasets(df, users_test)

Number of users matches with user_features.csv: False
Number of movies in ratings: 84432 Number of movies in Movies_clean_Vec_v4: 84133
Number of movies that are in ratings, but are not in Movies_clean_Vec_v4 299
Number of ratings to delete: 326
Current length: (32000204, 5)
Length after deleting: (31999878, 5)
      userId  num_rating  avg_rating  weekend_watcher type_of_viewer  \
4173    5208          22    3.704545                0        neutral   
5220    6515          39    3.217949                0        neutral   
5582    6955          28    4.000000                0        neutral   

      genre_Action  genre_Adventure  genre_Animation  genre_Comedy  \
4173      3.000000              NaN             4.50      3.875000   
5220      3.250000         3.388889             2.00      2.333333   
5582      3.916667         3.500000             3.25      3.666667   

      genre_Crime  ...  genre_History  genre_Horror  genre_Musical  \
4173     3.500000  ...            NaN          

# Usunięcie użytkowników, którzy nie mają minimum 20 pozytywnych ocen (neg beda dobieranie z nie-ogladanych)

In [80]:
POSITIVE_THRESHOLD = 4
MIN_POS = 20

pos_counts = (
    df
    .groupby('userId')['rating']
    .apply(lambda x: (x >= POSITIVE_THRESHOLD).sum())
)

valid_users = pos_counts[pos_counts >= MIN_POS].index

In [81]:
total_users = pos_counts.shape[0]
kept_users = valid_users.size
removed_users = total_users - kept_users

print(f'Total users      : {total_users}')
print(f'Users kept       : {kept_users}')
print(f'Users removed    : {removed_users} '
      f'({removed_users/total_users*100:.2f}%)')

Total users      : 200943
Users kept       : 157023
Users removed    : 43920 (21.86%)


In [82]:
df = df[df['userId'].isin(valid_users)].copy()

print('\nAfter filtering:')
print(df.info())


After filtering:
<class 'pandas.core.frame.DataFrame'>
Index: 30594215 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.4 GB
None


# Usunięcie użytkowników, którzy nie mają minimum 1 oceny pozytywnej i negatywnej
Trening User Tower bez wykorzystania Item Tower wymaga aby użytkownik miał co najmniej 1 pozytywną ocenę i jedną negatywną ocenę. Użytkownicy z samymi pozytywnymi/negatywnymi ocenami muszą zostać usunięci.

In [83]:
# POSITIVE_THRESHOLD = 4
#
# groupped = df.groupby('userId')
#
# valid_users = set()
# for userId, group in groupped:
#     pos = group[group['rating'] >= POSITIVE_THRESHOLD]
#     neg = group[group['rating'] < POSITIVE_THRESHOLD]
#     if not pos.empty and not neg.empty:
#         valid_users.add(userId)
#
# df = df[df['userId'].isin(valid_users)]
# print(df.info())

# Usunięcie usuniętych użytkowników ze zbioru danych user_features i zapis go do pliku

In [84]:
users_Tr = users_train[users_train['userId'].isin(valid_users)]

users_Tr.to_csv('../data/user_features_train_2.csv', index=False)

In [85]:
users_Te = users_test[users_test['userId'].isin(valid_users)]

users_Te.to_csv('../data/user_features_test_2.csv', index=False)

# Normalizacja i przetwarzanie timestamp-ow

In [86]:
df['ts_sec'] = df['timestamp'].astype('int64') // 10**9

In [87]:
'''
Dzielimy odpowiednio na zbior ratingow w zbiorze treningowym i testowym
'''
train_mask = df['userId'].isin(users_train['userId'])
test_mask = df['userId'].isin(users_test['userId'])

In [88]:
df_train_ratings = df.loc[train_mask].copy()
df_test_ratings = df.loc[test_mask].copy()

print(f"Train ratings: {len(df_train_ratings):,}")
print(f"Test ratings: {len(df_test_ratings):,}")

Train ratings: 24,477,775
Test ratings: 6,116,440


In [89]:
train_cutoff = df_train_ratings['ts_sec'].max()

df_train_ratings['age_days'] = (train_cutoff - df_train_ratings['ts_sec']) / 86400
df_train_ratings['age_days'] = df_train_ratings['age_days'].clip(0, 3650)

df_test_ratings['age_days'] = (train_cutoff - df_test_ratings['ts_sec']) / 86400
df_test_ratings['age_days'] = df_test_ratings['age_days'].clip(0, 3650)

print(f"Train age_days range: {df_train_ratings['age_days'].min():.1f} - {df_train_ratings['age_days'].max():.1f}")
print(f"Test age_days range: {df_test_ratings['age_days'].min():.1f} - {df_test_ratings['age_days'].max():.1f}")

Train age_days range: 0.0 - 3650.0
Test age_days range: 0.0 - 3650.0


In [90]:
def jitter_data(df):
    np.random.seed(42)
    jitter_std = 0.05
    jitter = np.random.normal(0, jitter_std, len(df))

    df_grouped = df.groupby(['userId', 'age_days']).size()
    duplicates_mask = df.set_index(['userId', 'age_days']).index.isin(df_grouped[df_grouped > 1].index)

    df.loc[duplicates_mask, 'age_days_jittered'] = (df.loc[duplicates_mask, 'age_days'] + jitter[duplicates_mask])
    df.loc[~duplicates_mask, 'age_days_jittered'] = df.loc[~duplicates_mask, 'age_days']

    before_duplicates = df.groupby(['userId', 'age_days']).size().max()
    after_duplicates = df.groupby(['userId', 'age_days_jittered']).size().max()

    print(f"Max duplikatów przed: {before_duplicates}")
    print(f"Max duplikatów po: {after_duplicates}")

    return df

df_test_ratings = jitter_data(df_test_ratings)
df_train_ratings = jitter_data(df_train_ratings)

Max duplikatów przed: 5410
Max duplikatów po: 1
Max duplikatów przed: 8983
Max duplikatów po: 1


In [91]:
df_train_ratings.head(100)
df_train_ratings['age_days'] = df_train_ratings['age_days_jittered'].astype('float32')
df_test_ratings['age_days'] = df_train_ratings['age_days_jittered'].astype('float32')

In [92]:
from sklearn.preprocessing import StandardScaler

columns_to_normalize = ['age_days']

scaler = StandardScaler().fit(df_train_ratings[columns_to_normalize])

df_train_scaled = df_train_ratings.copy()
df_test_scaled = df_test_ratings.copy()

df_train_scaled[columns_to_normalize] = scaler.transform(df_train_ratings[columns_to_normalize])
df_test_scaled[columns_to_normalize] = scaler.transform(df_test_ratings[columns_to_normalize])

train_mean = df_train_scaled['age_days'].mean()
train_std = df_train_scaled['age_days'].std()

print(f"Train po normalizacji: mean={train_mean:.3f}, std={train_std:.3f}")

Train po normalizacji: mean=0.000, std=1.000


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30594215 entries, 0 to 32000203
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
 5   ts_sec       int64         
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 1.6 GB


In [94]:
df_train_scaled['split'] = 'train'
df_test_scaled['split'] = 'test'

df = pd.concat([df_train_scaled, df_test_scaled], ignore_index=True)

df = df.sort_values('userId').reset_index(drop=True)

print(f" - Total records: {len(df):,}")
print(f" - Train records: {(df['split'] == 'train').sum():,}")
print(f" - Test records: {(df['split'] == 'test').sum():,}")

 - Total records: 30,594,215
 - Train records: 24,477,775
 - Test records: 6,116,440


In [95]:
df = df.drop(columns=['ts_sec', 'timestamp', 'split', 'age_days_jittered'])

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30594215 entries, 0 to 30594214
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   day_of_week  int64  
 4   age_days     float32
dtypes: float32(1), float64(1), int64(3)
memory usage: 1.0 GB


# Zapis do pliku

In [97]:
df.to_parquet('../data/ratings_clean.parquet', compression='brotli')

In [98]:
print(df.head())

   userId  movieId  rating  day_of_week  age_days
0       1       17     4.0            4  0.716060
1       1     2064     3.0            2  0.716029
2       1     2028     3.0            0  0.715999
3       1     2025     5.0            4  0.716021
4       1     2020     5.0            4  0.716017


# Zapisanie zgrupowanych danych

In [99]:
groupped = df.sort_values(['userId', 'age_days'], ascending=[True, True])
groupped = groupped.groupby('userId').head(20).sort_values(['userId', 'age_days'])
groupped = groupped.groupby('userId').agg(movies_seq = ('movieId', list), ratings_seq = ('rating', list), ts_seq = ('age_days', list)).reset_index()

print(groupped.head())
print(groupped.info())

# Sprawdzenie czy wszystkie movies_seq mają minimum 20 filmów
assert groupped['movies_seq'].apply(len).eq(20).all()

groupped.to_parquet('../data/ratings_clean_groupped_20.parquet', compression='brotli')

   userId                                         movies_seq  \
0       1  [1263, 1296, 926, 2247, 223, 1150, 232, 2882, ...   
1       2  [34, 595, 186, 185, 276, 552, 296, 364, 551, 5...   
2       3  [1375, 552, 2701, 357, 367, 1090, 1957, 1721, ...   
3       7  [457, 380, 410, 432, 434, 435, 440, 367, 468, ...   
4       8  [27773, 30749, 30707, 7361, 4226, 47, 32, 4455...   

                                         ratings_seq  \
0  [5.0, 3.0, 5.0, 5.0, 3.0, 4.0, 5.0, 1.0, 1.0, ...   
1  [5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 1.0, 5.0, 4.0, ...   
2  [4.0, 4.0, 3.0, 4.0, 2.5, 4.0, 5.0, 4.0, 3.0, ...   
3  [3.0, 2.0, 4.0, 3.0, 3.0, 2.0, 4.0, 3.0, 4.0, ...   
4  [5.0, 5.0, 4.0, 4.5, 4.5, 5.0, 4.0, 4.5, 3.5, ...   

                                              ts_seq  
0  [0.7159150838851929, 0.7159445285797119, 0.715...  
1  [0.7159621715545654, 0.7159664630889893, 0.715...  
2  [0.7158863544464111, 0.7159426808357239, 0.715...  
3  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...  
4  

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na tylko pozytywne

In [100]:
pos_s = (df.loc[df['rating'] >= 4, ['userId', 'movieId']].groupby('userId')['movieId'].agg(list).rename('pos'))
seen_s = (df.groupby('userId')['movieId'].agg(lambda x: set(x)).rename('seen'))

user_ratings = (seen_s.to_frame().join(pos_s.to_frame()))

# user_ratings['pos'] = user_ratings['pos'].apply(
#     lambda v: v if isinstance(v, list) else []
# )

user_ratings = user_ratings.reset_index()
user_ratings = user_ratings.rename(columns={'index': 'userId'})

assert 'userId' in user_ratings.columns
assert user_ratings[['seen', 'pos']].applymap(type).isin({set, list}).all().all()
print(user_ratings.dtypes)
print(user_ratings.head())


userId     int64
seen      object
pos       object
dtype: object
   userId                                               seen  \
0       1  {3078, 527, 2064, 17, 3088, 2067, 1041, 25, 29...   
1       2  {520, 783, 276, 153, 282, 539, 31, 34, 551, 55...   
2       3  {2, 1031, 1544, 10, 11, 527, 17, 534, 26, 539,...   
3       7  {265, 531, 19, 21, 150, 153, 410, 539, 541, 28...   
4       8  {6016, 4226, 260, 44555, 55820, 2959, 527, 912...   

                                                 pos  
0  [17, 2025, 2020, 1968, 1952, 1939, 1885, 1748,...  
1  [500, 457, 454, 377, 380, 364, 508, 362, 381, ...  
2  [2948, 3034, 3210, 2947, 3247, 3448, 3578, 324...  
3  [410, 440, 468, 588, 508, 531, 541, 590, 357, ...  
4  [32, 188773, 183837, 79132, 58559, 55820, 4455...  


  assert user_ratings[['seen', 'pos']].applymap(type).isin({set, list}).all().all()


In [101]:
user_ratings.info()
user_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157023 entries, 0 to 157022
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  157023 non-null  int64 
 1   seen    157023 non-null  object
 2   pos     157023 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.6+ MB


Unnamed: 0,userId,seen,pos
0,1,"{3078, 527, 2064, 17, 3088, 2067, 1041, 25, 29...","[17, 2025, 2020, 1968, 1952, 1939, 1885, 1748,..."
1,2,"{520, 783, 276, 153, 282, 539, 31, 34, 551, 55...","[500, 457, 454, 377, 380, 364, 508, 362, 381, ..."
2,3,"{2, 1031, 1544, 10, 11, 527, 17, 534, 26, 539,...","[2948, 3034, 3210, 2947, 3247, 3448, 3578, 324..."
3,7,"{265, 531, 19, 21, 150, 153, 410, 539, 541, 28...","[410, 440, 468, 588, 508, 531, 541, 590, 357, ..."
4,8,"{6016, 4226, 260, 44555, 55820, 2959, 527, 912...","[32, 188773, 183837, 79132, 58559, 55820, 4455..."
5,9,"{3328, 3972, 33672, 1673, 3081, 2700, 5009, 21...","[32, 47, 50, 1200, 293, 318, 353, 1079, 1089, ..."
6,10,"{169984, 1, 122882, 2, 45062, 122886, 10, 6964...","[49396, 51662, 54995, 58559, 54999, 56757, 344..."
7,13,"{6, 2058, 1674, 2571, 11, 1036, 527, 912, 920,...","[2268, 1610, 1608, 1721, 1704, 3252, 2762, 285..."
8,14,"{33410, 5768, 4619, 7437, 4622, 7706, 8734, 23...","[437, 1238, 1903, 2246, 2261, 54004, 53189, 26..."
9,15,"{3079, 1032, 17, 18, 5668, 74789, 39, 56367, 6...","[7153, 6711, 5989, 5970, 5668, 5013, 4995, 497..."


## Zapis do pliku

In [102]:
user_ratings.to_parquet('../data/ratings_groupped_20pos.parquet', compression='brotli')

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na pozytywne i negatywne

In [103]:
# pos_df = df[df['rating'] >= 4].groupby('userId')['movieId'].apply(list).reset_index(name='pos')
# neg_df = df[df['rating'] < 4].groupby('userId')['movieId'].apply(list).reset_index(name='neg')
#
# print(pos_df.head())
#
# user_ratings = pd.merge(pos_df, neg_df, on='userId', how='outer')
#
# print(user_ratings.info())
#
# user_ratings['pos'] = user_ratings['pos'].apply(lambda x: x if isinstance(x, list) else [])
# user_ratings['neg'] = user_ratings['neg'].apply(lambda x: x if isinstance(x, list) else [])
#
# user_ratings = user_ratings.reset_index(drop=True)
#
# print(user_ratings.info())
# print(user_ratings.head())

## Zapis do pliku

In [104]:
# user_ratings.to_parquet('../data/ratings_groupped_ids.parquet', compression='brotli')

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na pozytywne i neg (z wagami)

In [105]:
# def rating_weight(r):
#     if r >= 4:
#         return 1.0
#     elif r >= 3:
#         return 0.5
#     else:
#         return 0.2
#
# df['weight'] = df['rating'].apply(rating_weight)
#
# pos_df = df[df['rating'] >= 4].groupby('userId')['movieId'].apply(list).reset_index(name='pos')
# neg_df = df[df['rating'] < 4].groupby('userId')['movieId'].apply(list).reset_index(name='neg')
#
# print(pos_df.head())
#
# user_ratings = pd.merge(pos_df, neg_df, on='userId', how='outer')
#
# print(user_ratings.info())
#
# user_ratings['pos'] = user_ratings['pos'].apply(lambda x: x if isinstance(x, list) else [])
# user_ratings['neg'] = user_ratings['neg'].apply(lambda x: x if isinstance(x, list) else [])
#
# user_ratings = user_ratings.reset_index(drop=True)
#
# def attach_weights(row):
#     uid = row['userId']
#     grp = df[df['userId'] == uid]
#     pos_w = [(mid, w) for mid, w in zip(
#         grp.loc[grp['rating'] >= 4, 'movieId'],
#         grp.loc[grp['rating'] >= 4, 'weight']
#     )]
#     neg_w = [(mid, w) for mid, w in zip(
#         grp.loc[grp['rating'] < 4, 'movieId'],
#         grp.loc[grp['rating'] < 4, 'weight']
#     )]
#     return pd.Series({'pos_w': pos_w, 'neg_w': neg_w})
#
# user_ratings = user_ratings.join(
#     user_ratings.apply(attach_weights, axis=1)
# )
#
# print(user_ratings.info())
# print(user_ratings.head())

## Zapis do pliku

In [106]:
# user_ratings.to_parquet('../data/ratings_groupped_ids_weights.parquet', compression='brotli')