In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/ratings_improved.csv', parse_dates=['timestamp'])

In [3]:
users_train = pd.read_csv('../data/user_features_train.csv')
users_test = pd.read_csv('../data/user_features_test.csv')

In [4]:
'''
Dzielimy odpowiednio na zbior ratingow w zbiorze treningowym i testowym
'''
train_ratings = df[df['userId'].isin(users_train['userId'])]
val_ratings = df[df['userId'].isin(users_test ['userId'])]

In [5]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.2 GB
None
   userId  movieId  rating           timestamp  day_of_week
0       1       17     4.0 1999-12-03 19:24:37            4
1       1       25     1.0 1999-12-03 19:43:48            4
2       1       29     2.0 1999-11-22 00:36:16            0
3       1       30     5.0 1999-12-03 19:24:37            4
4       1       32     5.0 1999-11-22 00:00:58            0


# Sprawdzenie czy dane zgadzają się z danymi z pozostałych zbiorów danych

In [6]:
def check_with_other_datasets(df, users):
    movies = pd.read_csv('../data/Movies_final_ML.csv')

    print('Number of users matches with user_features.csv:', df['userId'].nunique() == users['userId'].nunique())
    print('Number of movies in ratings:', df['movieId'].nunique(), 'Number of movies in Movies_final.csv:', movies['movieId'].nunique())

    missing_movie_ids = df[~df['movieId'].isin(movies['movieId'])]
    print('Number of movies that are in ratings, but are not in Movies_final.csv:', missing_movie_ids['movieId'].nunique())
    
    print('Number of ratings to delete:', missing_movie_ids.shape[0])
    print('Current length:', df.shape)
    df = df[~df['movieId'].isin(missing_movie_ids['movieId'])]
    print('Length after deleting:', df.shape)

    user_counts = missing_movie_ids['userId'].value_counts().reset_index()
    user_counts.columns = ['userId', 'count']

    users_with_deleted_ratings = users[users['userId'].isin(missing_movie_ids['userId'])]
    print(users_with_deleted_ratings.head(3))
    print(len(users_with_deleted_ratings), len(users))
    merged = users_with_deleted_ratings.merge(user_counts, on='userId', how='left')
    merged['count'] = merged['count'].fillna(0)

    merged['remaining_ratings'] = merged['num_rating'] - merged['count']
    filtered = merged[merged['remaining_ratings'] < 20]

    print('Number of users to delete', filtered.shape)
    print('Deleting users that have less than 20 reviews...')
    users = users[~users['userId'].isin(filtered['userId'])]
    df = df[~df['userId'].isin(filtered['userId'])]
    
    print('Written to user_features_2.csv')

    return df, users


df, users_train = check_with_other_datasets(df, users_train)
df, users_test = check_with_other_datasets(df, users_test)

Number of users matches with user_features.csv: False
Number of movies in ratings: 84432 Number of movies in Movies_final.csv: 87585
Number of movies that are in ratings, but are not in Movies_final.csv: 0
Number of ratings to delete: 0
Current length: (32000204, 5)
Length after deleting: (32000204, 5)
Empty DataFrame
Columns: [userId, num_rating, avg_rating, weekend_watcher, type_of_viewer, genre_Action, genre_Adventure, genre_Animation, genre_Comedy, genre_Crime, genre_Documentary, genre_Drama, genre_Family, genre_Fantasy, genre_History, genre_Horror, genre_Musical, genre_Mystery, genre_Romance, genre_Science Fiction, genre_TV Movie, genre_Thriller, genre_War, genre_Western]
Index: []

[0 rows x 24 columns]
0 160758
Number of users to delete (0, 26)
Deleting users that have less than 20 reviews...
Written to user_features_2.csv
Number of users matches with user_features.csv: False
Number of movies in ratings: 84432 Number of movies in Movies_final.csv: 87585
Number of movies that are

# Usunięcie użytkowników, którzy nie mają minimum 20 pozytywnych ocen (neg beda dobieranie z nie-ogladanych)

In [7]:
POSITIVE_THRESHOLD = 4
MIN_POS = 20

pos_counts = (
    df
    .groupby('userId')['rating']
    .apply(lambda x: (x >= POSITIVE_THRESHOLD).sum())
)

valid_users = pos_counts[pos_counts >= MIN_POS].index

In [8]:
total_users = pos_counts.shape[0]
kept_users = valid_users.size
removed_users = total_users - kept_users

print(f'Total users      : {total_users}')
print(f'Users kept       : {kept_users}')
print(f'Users removed    : {removed_users} '
      f'({removed_users/total_users*100:.2f}%)')

Total users      : 200948
Users kept       : 157023
Users removed    : 43925 (21.86%)


In [9]:
df = df[df['userId'].isin(valid_users)].copy()

print('\nAfter filtering:')
print(df.info())


After filtering:
<class 'pandas.core.frame.DataFrame'>
Index: 30594246 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.4 GB
None


# Usunięcie użytkowników, którzy nie mają minimum 1 oceny pozytywnej i negatywnej
Trening User Tower bez wykorzystania Item Tower wymaga aby użytkownik miał co najmniej 1 pozytywną ocenę i jedną negatywną ocenę. Użytkownicy z samymi pozytywnymi/negatywnymi ocenami muszą zostać usunięci.

In [10]:
# POSITIVE_THRESHOLD = 4
#
# groupped = df.groupby('userId')
#
# valid_users = set()
# for userId, group in groupped:
#     pos = group[group['rating'] >= POSITIVE_THRESHOLD]
#     neg = group[group['rating'] < POSITIVE_THRESHOLD]
#     if not pos.empty and not neg.empty:
#         valid_users.add(userId)
#
# df = df[df['userId'].isin(valid_users)]
# print(df.info())

# Usunięcie usuniętych użytkowników ze zbioru danych user_features i zapis go do pliku

In [11]:
users_Tr = users_train[users_train['userId'].isin(valid_users)]

users_Tr.to_csv('../data/user_features_train_2.csv', index=False)

In [12]:
users_Te = users_test[users_test['userId'].isin(valid_users)]

users_Te.to_csv('../data/user_features_test_2.csv', index=False)

# Zapis do pliku

In [13]:
df.to_parquet('../data/ratings_clean.parquet', compression='brotli')

In [14]:
print(df.head())

   userId  movieId  rating           timestamp  day_of_week
0       1       17     4.0 1999-12-03 19:24:37            4
1       1       25     1.0 1999-12-03 19:43:48            4
2       1       29     2.0 1999-11-22 00:36:16            0
3       1       30     5.0 1999-12-03 19:24:37            4
4       1       32     5.0 1999-11-22 00:00:58            0


# Zapisanie zgrupowanych danych bez normalizacji

In [15]:
groupped = df.sort_values(['userId', 'timestamp'], ascending=[True, False])
groupped = groupped.groupby('userId').head(20).sort_values(['userId', 'timestamp'])
groupped = groupped.groupby('userId').agg(movies_seq = ('movieId', list), ratings_seq = ('rating', list), ts_seq = ('timestamp', list)).reset_index()

print(groupped.head())
print(groupped.info())

# Sprawdzenie czy wszystkie movies_seq mają minimum 20 filmów
assert groupped['movies_seq'].apply(len).eq(20).all()

groupped.to_parquet('../data/ratings_clean_groupped_20.parquet', compression='brotli')

   userId                                         movies_seq  \
0       1  [25, 1041, 1357, 1406, 1693, 307, 1056, 1228, ...   
1       2  [31, 193, 276, 551, 237, 585, 508, 218, 381, 2...   
2       3  [5349, 4896, 5816, 6333, 4016, 6365, 6539, 529...   
3       7  [19, 21, 34, 367, 410, 435, 500, 586, 39, 432,...   
4       8  [296, 79132, 58559, 260, 912, 2858, 2329, 6016...   

                                         ratings_seq  \
0  [1.0, 5.0, 1.0, 2.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...   
1  [5.0, 3.0, 4.0, 4.0, 5.0, 4.0, 5.0, 5.0, 5.0, ...   
2  [3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 3.0, 5.0, 4.0, ...   
3  [3.0, 5.0, 4.0, 3.0, 4.0, 2.0, 4.0, 3.0, 3.0, ...   
4  [4.5, 4.5, 4.0, 4.0, 4.0, 3.5, 4.5, 4.5, 5.0, ...   

                                              ts_seq  
0  [1999-12-03 19:43:48, 1999-12-03 20:32:14, 199...  
1  [1996-07-03 19:58:22, 1996-07-03 19:58:22, 199...  
2  [2004-05-13 21:59:15, 2004-05-13 21:59:18, 200...  
3  [1996-08-18 16:32:12, 1996-08-18 16:32:12, 199...  
4  

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na tylko pozytywne

In [19]:
pos_s = (df.loc[df['rating'] >= 4, ['userId', 'movieId']].groupby('userId')['movieId'].agg(list).rename('pos'))
seen_s = (df.groupby('userId')['movieId'].agg(lambda x: set(x)).rename('seen'))

user_ratings = (seen_s.to_frame().join(pos_s.to_frame()))

# user_ratings['pos'] = user_ratings['pos'].apply(
#     lambda v: v if isinstance(v, list) else []
# )

user_ratings = user_ratings.reset_index()
user_ratings = user_ratings.rename(columns={'index': 'userId'})

assert 'userId' in user_ratings.columns
assert user_ratings[['seen', 'pos']].applymap(type).isin({set, list}).all().all()
print(user_ratings.dtypes)
print(user_ratings.head())


userId     int64
seen      object
pos       object
dtype: object
   userId                                               seen  \
0       1  {3078, 527, 2064, 17, 1041, 2067, 3088, 25, 29...   
1       2  {520, 783, 276, 153, 282, 539, 31, 34, 39, 296...   
2       3  {2, 1031, 1544, 10, 11, 527, 17, 534, 26, 539,...   
3       7  {265, 19, 531, 21, 150, 153, 410, 539, 541, 28...   
4       8  {6016, 4226, 260, 44555, 55820, 527, 912, 2959...   

                                                 pos  
0  [17, 30, 32, 80, 111, 166, 176, 232, 260, 302,...  
1  [31, 34, 39, 48, 185, 186, 207, 216, 218, 222,...  
2  [10, 11, 17, 26, 62, 110, 150, 151, 161, 260, ...  
3  [21, 34, 150, 162, 208, 235, 265, 292, 296, 31...  
4  [32, 47, 260, 296, 527, 593, 608, 858, 912, 17...  


  assert user_ratings[['seen', 'pos']].applymap(type).isin({set, list}).all().all()


In [20]:
user_ratings.info()
user_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157023 entries, 0 to 157022
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  157023 non-null  int64 
 1   seen    157023 non-null  object
 2   pos     157023 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.6+ MB


Unnamed: 0,userId,seen,pos
0,1,"{3078, 527, 2064, 17, 1041, 2067, 3088, 25, 29...","[17, 30, 32, 80, 111, 166, 176, 232, 260, 302,..."
1,2,"{520, 783, 276, 153, 282, 539, 31, 34, 39, 296...","[31, 34, 39, 48, 185, 186, 207, 216, 218, 222,..."
2,3,"{2, 1031, 1544, 10, 11, 527, 17, 534, 26, 539,...","[10, 11, 17, 26, 62, 110, 150, 151, 161, 260, ..."
3,7,"{265, 19, 531, 21, 150, 153, 410, 539, 541, 28...","[21, 34, 150, 162, 208, 235, 265, 292, 296, 31..."
4,8,"{6016, 4226, 260, 44555, 55820, 527, 912, 2959...","[32, 47, 260, 296, 527, 593, 608, 858, 912, 17..."
5,9,"{3328, 3972, 33672, 1673, 3081, 8957, 2700, 50...","[32, 47, 50, 223, 293, 318, 353, 1079, 1089, 1..."
6,10,"{169984, 1, 2, 122882, 45062, 122886, 10, 6964...","[10, 47, 50, 110, 180, 223, 260, 296, 318, 344..."
7,13,"{6, 1674, 11, 1036, 2058, 2571, 527, 912, 920,...","[6, 110, 161, 225, 318, 356, 380, 457, 475, 52..."
8,14,"{33410, 5768, 4619, 7437, 4622, 7706, 4510, 23...","[437, 1238, 1903, 2246, 2261, 2335, 2918, 3804..."
9,15,"{3079, 1032, 17, 18, 5668, 74789, 39, 56367, 1...","[17, 147, 357, 500, 587, 597, 909, 1032, 1079,..."


## Zapis do pliku

In [21]:
user_ratings.to_parquet('../data/ratings_groupped_20pos.parquet', compression='brotli')

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na pozytywne i negatywne

In [17]:
# pos_df = df[df['rating'] >= 4].groupby('userId')['movieId'].apply(list).reset_index(name='pos')
# neg_df = df[df['rating'] < 4].groupby('userId')['movieId'].apply(list).reset_index(name='neg')
#
# print(pos_df.head())
#
# user_ratings = pd.merge(pos_df, neg_df, on='userId', how='outer')
#
# print(user_ratings.info())
#
# user_ratings['pos'] = user_ratings['pos'].apply(lambda x: x if isinstance(x, list) else [])
# user_ratings['neg'] = user_ratings['neg'].apply(lambda x: x if isinstance(x, list) else [])
#
# user_ratings = user_ratings.reset_index(drop=True)
#
# print(user_ratings.info())
# print(user_ratings.head())

   userId                                                pos
0       1  [17, 30, 32, 80, 111, 166, 176, 232, 260, 302,...
1       2  [31, 34, 39, 48, 185, 186, 207, 216, 218, 222,...
2       3  [10, 11, 17, 26, 62, 110, 150, 151, 161, 260, ...
3       4                [223, 1272, 2115, 2770, 2841, 3175]
4       5  [10, 110, 161, 165, 349, 356, 364, 380, 434, 4...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198928 entries, 0 to 198927
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  198928 non-null  int64 
 1   pos     198928 non-null  object
 2   neg     198928 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198928 entries, 0 to 198927
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  198928 non-null  int64 
 1   pos     198928 non-null  object
 2   neg     198928 non-nu

## Zapis do pliku

In [18]:
# user_ratings.to_parquet('../data/ratings_groupped_ids.parquet', compression='brotli')

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na pozytywne i neg (z wagami)

In [None]:
# def rating_weight(r):
#     if r >= 4:
#         return 1.0
#     elif r >= 3:
#         return 0.5
#     else:
#         return 0.2
#
# df['weight'] = df['rating'].apply(rating_weight)
#
# pos_df = df[df['rating'] >= 4].groupby('userId')['movieId'].apply(list).reset_index(name='pos')
# neg_df = df[df['rating'] < 4].groupby('userId')['movieId'].apply(list).reset_index(name='neg')
#
# print(pos_df.head())
#
# user_ratings = pd.merge(pos_df, neg_df, on='userId', how='outer')
#
# print(user_ratings.info())
#
# user_ratings['pos'] = user_ratings['pos'].apply(lambda x: x if isinstance(x, list) else [])
# user_ratings['neg'] = user_ratings['neg'].apply(lambda x: x if isinstance(x, list) else [])
#
# user_ratings = user_ratings.reset_index(drop=True)
#
# def attach_weights(row):
#     uid = row['userId']
#     grp = df[df['userId'] == uid]
#     pos_w = [(mid, w) for mid, w in zip(
#         grp.loc[grp['rating'] >= 4, 'movieId'],
#         grp.loc[grp['rating'] >= 4, 'weight']
#     )]
#     neg_w = [(mid, w) for mid, w in zip(
#         grp.loc[grp['rating'] < 4, 'movieId'],
#         grp.loc[grp['rating'] < 4, 'weight']
#     )]
#     return pd.Series({'pos_w': pos_w, 'neg_w': neg_w})
#
# user_ratings = user_ratings.join(
#     user_ratings.apply(attach_weights, axis=1)
# )
#
# print(user_ratings.info())
# print(user_ratings.head())

   userId                                                pos
0       1  [17, 30, 32, 80, 111, 166, 176, 232, 260, 302,...
1       2  [31, 34, 39, 48, 185, 186, 207, 216, 218, 222,...
2       3  [10, 11, 17, 26, 62, 110, 150, 151, 161, 260, ...
3       4                [223, 1272, 2115, 2770, 2841, 3175]
4       5  [10, 110, 161, 165, 349, 356, 364, 380, 434, 4...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198928 entries, 0 to 198927
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  198928 non-null  int64 
 1   pos     198928 non-null  object
 2   neg     198928 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.6+ MB
None


## Zapis do pliku

In [21]:
# user_ratings.to_parquet('../data/ratings_groupped_ids_weights.parquet', compression='brotli')