In [85]:
import pandas as pd

In [86]:
df = pd.read_csv('../datasets/ratings_improved.csv', parse_dates=['timestamp'])

In [87]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.2 GB
None
   userId  movieId  rating           timestamp  day_of_week
0       1       17     4.0 1999-12-03 19:24:37            4
1       1       25     1.0 1999-12-03 19:43:48            4
2       1       29     2.0 1999-11-22 00:36:16            0
3       1       30     5.0 1999-12-03 19:24:37            4
4       1       32     5.0 1999-11-22 00:00:58            0


# Sprawdzenie czy dane zgadzają się z danymi z pozostałych zbiorów danych

In [88]:
users = pd.read_csv('../datasets/user_features.csv')

def check_with_other_datasets(df, users):
    movies = pd.read_csv('../datasets/Movies_final.csv')

    print('Number of users matches with user_features.csv:', df['userId'].nunique() == users['userId'].nunique())
    print('Number of movies in ratings:', df['movieId'].nunique(), 'Number of movies in Movies_final.csv:', movies['movieId'].nunique())

    missing_movie_ids = df[~df['movieId'].isin(movies['movieId'])]
    print('Number of movies that are in ratings, but are not in Movies_final.csv:', missing_movie_ids['movieId'].nunique())
    
    print('Number of ratings to delete:', missing_movie_ids.shape[0])
    print('Current length:', df.shape)
    df = df[~df['movieId'].isin(missing_movie_ids['movieId'])]
    print('Length after deleting:', df.shape)

    user_counts = missing_movie_ids['userId'].value_counts().reset_index()
    user_counts.columns = ['userId', 'count']

    users_with_deleted_ratings = users[users['userId'].isin(missing_movie_ids['userId'])]
    print(users_with_deleted_ratings.head(3))
    print(len(users_with_deleted_ratings), len(users))
    merged = users_with_deleted_ratings.merge(user_counts, on='userId', how='left')
    merged['count'] = merged['count'].fillna(0)

    merged['remaining_ratings'] = merged['num_rating'] - merged['count']
    filtered = merged[merged['remaining_ratings'] < 20]

    print('Number of users to delete', filtered.shape)
    print('Deleting users that have less than 20 reviews...')
    users = users[~users['userId'].isin(filtered['userId'])]
    
    print('Written to user_features_2.csv')

    return df, users


df, users = check_with_other_datasets(df, users)

Number of users matches with user_features.csv: True
Number of movies in ratings: 84432 Number of movies in Movies_final.csv: 86493
Number of movies that are in ratings, but are not in Movies_final.csv: 1075
Number of ratings to delete: 77897
Current length: (32000204, 5)
Length after deleting: (31922307, 5)
    userId  num_rating  avg_rating  weekend_watcher type_of_viewer  \
9       10         660    2.787121                1       negative   
11      12          23    2.891304                1       negative   
27      28        2842    3.421886                0        neutral   

    genre_Action  genre_Adventure  genre_Animation  genre_Comedy  genre_Crime  \
9       2.820669         2.850622         2.891304      2.577640     2.918803   
11      0.750000         1.400000         2.666667      3.277778     3.000000   
27      3.224090         3.373148         3.897959      3.290830     3.399390   

    ...  genre_History  genre_Horror  genre_Music  genre_Mystery  \
9   ...       2.

# Usunięcie użytkowników, którzy nie mają minimum 1 oceny pozytywnej i negatywnej
Trening User Tower bez wykorzystania Item Tower wymaga aby użytkownik miał co najmniej 1 pozytywną ocenę i jedną negatywną ocenę. Użytkownicy z samymi pozytywnymi/negatywnymi ocenami muszą zostać usunięci.

In [89]:
POSITIVE_THRESHOLD = 4.0

groupped = df.groupby('userId')

valid_users = set()
for userId, group in groupped:
    pos = group[group['rating'] >= POSITIVE_THRESHOLD]
    neg = group[group['rating'] < POSITIVE_THRESHOLD]
    if not pos.empty and not neg.empty:
        valid_users.add(userId)

df = df[df['userId'].isin(valid_users)]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 31820663 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.4 GB
None


# Usunięcie usuniętych użytkowników ze zbioru danych user_features i zapis go do pliku

In [90]:
# Delete users that were removed in the previous notebook cell
users = users[users['userId'].isin(valid_users)]

users.to_csv('../datasets/user_features_2.csv', index=False)

# Zapis do pliku

In [91]:
df.to_parquet('../datasets/ratings_clean.parquet', compression='brotli')

# Standaryzacja ocen oraz timestampów

In [92]:
from sklearn.preprocessing import StandardScaler

# convert date from YYYY-MM-DD format to UNIX timestamp
df['timestamp'] = df['timestamp'].astype('int64')

columns_to_normalize = ['rating', 'timestamp']

scaler = StandardScaler()
normalized = df.copy()
normalized[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

print(df.head(3))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timestamp'] = df['timestamp'].astype('int64')


   userId  movieId  rating           timestamp  day_of_week
0       1       17     4.0  944249077000000000            4
1       1       25     1.0  944250228000000000            4
2       1       29     2.0  943230976000000000            0


# Zgrupowanie danych per usera i zachowanie ostatnich 20 ocen

In [93]:
groupped = normalized.sort_values(['userId', 'timestamp'], ascending=[True, False])
groupped = groupped.groupby('userId').head(20).sort_values(['userId', 'timestamp'])
groupped = groupped.groupby('userId').agg(movies_seq = ('movieId', list), ratings_seq = ('rating', list), ts_seq = ('timestamp', list)).reset_index()

print(groupped.head())
print(groupped.info())

   userId                                         movies_seq  \
0       1  [25, 1041, 1357, 1406, 1693, 307, 1056, 1228, ...   
1       2  [31, 193, 276, 551, 237, 585, 508, 218, 381, 2...   
2       3  [5349, 4896, 5816, 6333, 4016, 6365, 6539, 529...   
3       4  [2683, 2699, 2710, 2770, 223, 2722, 2881, 3203...   
4       5  [231, 316, 161, 292, 318, 329, 434, 10, 185, 2...   

                                         ratings_seq  \
0  [-2.3985095550091526, 1.383608362031683, -2.39...   
1  [1.383608362031683, -0.5074505964887347, 0.438...   
2  [-0.5074505964887347, -0.5074505964887347, -0....   
3  [-0.5074505964887347, -1.4529800757489435, -1....   
4  [-1.4529800757489435, -0.5074505964887347, 0.4...   

                                              ts_seq  
0  [-1.2879172458574584, -1.2879058953379758, -1....  
1  [-1.7090751309183834, -1.7090751309183834, -1....  
2  [-0.7401733328343324, -0.7401733211166591, -0....  
3  [-1.22450412377266, -1.22450412377266, -1.2245...  
4  

# Zapis zgrupowanych danych do pliku

In [94]:
groupped.to_parquet('../datasets/ratings_clean_groupped.parquet', compression='brotli')

# Zgrupowanie danych po userId i rozdzielenie ocenionych filmów na pozytywne i negatywne

In [95]:
pos_df = df[df['rating'] >= 4].groupby('userId')['movieId'].apply(list).reset_index(name='pos')
neg_df = df[df['rating'] < 4].groupby('userId')['movieId'].apply(list).reset_index(name='neg')

print(pos_df.head())

user_ratings = pd.merge(pos_df, neg_df, on='userId', how='outer')

print(user_ratings.info())

user_ratings['pos'] = user_ratings['pos'].apply(lambda x: x if isinstance(x, list) else [])
user_ratings['neg'] = user_ratings['neg'].apply(lambda x: x if isinstance(x, list) else [])

user_ratings = user_ratings.reset_index(drop=True)

print(user_ratings.info())
print(user_ratings.head())

   userId                                                pos
0       1  [17, 30, 32, 80, 111, 166, 176, 232, 260, 302,...
1       2  [31, 34, 39, 48, 185, 186, 207, 216, 218, 222,...
2       3  [10, 11, 17, 26, 62, 110, 150, 151, 161, 260, ...
3       4                [223, 1272, 2115, 2770, 2841, 3175]
4       5  [10, 110, 161, 165, 349, 356, 364, 380, 434, 4...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198921 entries, 0 to 198920
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  198921 non-null  int64 
 1   pos     198921 non-null  object
 2   neg     198921 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198921 entries, 0 to 198920
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  198921 non-null  int64 
 1   pos     198921 non-null  object
 2   neg     198921 non-nu

## Zapis do pliku

In [96]:
user_ratings.to_parquet('../datasets/ratings_groupped_ids.parquet', compression='brotli')