In [52]:
from pathlib import Path

import pandas as pd

In [53]:
# Load the ratings table; the relative path is resolved against the current working directory.
# Note: `timestamp` is loaded as object dtype (see the df.info() output below) because
# no date parsing is requested here.
df = pd.read_csv('../datasets/ratings_improved.csv')

In [54]:
# DataFrame.info() prints its report itself and returns None, so it is called bare;
# wrapping it in print() adds a stray "None" line after the report.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   timestamp    object 
 4   day_of_week  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 1.2+ GB
None
   userId  movieId  rating            timestamp  day_of_week
0       1       17     4.0  1999-12-03 19:24:37            4
1       1       25     1.0  1999-12-03 19:43:48            4
2       1       29     2.0  1999-11-22 00:36:16            0
3       1       30     5.0  1999-12-03 19:24:37            4
4       1       32     5.0  1999-11-22 00:00:58            0


# Sprawdzenie brakujących wartości
No column contains missing values: every per-column count from `df.isna().sum()` below is 0.

In [55]:
# Per-column tally of missing entries; all zeros means the table is complete.
df.isna().sum(axis=0)

userId         0
movieId        0
rating         0
timestamp      0
day_of_week    0
dtype: int64

# Sprawdzenie duplikatów

In [56]:
# Select the rows flagged as exact repeats of an earlier row; an empty frame means no duplicates.
df.loc[df.duplicated()]

Unnamed: 0,userId,movieId,rating,timestamp,day_of_week


# Sprawdzenie, czy dane zgadzają się z danymi z pozostałych zbiorów danych

In [57]:
def check_with_other_datasets(
    df,
    movies_path='../datasets/Movies_final.csv',
    users_path='../datasets/user_features.csv',
    output_path='../datasets/user_features_2.csv',
    min_ratings=20,
):
    """Reconcile the ratings table with the movie and user datasets.

    Ratings whose ``movieId`` is absent from the movies file are dropped.
    Users whose ``num_rating`` minus their number of dropped ratings falls
    below ``min_ratings`` are removed from the user table, which is then
    written to ``output_path``. Summary figures are printed along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with at least ``userId`` and ``movieId`` columns.
    movies_path : str or path-like
        CSV file with a ``movieId`` column.
    users_path : str or path-like
        CSV file with ``userId`` and ``num_rating`` columns.
    output_path : str or path-like
        Destination CSV for the filtered user table.
    min_ratings : int
        Minimum number of remaining ratings a user needs to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``movieId`` exists in the movies file. The
        frame passed in is not modified.

    Notes
    -----
    Only rows are removed from the user table; ``num_rating`` and every other
    column are written out unchanged, i.e. they are not recomputed.

    Call this on the unfiltered ratings. On ratings that were already
    filtered, nothing is reported missing, no user is removed, and the output
    file is rewritten with every user.
    """
    movies = pd.read_csv(movies_path)
    users = pd.read_csv(users_path)

    print('Number of users matches with user_features.csv:', df['userId'].nunique() == users['userId'].nunique())
    print('Number of movies in ratings:', df['movieId'].nunique(), 'Number of movies in Movies_final.csv:', movies['movieId'].nunique())

    # Compute the membership mask once; it is reused for both the "missing"
    # and the "kept" selections instead of scanning the ratings twice.
    movie_is_known = df['movieId'].isin(movies['movieId'])
    missing_movie_ids = df[~movie_is_known]
    print('Number of movies that are in ratings, but are not in Movies_final.csv:', missing_movie_ids['movieId'].nunique())

    print('Number of ratings to delete:', missing_movie_ids.shape[0])
    print('Current length:', df.shape)
    df = df[movie_is_known]
    print('Length after deleting:', df.shape)

    # Number of dropped ratings per affected user.
    user_counts = missing_movie_ids['userId'].value_counts().reset_index()
    user_counts.columns = ['userId', 'count']

    users_with_deleted_ratings = users[users['userId'].isin(missing_movie_ids['userId'])]
    print(users_with_deleted_ratings.head(3))
    print(len(users_with_deleted_ratings), len(users))
    merged = users_with_deleted_ratings.merge(user_counts, on='userId', how='left')
    # Defensive: a user without a matching count row is treated as having lost nothing.
    merged['count'] = merged['count'].fillna(0)

    merged['remaining_ratings'] = merged['num_rating'] - merged['count']
    filtered = merged[merged['remaining_ratings'] < min_ratings]

    print('Number of users to delete', filtered.shape)
    print(f'Deleting users that have less than {min_ratings} reviews...')
    users = users[~users['userId'].isin(filtered['userId'])]

    # index=False: after filtering, the row index is just leftover positions
    # from the input file; writing it would add an unnamed column that the
    # input user table does not have.
    users.to_csv(output_path, index=False)
    print('Written to', Path(output_path).name)

    return df


# Drop ratings for movies missing from Movies_final.csv and write user_features_2.csv.
# NOTE: this rebinds `df` to the filtered frame, so the cell is not idempotent. Run again on
# the already-filtered ratings, the function finds no missing movies, removes no users, and
# overwrites user_features_2.csv with the complete user list. Re-run the loading cell first.
df = check_with_other_datasets(df)

Number of users matches with user_features.csv: True
Number of movies in ratings: 84432 Number of movies in Movies_final.csv: 86493
Number of movies that are in ratings, but are not in Movies_final.csv: 1075
Number of ratings to delete: 77897
Current length: (32000204, 5)
Length after deleting: (31922307, 5)
    userId  num_rating  avg_rating  weekend_watcher type_of_viewer  \
9       10         660    2.787121                1       negative   
11      12          23    2.891304                1       negative   
27      28        2842    3.421886                0        neutral   

    genre_Action  genre_Adventure  genre_Animation  genre_Comedy  genre_Crime  \
9       2.820669         2.850622         2.891304      2.577640     2.918803   
11      0.750000         1.400000         2.666667      3.277778     3.000000   
27      3.224090         3.373148         3.897959      3.290830     3.399390   

    ...  genre_History  genre_Horror  genre_Music  genre_Mystery  \
9   ...       2.