# Wstępne przetworzenie danych odnoszących się do ocen (ratingów) użytkowników oraz ich agregacja do user_features

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
# Load the raw ratings table from the working directory.
df_ratings = pd.read_csv("ratings.csv")

In [6]:
# Interpret the raw timestamp as seconds since the epoch and store it as a datetime.
df_ratings["timestamp"] = pd.to_datetime(df_ratings["timestamp"], unit="s")

# Day of the week of each rating (pandas convention: Monday=0 ... Sunday=6).
df_ratings["day_of_week"] = df_ratings["timestamp"].dt.dayofweek
# Currently disabled: additional calendar parts and dropping the timestamp column.
# df_ratings["day"] = df_ratings["timestamp"].dt.day
# df_ratings["month"] = df_ratings["timestamp"].dt.month
# df_ratings["year"] = df_ratings["timestamp"].dt.year
#
# df_ratings.drop(columns=["timestamp"], inplace=True)

In [7]:
# Print the column dtypes and memory footprint after the timestamp conversion.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int32         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(2)
memory usage: 1.1 GB


In [8]:
# Additionally persist the ratings with the converted timestamp columns.
df_ratings.to_csv("ratings_improved.csv", index=False)

# Przygotowujemy cechy pod warm-start


In [10]:
# Only the join key and the genre string are needed from the movie table.
df_movies = pd.read_csv("Movies_final_ML.csv", usecols=["movieId", "genres"])

In [11]:
# NOTE(review): the next two statements repeat the conversion done in an earlier
# cell; they are kept so this cell leaves df_ratings in the same state as before.
df_ratings["timestamp"] = pd.to_datetime(df_ratings["timestamp"], unit="s")
df_ratings["day_of_week"] = df_ratings["timestamp"].dt.dayofweek

# dayofweek runs Monday=0 ... Sunday=6, so values 5 and 6 are Saturday and Sunday.
df_ratings["is_weekend"] = (df_ratings["day_of_week"] >= 5).astype(int)

In [12]:
# Attach the genre string of the rated movie to every rating row.
# validate="many_to_one" makes pandas raise MergeError if a movieId occurs more
# than once in df_movies; without the check such duplicates would silently
# multiply rating rows and distort every per-user count and average below.
df_merged_Tr = df_ratings.merge(
    df_movies,
    on="movieId",
    how="left",
    validate="many_to_one",
)

# Ratings whose movie has no genre entry get an empty genre string.
df_merged_Tr["genres"] = df_merged_Tr["genres"].fillna("")
# Split the comma-separated genre string into a list (an empty string yields [""]).
df_merged_Tr["genres_list"] = df_merged_Tr["genres"].str.split(",")

In [13]:
# Per-user aggregates: number of ratings, mean rating and number of weekend ratings.
df_user_base_Tr = df_merged_Tr.groupby("userId", as_index=False).agg(
    num_rating=("rating", "count"),
    avg_rating=("rating", "mean"),
    weekend_count=("is_weekend", "sum"),
)

# weekend_watcher = 1 when more than 50% of the user's ratings fall on a weekend.
df_user_base_Tr["weekend_watcher"] = (
    (df_user_base_Tr["weekend_count"] / df_user_base_Tr["num_rating"]) > 0.5
).astype(int)

# The raw weekend count was only needed to derive the flag.
df_user_base_Tr = df_user_base_Tr.drop(columns=["weekend_count"])



In [14]:
# Bucket users by their mean rating:
#   [0, 3] -> negative, (3, 4] -> neutral, (4, 5] -> positive
labels = ["negative", "neutral", "positive"]
bins = [0, 3, 4, 5]
df_user_base_Tr["type_of_viewer"] = pd.cut(
    df_user_base_Tr["avg_rating"], bins, labels=labels, include_lowest=True
)

# Number of users in each viewer type.
group_counts_Tr = df_user_base_Tr["type_of_viewer"].value_counts()

In [15]:
# One row per (rating, genre) pair.
df_exploded_Tr = df_merged_Tr.explode("genres_list")

# Strip whitespace so that e.g. "Action" and " Action" do not become two columns.
df_exploded_Tr["genres_list"] = df_exploded_Tr["genres_list"].str.strip()

# Discard the empty genre produced by movies without any genre string.
df_exploded_Tr = df_exploded_Tr[df_exploded_Tr["genres_list"] != ""]

# Mean rating of every user within every genre, in long format.
df_genre_user_Tr = (
    df_exploded_Tr.groupby(["userId", "genres_list"])["rating"]
    .mean()
    .reset_index()
    .rename(columns={"genres_list": "genre", "rating": "avg_rating_genre"})
)

# Wide format: one row per user, one column per genre (NaN = genre never rated).
df_genre_pivot_Tr = df_genre_user_Tr.pivot(
    index="userId", columns="genre", values="avg_rating_genre"
).reset_index()

# Prefix every genre column with "genre_", leaving the key column untouched.
df_genre_pivot_Tr.columns = [
    col if col == "userId" else "genre_" + str(col)
    for col in df_genre_pivot_Tr.columns
]
df_genre_pivot_Tr.head()


Unnamed: 0,userId,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_History,genre_Horror,genre_Musical,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western
0,1,4.133333,3.333333,,3.339623,4.285714,3.0,3.61,2.666667,2.5,3.526316,,4.0,4.272727,3.487805,4.0,,3.666667,3.555556,3.0
1,2,4.0,4.142857,4.875,4.130435,4.0,,4.615385,4.692308,4.111111,,1.0,,4.75,4.823529,,,4.3,3.0,
2,3,3.583333,3.605263,3.961538,3.186275,3.346154,,4.105263,3.833333,3.0,4.875,1.5,4.25,3.875,3.637931,3.441176,,3.307692,4.0,3.75
3,4,2.454545,3.0,,3.0,2.166667,,2.222222,,3.0,2.5,2.375,,2.5,2.0,2.75,,2.142857,2.5,
4,5,3.666667,3.333333,3.666667,3.0,2.625,,3.285714,3.5,3.0,3.5,2.0,,3.333333,3.25,3.2,,3.466667,4.0,3.0


In [17]:
# Combine the per-user base features with the per-genre mean ratings; the left
# join keeps every user from df_user_base_Tr.
df_user_features_Tr = pd.merge(
    df_user_base_Tr,
    df_genre_pivot_Tr,
    how="left",
    on="userId",
)

# Persist the warm-start user features.
df_user_features_Tr.to_csv("../data/user_features_warm.csv", index=False)

In [18]:
# Inspect dtypes and non-null counts, then preview the first rows of the feature table.
df_user_features_Tr.info()
df_user_features_Tr.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200948 entries, 0 to 200947
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   userId                 200948 non-null  int64   
 1   num_rating             200948 non-null  int64   
 2   avg_rating             200948 non-null  float64 
 3   weekend_watcher        200948 non-null  int32   
 4   type_of_viewer         200948 non-null  category
 5   genre_Action           199710 non-null  float64 
 6   genre_Adventure        199973 non-null  float64 
 7   genre_Animation        174025 non-null  float64 
 8   genre_Comedy           200478 non-null  float64 
 9   genre_Crime            198776 non-null  float64 
 10  genre_Documentary      73349 non-null   float64 
 11  genre_Drama            200882 non-null  float64 
 12  genre_Family           188525 non-null  float64 
 13  genre_Fantasy          196391 non-null  float64 
 14  genre_History       

Unnamed: 0,userId,num_rating,avg_rating,weekend_watcher,type_of_viewer,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,...,genre_History,genre_Horror,genre_Musical,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western
0,1,141,3.531915,0,neutral,4.133333,3.333333,,3.339623,4.285714,...,3.526316,,4.0,4.272727,3.487805,4.0,,3.666667,3.555556,3.0
1,2,52,4.269231,0,positive,4.0,4.142857,4.875,4.130435,4.0,...,,1.0,,4.75,4.823529,,,4.3,3.0,
2,3,147,3.588435,0,neutral,3.583333,3.605263,3.961538,3.186275,3.346154,...,4.875,1.5,4.25,3.875,3.637931,3.441176,,3.307692,4.0,3.75
3,4,27,2.62963,0,negative,2.454545,3.0,,3.0,2.166667,...,2.5,2.375,,2.5,2.0,2.75,,2.142857,2.5,
4,5,33,3.272727,0,neutral,3.666667,3.333333,3.666667,3.0,2.625,...,3.5,2.0,,3.333333,3.25,3.2,,3.466667,4.0,3.0
