In [37]:
import pandas as pd

In [38]:
df_ratings = pd.read_csv("ratings.csv")

In [39]:
df_ratings["timestamp"] = pd.to_datetime(df_ratings["timestamp"], unit="s")

df_ratings["day_of_week"] = df_ratings["timestamp"].dt.dayofweek
# df_ratings["day"] = df_ratings["timestamp"].dt.day
# df_ratings["month"] = df_ratings["timestamp"].dt.month
# df_ratings["year"] = df_ratings["timestamp"].dt.year
# 
# df_ratings.drop(columns=["timestamp"], inplace=True)

In [40]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int32         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(2)
memory usage: 1.1 GB


In [41]:
# df_ratings.to_csv("ratings_improved.csv", index=False)

In [42]:
df_movies = pd.read_csv(
    "Movies_final.csv",
    usecols=["movieId", "genres"]
)

In [43]:
df_ratings["timestamp"] = pd.to_datetime(df_ratings["timestamp"], unit="s")

df_ratings["day_of_week"] = df_ratings["timestamp"].dt.dayofweek

df_ratings["is_weekend"] = df_ratings["day_of_week"].isin([5, 6]).astype(int)

In [44]:
df_merged = df_ratings.merge(df_movies, on="movieId", how="left")

df_merged["genres"] = df_merged["genres"].fillna("")  # w razie pustych
df_merged["genres_list"] = df_merged["genres"].apply(lambda g: g.split(","))


In [45]:
# Grupujemy po userID
df_user_base = df_merged.groupby("userId").agg(
    num_rating = ("rating", "count"),
    avg_rating = ("rating", "mean"),
    weekend_count = ("is_weekend", "sum")  
).reset_index()

# Weekend_watcher = 1 jeśli > 50% ocen w weekend
df_user_base["weekend_ratio"] = df_user_base["weekend_count"] / df_user_base["num_rating"]
df_user_base["weekend_watcher"] = (df_user_base["weekend_ratio"] > 0.5).astype(int)

df_user_base.drop(columns=["weekend_ratio", "weekend_count"], inplace=True)


In [46]:
bins = [0, 3, 4, 5]
labels = ["negative", "neutral", "positive"]
df_user_base["type_of_viewer"] = pd.cut(
    df_user_base["avg_rating"],
    bins=bins,
    labels=labels,
    include_lowest=True
)

group_counts = df_user_base["type_of_viewer"].value_counts()

In [47]:
df_exploded = df_merged.explode("genres_list")

df_exploded = df_exploded[df_exploded["genres_list"] != ""]

df_genre_user = df_exploded.groupby(["userId", "genres_list"])["rating"].mean().reset_index()
df_genre_user.rename(columns={"genres_list": "genre", "rating": "avg_rating_genre"}, inplace=True)

df_genre_pivot = df_genre_user.pivot(
    index="userId",
    columns="genre",
    values="avg_rating_genre"
).reset_index()

df_genre_pivot.columns = ["userId"] + ["genre_" + str(col) for col in df_genre_pivot.columns if col != "userId"]
df_genre_pivot.head()


Unnamed: 0,userId,genre_ Action,genre_ Adventure,genre_ Animation,genre_ Comedy,genre_ Crime,genre_ Documentary,genre_ Drama,genre_ Family,genre_ Fantasy,...,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western
0,1,4.0,4.0,,3.35,5.0,,3.846154,2.666667,2.2,...,4.0,,4.0,5.0,4.0,5.0,,2.666667,3.5,3.0
1,2,4.5,4.0,4.833333,4.428571,3.857143,,4.533333,4.7,3.5,...,,1.0,,,5.0,,,3.0,,
2,3,3.661765,3.56,4.0,3.227273,3.444444,,3.903226,3.842105,3.0,...,,1.5,4.0,,3.285714,3.375,,4.0,,
3,4,2.625,4.0,,2.666667,2.2,,2.25,,3.0,...,,2.8,,,2.0,,,1.666667,4.0,
4,5,3.5,3.2,4.0,3.0,2.25,,3.454545,3.0,2.8,...,,2.0,,,3.0,3.0,,2.5,,


In [48]:
df_user_features = df_user_base.merge(df_genre_pivot, on="userId", how="left")

df_user_features.to_csv("user_features.csv", index=False)
