# Wstępne przetworzenie danych odnoszących się do ratingów userów oraz ich podział do user_features

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [12]:
# Load the raw user ratings table from disk.
df_ratings = pd.read_csv(filepath_or_buffer="ratings.csv")

In [13]:
# Interpret the raw timestamp values as seconds since the Unix epoch.
df_ratings["timestamp"] = pd.to_datetime(arg=df_ratings["timestamp"], unit="s")

# Day of the week of each rating (Monday=0 ... Sunday=6).
df_ratings["day_of_week"] = df_ratings["timestamp"].dt.dayofweek

# Note: separate day / month / year columns are not derived here, and the
# "timestamp" column itself is kept in the frame.

In [14]:
# Print the column dtypes and memory footprint of the ratings frame.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   userId       int64         
 1   movieId      int64         
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   day_of_week  int32         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(2)
memory usage: 1.1 GB


In [15]:
# Persist the ratings with the converted timestamp (and the derived
# day_of_week column) as a separate file; the row index is not written.
df_ratings.to_csv(path_or_buf="ratings_improved.csv", index=False)

# Przygotowujemy cechy odpowiednio dla train i test


In [None]:
# Split at the user level: all ratings of a given user land entirely in either
# the train or the test partition (20% of the unique users go to test; the
# seed is fixed so the split is reproducible).
train_users, test_users = train_test_split(
    df_ratings["userId"].unique(),
    test_size=0.2,
    random_state=42,
)

# .copy() detaches each partition from df_ratings, so the column assignments
# performed on these frames further down do not raise SettingWithCopyWarning.
df_train_ratings = df_ratings[df_ratings["userId"].isin(train_users)].copy()
df_test_ratings = df_ratings[df_ratings["userId"].isin(test_users)].copy()

In [17]:
# Only the movie id (join key) and the genre string are read from the movies file.
df_movies = pd.read_csv("Movies_final_ML.csv", usecols=["movieId", "genres"])

## Train

In [None]:
# Build the time-based columns on a new frame instead of assigning into the
# filtered slice of df_ratings: this avoids SettingWithCopyWarning and keeps
# the cell safe to re-run. Each lambda sees the columns produced before it.
df_train_ratings = df_train_ratings.assign(
    # Leaves the column unchanged when it is already datetime64; otherwise the
    # values are interpreted as seconds since the Unix epoch.
    timestamp=lambda d: pd.to_datetime(d["timestamp"], unit="s"),
    # Monday=0 ... Sunday=6
    day_of_week=lambda d: d["timestamp"].dt.dayofweek,
    # 1 for ratings given on Saturday (5) or Sunday (6), 0 otherwise
    is_weekend=lambda d: d["day_of_week"].isin([5, 6]).astype(int),
)

In [None]:
# Attach each rated movie's genre string to the training ratings. This must go
# through DataFrame.merge (a DataFrame object itself is not callable). The left
# join keeps every rating even when its movieId is absent from df_movies.
df_merged_Tr = df_train_ratings.merge(df_movies, on="movieId", how="left")

# Ratings without a genre entry get an empty string instead of NaN.
df_merged_Tr["genres"] = df_merged_Tr["genres"].fillna("")
# Comma-separated genre string -> list of genre names ("" becomes [""]).
df_merged_Tr["genres_list"] = df_merged_Tr["genres"].str.split(",")

In [None]:
# Per-user base statistics for the training users: number of ratings, mean
# rating, and a binary weekend_watcher flag that is 1 when more than 50% of
# the user's ratings were given on a weekend.
df_user_base_Tr = (
    df_merged_Tr
    .groupby("userId")
    .agg(
        num_rating=("rating", "count"),
        avg_rating=("rating", "mean"),
        weekend_count=("is_weekend", "sum"),
    )
    .reset_index()
    .assign(
        weekend_watcher=lambda u: (
            (u["weekend_count"] / u["num_rating"]) > 0.5
        ).astype(int)
    )
    # The helper count is only needed to derive the flag.
    .drop(columns=["weekend_count"])
)


In [None]:
# Bucket users by their mean rating into [0, 3], (3, 4] and (4, 5]
# (intervals are right-closed; include_lowest also admits exactly 0).
labels = ["negative", "neutral", "positive"]
bins = [0, 3, 4, 5]

df_user_base_Tr = df_user_base_Tr.assign(
    type_of_viewer=pd.cut(
        df_user_base_Tr["avg_rating"], bins=bins, labels=labels, include_lowest=True
    )
)

# Number of training users per viewer type.
group_counts_Tr = df_user_base_Tr["type_of_viewer"].value_counts()

In [None]:
# One row per (rating, genre): unpack the genre lists, trim whitespace so that
# e.g. "Action" and " Action" do not become two separate columns, and discard
# rows whose genre is an empty string.
df_exploded_Tr = (
    df_merged_Tr
    .explode("genres_list")
    .assign(genres_list=lambda d: d["genres_list"].str.strip())
    .loc[lambda d: d["genres_list"] != ""]
)

# Mean rating of every user within every genre (long format).
df_genre_user_Tr = (
    df_exploded_Tr
    .groupby(["userId", "genres_list"])["rating"]
    .mean()
    .reset_index()
    .rename(columns={"genres_list": "genre", "rating": "avg_rating_genre"})
)

# Wide format: one row per user, one "genre_<name>" column per genre; genres
# the user never rated stay NaN.
df_genre_pivot_Tr = (
    df_genre_user_Tr
    .pivot(index="userId", columns="genre", values="avg_rating_genre")
    .add_prefix("genre_")
    .rename_axis(None, axis="columns")
    .reset_index()
)
df_genre_pivot_Tr.head()


In [None]:
# Combine the per-user base statistics with the per-genre mean ratings; the
# left join keeps every user present in the base table.
df_user_features_Tr = pd.merge(
    df_user_base_Tr,
    df_genre_pivot_Tr,
    how="left",
    on="userId",
)

# Store the training user features without the row index.
df_user_features_Tr.to_csv("user_features_train.csv", index=False)

## Test

In [18]:
# Build the time-based columns on a new frame instead of assigning into the
# filtered slice of df_ratings: this avoids SettingWithCopyWarning and keeps
# the cell safe to re-run. Each lambda sees the columns produced before it.
df_test_ratings = df_test_ratings.assign(
    # Leaves the column unchanged when it is already datetime64; otherwise the
    # values are interpreted as seconds since the Unix epoch.
    timestamp=lambda d: pd.to_datetime(d["timestamp"], unit="s"),
    # Monday=0 ... Sunday=6
    day_of_week=lambda d: d["timestamp"].dt.dayofweek,
    # 1 for ratings given on Saturday (5) or Sunday (6), 0 otherwise
    is_weekend=lambda d: d["day_of_week"].isin([5, 6]).astype(int),
)

In [19]:
# Attach each rated movie's genre string to the test ratings; the left join
# keeps every rating even when its movieId is absent from df_movies.
df_merged_Te = pd.merge(df_test_ratings, df_movies, how="left", on="movieId")

# Ratings without a genre entry get an empty string instead of NaN.
df_merged_Te["genres"] = df_merged_Te["genres"].fillna("")
# Comma-separated genre string -> list of genre names ("" becomes [""]).
df_merged_Te["genres_list"] = df_merged_Te["genres"].str.split(",")


In [20]:
# Per-user base statistics for the test users: number of ratings, mean rating,
# and a binary weekend_watcher flag that is 1 when more than 50% of the user's
# ratings were given on a weekend.
df_user_base_Te = (
    df_merged_Te
    .groupby("userId")
    .agg(
        num_rating=("rating", "count"),
        avg_rating=("rating", "mean"),
        weekend_count=("is_weekend", "sum"),
    )
    .reset_index()
    .assign(
        weekend_watcher=lambda u: (
            (u["weekend_count"] / u["num_rating"]) > 0.5
        ).astype(int)
    )
    # The helper count is only needed to derive the flag.
    .drop(columns=["weekend_count"])
)


In [21]:
# Bucket users by their mean rating into [0, 3], (3, 4] and (4, 5]
# (intervals are right-closed; include_lowest also admits exactly 0).
labels = ["negative", "neutral", "positive"]
bins = [0, 3, 4, 5]

df_user_base_Te = df_user_base_Te.assign(
    type_of_viewer=pd.cut(
        df_user_base_Te["avg_rating"], bins=bins, labels=labels, include_lowest=True
    )
)

# Number of test users per viewer type.
group_counts_Te = df_user_base_Te["type_of_viewer"].value_counts()

In [22]:
# One row per (rating, genre): unpack the genre lists, trim whitespace so that
# e.g. "Action" and " Action" do not become two separate columns, and discard
# rows whose genre is an empty string.
df_exploded_Te = (
    df_merged_Te
    .explode("genres_list")
    .assign(genres_list=lambda d: d["genres_list"].str.strip())
    .loc[lambda d: d["genres_list"] != ""]
)

# Mean rating of every user within every genre (long format).
df_genre_user_Te = (
    df_exploded_Te
    .groupby(["userId", "genres_list"])["rating"]
    .mean()
    .reset_index()
    .rename(columns={"genres_list": "genre", "rating": "avg_rating_genre"})
)

# Wide format: one row per user, one "genre_<name>" column per genre; genres
# the user never rated stay NaN.
df_genre_pivot_Te = (
    df_genre_user_Te
    .pivot(index="userId", columns="genre", values="avg_rating_genre")
    .add_prefix("genre_")
    .rename_axis(None, axis="columns")
    .reset_index()
)
df_genre_pivot_Te.head()


Unnamed: 0,userId,genre_(no genres listed),genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,...,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western
0,1,,4.133333,3.333333,,,3.339623,4.285714,3.0,3.61,...,4.0,,4.272727,3.487805,,4.0,,3.666667,3.555556,3.0
1,2,,4.0,4.142857,4.875,,4.130435,4.0,,4.615385,...,,,4.75,4.823529,,,,4.3,3.0,
2,3,,3.583333,3.605263,3.961538,,3.186275,3.346154,,4.105263,...,4.25,,3.875,3.637931,,3.441176,,3.307692,4.0,3.75
3,4,,2.454545,3.0,,,3.0,2.166667,,2.222222,...,,,2.5,2.0,,2.75,,2.142857,2.5,
4,5,,3.666667,3.333333,3.666667,,3.0,2.625,,3.285714,...,,,3.333333,3.25,,3.2,,3.466667,4.0,3.0


In [23]:
# Combine the per-user base statistics with the per-genre mean ratings; the
# left join keeps every user present in the base table.
df_user_features_Te = pd.merge(
    df_user_base_Te,
    df_genre_pivot_Te,
    how="left",
    on="userId",
)

# Store the test user features without the row index.
df_user_features_Te.to_csv("user_features_test.csv", index=False)
