In [32]:
import sys
import importlib.util


# Colab-only: mount Google Drive at /content/drive so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Put the project folder at the front of sys.path (only once, so re-running
# this cell does not add duplicate entries).
PROJECT_ROOT = "/content/drive/MyDrive/Projects/GitHub/ClusterNetflix"
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Load config.py from its explicit file path and bind it to the name `config`:
# build a module spec for the file, create an empty module object from that
# spec, then execute the file's code inside that module.
config_path = f"{PROJECT_ROOT}/config.py"
spec = importlib.util.spec_from_file_location("config", config_path)
config = importlib.util.module_from_spec(spec)
spec.loader.exec_module(config)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

In [47]:
# Read the space-separated, header-less ratings file; every 0 entry is
# converted to NaN so it is treated as a missing value downstream.
file_path = f"{config.DATA_PATH}/netflix_incomplete.txt"
df = pd.read_csv(file_path, sep=' ', header=None).replace(0, np.nan)

# Rename the positional columns to c_0 ... c_{d-1}.
n, d = df.shape
df.columns = [f"c_{i}" for i in range(d)]

# Independent copy of the loaded frame (the commented-out main(df_copy) call uses it).
df_copy = df.copy()

In [None]:
# Final step: run the whole pipeline. Execute this cell only after every function-definition cell below has been run.
# main(df_copy)

In [48]:
# df_ = df_copy
def main(df_):
    """Impute ``df_`` with cluster means and compare against a column-mean baseline.

    The pipeline scales a temporarily imputed copy of the data, fits a GMM,
    fills each missing entry from its cluster, and scores the result against
    the complete data returned by ``get_true_values``. The same scores are
    computed for a plain column-mean fill of ``df_``.

    Args:
        df_: Incomplete feature table (NaN marks a missing entry). It is not
            modified: all labelling and filling happens on an internal copy,
            so calling ``main`` repeatedly on the same frame is safe.

    Returns:
        A DataFrame with columns ``"Model"`` (metric names), ``"Cluster_impt"``
        and ``"Mean Baseline"``, one row per metric (RMSE, MAE, R²).
    """
    # Work on a private copy: the helpers below add a "cluster" column and
    # fill NaNs in place. Mutating the caller's frame would make a second
    # call cluster on already-imputed data that also contains the labels.
    df_work = df_.copy()

    X_scaled = apply_data_modifications(df_work)

    # Model-selection sweep that was used to pick k_chosen:
    # k_range = range(1, 15)
    # aics, bics = get_aic_bic(k_range, X_scaled)
    # plot_aic_bic(k_range, aics, bics)

    k_chosen = 9 # 11 is based on aic, 9 is based on bic
    add_cluster_labels(df_work, k_chosen, X_scaled)
    impute_cluster_mean(df_work)

    df_true = get_true_values()
    # Drop the label column by name so only feature columns are scored.
    rmse, mae, r2 = get_evaluation_metrics(df_true, df_=df_work.drop(columns="cluster"))

    # Baseline: column-mean fill of the *argument* (still untouched), not of
    # a notebook-level global that may hold different data.
    df_mean = df_.fillna(df_.mean())
    rmse_b, mae_b, r2_b = get_evaluation_metrics(df_true, df_=df_mean)

    table = {
        "Model": ["RMSE", "MAE", "R² Score"],
        "Cluster_impt": [rmse, mae, r2],
        "Mean Baseline": [rmse_b, mae_b, r2_b]
    }

    return pd.DataFrame(table)


In [38]:
def apply_data_modifications(df_):
    """Return a fully populated, standardized feature matrix built from ``df_``.

    Missing entries are first replaced with a placeholder (the
    ``most_frequent`` strategy of ``SimpleImputer``) so the data can be
    clustered; the filled matrix is then passed through ``StandardScaler``.

    Args:
        df_: Feature table, possibly containing NaNs.

    Returns:
        The output of ``StandardScaler().fit_transform`` on the filled data.
    """
    # Stage 1: temporary mode-based fill.
    mode_filled = SimpleImputer(strategy='most_frequent').fit_transform(df_)

    # Stage 2: standardize the filled matrix.
    return StandardScaler().fit_transform(mode_filled)


In [39]:
def get_aic_bic(k_range, X_scaled):
    """Fit one diagonal-covariance GMM per candidate K and collect AIC / BIC.

    Args:
        k_range: Iterable of component counts to try, e.g. ``range(1, 15)``.
        X_scaled: Feature matrix each mixture is fitted to and scored on.

    Returns:
        ``(aics, bics)``: two lists with one score per entry of ``k_range``,
        in the same order.
    """
    def _information_criteria(n_components):
        # Identical settings for every K so the scores are comparable.
        mixture = GaussianMixture(
            n_components=n_components,
            n_init=10,
            covariance_type='diag',
            random_state=42,
        )
        mixture.fit(X_scaled)
        return mixture.bic(X_scaled), mixture.aic(X_scaled)

    aics, bics = [], []
    for n_components in k_range:
        bic_score, aic_score = _information_criteria(n_components)
        bics.append(bic_score)
        aics.append(aic_score)

    return aics, bics


def plot_aic_bic(k_range, aics, bics):
    """Plot BIC and AIC against the number of clusters and show the figure.

    Args:
        k_range: Candidate component counts (x axis).
        aics: AIC score for each entry of ``k_range``.
        bics: BIC score for each entry of ``k_range``.
    """
    # BIC is drawn first, then AIC, so the legend lists them in that order.
    for scores, curve_name in ((bics, 'BIC'), (aics, 'AIC')):
        plt.plot(k_range, scores, label=curve_name)

    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Score')
    plt.title('Model Selection for GMM')
    plt.legend()
    plt.show()


In [40]:
def add_cluster_labels(df_, k_chosen, X_scaled):
    """Fit a GMM on ``X_scaled`` and store its labels in ``df_['cluster']``.

    Args:
        df_: Frame that receives the new ``'cluster'`` column (modified in place).
        k_chosen: Number of mixture components.
        X_scaled: Feature matrix the mixture is fitted on; its rows are
            expected to line up with the rows of ``df_``.

    Returns:
        None. The result is the ``'cluster'`` column written to ``df_``.
    """
    mixture = GaussianMixture(
        n_components=k_chosen,
        n_init=10,
        covariance_type="diag",
        random_state=42,
    )

    # fit_predict both fits the mixture and returns one label per row.
    df_['cluster'] = mixture.fit_predict(X_scaled)

In [41]:
def impute_cluster_mean(df_copy):
    """Fill missing feature values in place with the rounded mean of their cluster.

    For every column other than ``"cluster"``, each missing entry is replaced
    by ``int(round(mean))`` of the values observed in the same cluster. When
    a cluster has no observed value for that column, the mean of all observed
    values in the column is used instead.

    Args:
        df_copy: Frame with a ``"cluster"`` label column plus feature columns
            containing NaNs. Modified in place.

    Returns:
        None.

    Raises:
        KeyError: If ``df_copy`` has no ``"cluster"`` column.
        ValueError: If a feature column has missing entries but no observed
            value at all, so no mean exists to fill it with. Raised before
            anything is modified.
    """
    cluster_ids = df_copy["cluster"].unique()

    # Select features by name, not position, so the label column is excluded
    # wherever it sits in the frame.
    feature_cols = [col for col in df_copy.columns if col != "cluster"]

    # Validate up front so a failure never leaves the frame half-imputed.
    unfillable = [
        col for col in feature_cols
        if df_copy[col].isna().any() and not df_copy[col].notna().any()
    ]
    if unfillable:
        raise ValueError(
            f"Cannot impute columns with no observed values: {unfillable}"
        )

    # Membership masks depend only on the labels; build them once.
    cluster_masks = {cid: df_copy["cluster"] == cid for cid in cluster_ids}

    for feature in feature_cols:
        missing = df_copy[feature].isna()
        if not missing.any():
            continue

        # Fallback mean is taken before any fill, so it reflects observed
        # values only and does not depend on the order clusters are visited.
        overall_mean = df_copy[feature].mean()

        for cluster_id in cluster_ids:
            in_cluster = cluster_masks[cluster_id]
            to_fill = missing & in_cluster
            if not to_fill.any():
                continue

            cluster_mean = df_copy.loc[in_cluster & ~missing, feature].mean()
            source_mean = cluster_mean if pd.notna(cluster_mean) else overall_mean
            df_copy.loc[to_fill, feature] = int(round(source_mean))


In [42]:
def get_true_values():
    """Load the complete table used as the reference for evaluation.

    Reads ``netflix_complete.txt`` from ``config.DATA_PATH`` as a
    space-separated file without a header row.

    Returns:
        The parsed DataFrame with default integer column labels.
    """
    return pd.read_csv(f"{config.DATA_PATH}/netflix_complete.txt", sep=' ', header=None)


In [43]:
def get_evaluation_metrics(df_true, df_):
    """Score ``df_`` against ``df_true`` with RMSE, MAE and R².

    Both frames are compared through their underlying value arrays, so
    column labels are ignored and only position matters.

    Args:
        df_true: Frame holding the reference values.
        df_: Frame holding the predicted / imputed values.

    Returns:
        A ``(rmse, mae, r2)`` tuple.
    """
    y_true = df_true.values
    y_pred = df_.values

    # Evaluated in the order RMSE, MAE, R².
    scorers = (root_mean_squared_error, mean_absolute_error, r2_score)
    rmse, mae, r2 = (scorer(y_true, y_pred) for scorer in scorers)

    return rmse, mae, r2