In [1]:
# --- install (only if needed) ---
# pip install pandas numpy scikit-learn scipy tqdm

# --- imports ---
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error

print("Libraries imported successfully.")


Libraries imported successfully.


In [3]:
# Input files and the directory that receives every generated artefact.
USERS_CSV = "DataSets/Users.csv"
RATINGS_CSV = "DataSets/Ratings.csv"
OUTPUT_DIR = "DataSets/outputs"

# Create the output directory up front so later cells can write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Generated files: LIBSVM exports for both user groups and the final predictions.
TRAIN_LIBSVM, TEST_LIBSVM, PRED_CSV = (
    os.path.join(OUTPUT_DIR, file_name)
    for file_name in (
        "users_with_age.libsvm",
        "users_without_age.libsvm",
        "age_predictions.csv",
    )
)

print(
    "Paths configured:",
    f"Users: {USERS_CSV}",
    f"Ratings: {RATINGS_CSV}",
    f"Output Directory: {OUTPUT_DIR}",
    sep="\n",
)


Paths configured:
Users: DataSets/Users.csv
Ratings: DataSets/Ratings.csv
Output Directory: DataSets/outputs


In [5]:
# The file is read with the default comma separator, so each record arrives as
# one "id;age" string in the first column; it is split on ";" afterwards.
users_df_raw = pd.read_csv(USERS_CSV)
colname = users_df_raw.columns[0]

users_df = users_df_raw[colname].str.split(";", expand=True)
users_df.columns = ["User-ID", "Age"]
users_df = users_df.assign(
    **{
        # IDs are kept as stripped strings.
        "User-ID": lambda frame: frame["User-ID"].astype(str).str.strip(),
        # Anything that is not a number becomes NaN, i.e. "age unknown".
        "Age": lambda frame: pd.to_numeric(frame["Age"], errors="coerce"),
    }
)

# Partition the users by whether a usable age is present.
age_known = users_df["Age"].notna()
users_with_age = users_df.loc[age_known].copy()
users_without_age = users_df.loc[~age_known].copy()

print(f"Users loaded: {len(users_df)} total")
print(f"Users with Age: {len(users_with_age)} | Users without Age: {len(users_without_age)}")


Users loaded: 278859 total
Users with Age: 167151 | Users without Age: 111708


In [7]:
ratings = pd.read_csv(RATINGS_CSV, sep=";")

# Key columns are normalised to stripped strings.
for key_col in ("User-ID", "ISBN"):
    ratings[key_col] = ratings[key_col].astype(str).str.strip()

# Ratings that cannot be parsed as numbers are stored as 0.
parsed_rating = pd.to_numeric(ratings["Rating"], errors="coerce")
ratings["Rating"] = parsed_rating.fillna(0).astype(int)

# Attach the age of each rating's author; users without a known age get NaN.
known_ages = users_with_age["Age"].astype(int)
age_map = dict(zip(users_with_age["User-ID"], known_ages))
ratings["Age"] = ratings["User-ID"].map(age_map)

# Ratings by users whose age is known.
age_is_known = ratings["Age"].notna()
ratings_with_age = ratings.loc[age_is_known].copy()

# Ratings by users listed in the "without age" user table.
by_ageless_user = ratings["User-ID"].isin(users_without_age["User-ID"])
ratings_without_age = ratings.loc[by_ageless_user].copy()

print("Ratings loaded and merged with age data.")
print(f"Ratings with age: {ratings_with_age.shape}")
print(f"Ratings without age: {ratings_without_age.shape}")


Ratings loaded and merged with age data.
Ratings with age: (836930, 4)
Ratings without age: (312850, 4)


In [9]:
def _build_user_item_matrix(ratings_df, isbn_to_col):
    """Turn rating rows into a sparse user x book matrix.

    Parameters
    ----------
    ratings_df : DataFrame with "User-ID", "ISBN" and "Rating" columns.
    isbn_to_col : dict mapping every ISBN in ``ratings_df`` to its column index.

    Returns
    -------
    users : array of unique user IDs, in order of first appearance (row order).
    user_to_row : dict mapping each user ID to its row index.
    matrix : CSR matrix of shape (len(users), len(isbn_to_col)).
    """
    users = ratings_df["User-ID"].unique()
    user_to_row = {user: row for row, user in enumerate(users)}

    row_idx = ratings_df["User-ID"].map(user_to_row).values
    col_idx = ratings_df["ISBN"].map(isbn_to_col.get).values
    values = ratings_df["Rating"].values

    # NOTE: if the same (user, ISBN) pair occurs more than once, the COO -> CSR
    # conversion adds the ratings together.
    matrix = coo_matrix(
        (values, (row_idx, col_idx)),
        shape=(len(users), len(isbn_to_col)),
    ).tocsr()
    return users, user_to_row, matrix


# One shared column space (every ISBN in `ratings`) so that X_train and X_test
# have identical feature columns.
all_isbns = pd.Index(ratings["ISBN"].unique())
isbn_to_col = {isbn: i for i, isbn in enumerate(all_isbns)}

train_users, u_idx_train, X_train = _build_user_item_matrix(ratings_with_age, isbn_to_col)
# One age per training user, aligned with the row order of X_train.
y_train = ratings_with_age.groupby("User-ID")["Age"].first().reindex(train_users).astype(int).values

test_users, u_idx_test, X_test = _build_user_item_matrix(ratings_without_age, isbn_to_col)

assert X_train.shape[0] == y_train.shape[0], "X_train rows and y_train labels are misaligned"

print("Sparse matrices created:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}")


Sparse matrices created:
X_train: (61815, 340553), y_train: (61815,), X_test: (43468, 340553)


In [11]:
def to_libsvm(X_csr: csr_matrix, y_or_neg1, out_path: str):
    """Write a sparse feature matrix to ``out_path`` in LIBSVM text format.

    Every matrix row becomes one line ``<label> <index>:<value> ...`` where the
    feature indices are 1-based and ascending, the label is written as an
    integer and the values as floats.

    Parameters
    ----------
    X_csr : scipy sparse matrix
        Feature matrix; converted to CSR if it is not already. It is never
        modified in place.
    y_or_neg1 : scalar or 1-D sequence
        Either one label per row (ndarray, list, pandas Series, ...) or a
        single scalar such as ``-1`` that is used as the label of every row.
    out_path : str
        Destination file; overwritten if it exists.

    Raises
    ------
    ValueError
        If a label sequence does not contain exactly one label per row.
    """
    X_csr = X_csr.tocsr()
    n_rows = X_csr.shape[0]

    # Accept any array-like of labels, not only np.ndarray; a scalar is broadcast.
    if np.ndim(y_or_neg1) == 0:
        labels = np.full(n_rows, y_or_neg1)
    else:
        labels = np.asarray(y_or_neg1).ravel()
        if labels.shape[0] != n_rows:
            # Fail before touching the output file instead of half-writing it.
            raise ValueError(
                f"Expected {n_rows} labels (one per row), got {labels.shape[0]}"
            )

    # LIBSVM expects ascending feature indices; sort on a copy if necessary.
    if not X_csr.has_sorted_indices:
        X_csr = X_csr.sorted_indices()

    indptr, indices, data = X_csr.indptr, X_csr.indices, X_csr.data
    with open(out_path, "w", encoding="utf-8") as f:
        for i in range(n_rows):
            start, end = indptr[i], indptr[i + 1]
            fields = [str(int(labels[i]))]
            # +1 converts the 0-based column index to LIBSVM's 1-based index.
            fields.extend(
                f"{j}:{float(v)}"
                for j, v in zip(indices[start:end] + 1, data[start:end])
            )
            f.write(" ".join(fields) + "\n")

# Training rows carry the real ages; rows for users without an age get the
# placeholder label -1.
to_libsvm(X_csr=X_train, y_or_neg1=y_train, out_path=TRAIN_LIBSVM)
to_libsvm(X_csr=X_test, y_or_neg1=-1, out_path=TEST_LIBSVM)
print(f"LIBSVM files written successfully:\n{TRAIN_LIBSVM}\n{TEST_LIBSVM}")


LIBSVM files written successfully:
DataSets/outputs/users_with_age.libsvm
DataSets/outputs/users_without_age.libsvm


In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# The scorer returns the negated RMSE; the sign is flipped back when the
# fold scores are aggregated below.
rmse_scorer = make_scorer(lambda y_true, y_pred: -rmse(y_true, y_pred))

# with_mean=False: scale only, no centering.
scaler = StandardScaler(with_mean=False).fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# Candidate regressors, keyed by display name.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=5.0, random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

cv_results = {}
for name, model in models.items():
    fold_scores = cross_val_score(
        model,
        X_train_sc,
        y_train,
        cv=cv,
        scoring=rmse_scorer,
        n_jobs=-1,
    )
    # Mean over folds, negated so the stored value is a positive RMSE.
    cv_results[name] = -fold_scores.mean()

print("Cross-validation complete. RMSE scores:")
for model_name, mean_rmse in cv_results.items():
    print(f"  {model_name}: {mean_rmse:.3f}")

# Lowest mean RMSE wins; ties resolve to the model listed first.
best_model_name, _ = min(cv_results.items(), key=lambda item: item[1])
best_model = models[best_model_name]
print(f"*********Best Model Selected***********: {best_model_name}")


In [None]:
# Bounds applied to the predicted ages; predictions outside them are clipped.
MIN_AGE, MAX_AGE = 5, 100

# Refit the selected model on the full (scaled) training set.
print(f"🚀 Training the best model ({best_model_name})...")
best_model.fit(X_train_sc, y_train)
print("Training completed.")

# Predict ages for the users without one and clip them to [MIN_AGE, MAX_AGE].
pred_test = best_model.predict(X_test_sc)
pred_test = np.clip(pred_test, MIN_AGE, MAX_AGE)

# Rows of X_test follow the order of test_users, so predictions line up by position.
pred_df = pd.DataFrame({
    "User-ID": test_users,
    "Estimated_Age": np.round(pred_test, 2)
})

pred_df.to_csv(PRED_CSV, index=False)
print(f"Predictions saved to {PRED_CSV}")
print("Sample predictions:")
print(pred_df.head())


In [None]:
# Positional lookup: matrix column index -> ISBN.
isbn_by_col = np.array(all_isbns)
test_csr = X_test.tocsr()

# For every test user, collect the stored entries of their row as "ISBN:rating".
book_lists = []
row_ptr, col_idx, stored = test_csr.indptr, test_csr.indices, test_csr.data
for i in tqdm(range(test_csr.shape[0]), desc="Collecting books per user"):
    row = slice(row_ptr[i], row_ptr[i + 1])
    entries = (
        f"{isbn_by_col[c]}:{int(r)}"
        for c, r in zip(col_idx[row], stored[row])
    )
    book_lists.append(",".join(entries))

# Same rows as pred_df, plus the comma-separated list of rated books.
pred_books_path = os.path.join(OUTPUT_DIR, "age_predictions_with_books.csv")
pred_books_df = pred_df.copy()
pred_books_df["Books_Rated"] = book_lists
pred_books_df.to_csv(pred_books_path, index=False)

print(f"File saved with book info: {pred_books_path}")
print(pred_books_df.head())
