In [1]:
!pip uninstall -y numpy scikit-surprise
!pip install numpy==1.26.4
!pip install scikit-surprise


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scikit-surprise 1.1.4
Uninstalling scikit-surprise-1.1.4:
  Successfully uninstalled scikit-surprise-1.1.4
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
shap 0.50.0 requir

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [1]:
import numpy as np
print("NumPy version:", np.__version__)

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

print("✅ scikit-surprise working correctly")


NumPy version: 1.26.4
✅ scikit-surprise working correctly


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os

BASE_PATH = "/content/drive/MyDrive/Colab Notebooks/Intelligent reccom"
DATA_PATH = BASE_PATH + "/ml-100k"

print("BASE exists:", os.path.exists(BASE_PATH))
print("DATA exists:", os.path.exists(DATA_PATH))
print(os.listdir(DATA_PATH))


BASE exists: True
DATA exists: True
['allbut.pl', 'u1.base', 'u3.base', 'u.occupation', 'ub.base', 'u4.test', 'mku.sh', 'u1.test', 'u.item', 'u.data', 'u2.test', 'ua.test', 'u5.test', 'ua.base', 'u4.base', 'README', 'u.info', 'u.user', 'u.genre', 'u3.test', 'u2.base', 'u5.base', 'ub.test']


In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity


In [5]:
ratings = pd.read_csv(
    DATA_PATH + "/u.data",
    sep="\t",
    names=["user_id", "item_id", "rating", "timestamp"]
)


In [6]:
users = pd.read_csv(
    DATA_PATH + "/u.user",
    sep="|",
    names=["user_id", "age", "gender", "occupation", "zip_code"]
)


In [7]:
items = pd.read_csv(
    DATA_PATH + "/u.item",
    sep="|",
    encoding="latin-1",
    header=None
)

items = items[[0, 1, 2]]
items.columns = ["item_id", "title", "release_date"]


In [8]:
print(ratings.shape)  # (100000, 4)
print(users.shape)    # (943, 5)
print(items.shape)    # (1682, 3)


(100000, 4)
(943, 5)
(1682, 3)


In [9]:
data = ratings.merge(users, on="user_id")
data = data.merge(items, on="item_id")


In [10]:
data["clicked"] = (data["rating"] >= 4).astype(int)


In [11]:
le_gender = LabelEncoder()
le_occupation = LabelEncoder()

users["gender_enc"] = le_gender.fit_transform(users["gender"])
users["occupation_enc"] = le_occupation.fit_transform(users["occupation"])


In [12]:
data = data.sort_values("timestamp")

train_end = int(0.7 * len(data))
val_end = int(0.85 * len(data))

train_data = data.iloc[:train_end]
val_data = data.iloc[train_end:val_end]
test_data = data.iloc[val_end:]


In [13]:
all_items = set(data["item_id"].unique())
negative_samples = []

for user in train_data["user_id"].unique():
    pos_items = set(train_data[train_data["user_id"] == user]["item_id"])
    neg_items = list(all_items - pos_items)

    sampled = np.random.choice(neg_items, size=min(5, len(neg_items)), replace=False)
    for item in sampled:
        negative_samples.append([user, item, 0])

negative_df = pd.DataFrame(
    negative_samples,
    columns=["user_id", "item_id", "clicked"]
)


In [14]:
positive_df = train_data[["user_id", "item_id", "clicked"]]
ranking_data = pd.concat([positive_df, negative_df], ignore_index=True)

ranking_data = ranking_data.merge(
    users[["user_id", "age", "gender_enc", "occupation_enc"]],
    on="user_id",
    how="left"
)


In [15]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_split


In [16]:
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(
    ratings[["user_id", "item_id", "rating"]],
    reader
)

trainset, _ = surprise_split(surprise_data, test_size=0.2, random_state=42)

svd_model = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42
)

svd_model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x793390bbce90>

In [17]:
def svd_candidates(user_id, k=50):
    items_all = ratings["item_id"].unique()
    preds = [(iid, svd_model.predict(user_id, iid).est) for iid in items_all]
    preds.sort(key=lambda x: x[1], reverse=True)
    return [iid for iid, _ in preds[:k]]


In [18]:
items["title_clean"] = items["title"].str.lower()

tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = tfidf.fit_transform(items["title_clean"])


In [19]:
def content_candidates(item_id, k=20):
    idx = items.index[items["item_id"] == item_id][0]
    sims = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    top_idx = sims.argsort()[::-1][1:k+1]
    return items.iloc[top_idx]["item_id"].tolist()


In [20]:
def hybrid_candidates(user_id, last_item_id):
    return list(set(
        svd_candidates(user_id, 70) +
        content_candidates(last_item_id, 30)
    ))


In [21]:
X = ranking_data[["age", "gender_enc", "occupation_enc"]].fillna(0)
y = ranking_data["clicked"]

rank_model = LogisticRegression(max_iter=1000)
rank_model.fit(X, y)


In [22]:
def rank_items(user_id, candidate_items):
    user = users[users["user_id"] == user_id].iloc[0]

    features = [[
        user["age"],
        user["gender_enc"],
        user["occupation_enc"]
    ] for _ in candidate_items]

    scores = rank_model.predict_proba(features)[:, 1]
    ranked = sorted(zip(candidate_items, scores), key=lambda x: x[1], reverse=True)
    return [item for item, _ in ranked]


In [23]:
def recommend(user_id, last_item_id, top_n=10):
    candidates = hybrid_candidates(user_id, last_item_id)
    ranked = rank_items(user_id, candidates)
    return ranked[:top_n]


In [24]:
u = ratings.iloc[0]["user_id"]
i = ratings.iloc[0]["item_id"]

recs = recommend(u, i)
items[items["item_id"].isin(recs)][["item_id", "title"]]




Unnamed: 0,item_id,title
11,12,"Usual Suspects, The (1995)"
512,513,"Third Man, The (1949)"
514,515,"Boot, Das (1981)"
518,519,"Treasure of the Sierra Madre, The (1948)"
519,520,"Great Escape, The (1963)"
522,523,Cool Hand Luke (1967)
523,524,"Great Dictator, The (1940)"
524,525,"Big Sleep, The (1946)"
527,528,"Killing Fields, The (1984)"
1039,1040,Two if by Sea (1996)


In [25]:
def evaluate(sample_df):
    hits = 0
    for _, row in sample_df.iterrows():
        recs = recommend(row["user_id"], row["item_id"])
        if row["item_id"] in recs:
            hits += 1
    return hits / len(sample_df)

print("CTR:", evaluate(test_data.sample(500, random_state=42)))




CTR: 0.016




In [26]:
logs = []

for _, row in test_data.sample(100).iterrows():
    recs = recommend(row["user_id"], row["item_id"])
    for item in recs:
        logs.append({
            "user_id": row["user_id"],
            "item_id": item,
            "clicked": int(item == row["item_id"])
        })

logs_df = pd.DataFrame(logs)

if logs_df["clicked"].mean() < 0.1:
    print("⚠ Retraining required")
else:
    print("✅ System healthy")




⚠ Retraining required




In [27]:
import os

SAVE_DIR = "/content/drive/MyDrive/Colab Notebooks/Intelligent reccom/saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

SAVE_DIR


'/content/drive/MyDrive/Colab Notebooks/Intelligent reccom/saved_models'

In [28]:
import joblib


In [29]:
joblib.dump(rank_model, f"{SAVE_DIR}/ranking_model.pkl")


['/content/drive/MyDrive/Colab Notebooks/Intelligent reccom/saved_models/ranking_model.pkl']

In [30]:
joblib.dump(tfidf, f"{SAVE_DIR}/tfidf_vectorizer.pkl")


['/content/drive/MyDrive/Colab Notebooks/Intelligent reccom/saved_models/tfidf_vectorizer.pkl']

In [31]:
joblib.dump(le_gender, f"{SAVE_DIR}/gender_encoder.pkl")
joblib.dump(le_occupation, f"{SAVE_DIR}/occupation_encoder.pkl")


['/content/drive/MyDrive/Colab Notebooks/Intelligent reccom/saved_models/occupation_encoder.pkl']

In [32]:
items.to_csv(f"{SAVE_DIR}/items_metadata.csv", index=False)


In [35]:
# Create user-item interaction matrix
user_item = ratings.pivot_table(
    index="user_id",
    columns="item_id",
    values="rating"
).fillna(0)

user_item.shape


(943, 1682)

In [36]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=50, random_state=42)

user_embeddings = svd.fit_transform(user_item)
item_embeddings = svd.components_.T

print(user_embeddings.shape)
print(item_embeddings.shape)


(943, 50)
(1682, 50)


In [37]:
import joblib

joblib.dump(user_embeddings, f"{SAVE_DIR}/user_embeddings.pkl")
joblib.dump(item_embeddings, f"{SAVE_DIR}/item_embeddings.pkl")
joblib.dump(svd, f"{SAVE_DIR}/svd_sklearn.pkl")


['/content/drive/MyDrive/Colab Notebooks/Intelligent reccom/saved_models/svd_sklearn.pkl']