In [None]:
!pip install thefuzz[speedup]


In [None]:
!pip install numpy==1.26.4


In [None]:
!pip install scikit-surprise


In [2]:
import pandas as pd



In [None]:
!unzip /content/ml-latest-small.zip

In [3]:
ratings = pd.read_csv("/content/ml-latest-small/ratings.csv")
movies = pd.read_csv("/content/ml-latest-small/movies.csv")
links = pd.read_csv("/content/ml-latest-small/links.csv")
tags = pd.read_csv("/content/ml-latest-small/tags.csv")

In [4]:
#data cleaning
movies['genres'] = movies['genres'].str.replace('|',' ')

In [5]:

tags['tag']=tags['tag'].astype(str)
movie_tags = tags.groupby('movieId')['tag'].apply(lambda x:" ".join(x)).reset_index()

In [6]:
movies = movies.merge(movie_tags, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')

In [7]:
movies.columns = movies.columns.str.replace(' ','')

In [8]:
movies['content'] = movies['genres']+" "+movies['tag']

In [9]:
movies.head()

Unnamed: 0,movieId,title,genres,tag,content
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun,Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game,Adventure Children Fantasy fantasy magic board...
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old,Comedy Romance moldy old
3,4,Waiting to Exhale (1995),Comedy Drama Romance,,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,Comedy pregnancy remake


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['content'])

In [11]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)

In [12]:
movies['clean_title'] = movies['title'].str.lower().str.strip()


In [13]:
from thefuzz import process

def match_title(user_title):
    user_title = user_title.lower().strip()

    choices = movies['clean_title'].tolist()
    match, score = process.extractOne(user_title, choices)

    if score<60:
        return None

    return movies.loc[movies['clean_title'] == match, 'title'].iloc[0]


In [14]:
def get_content_scores(title):
    matched_title = match_title(title)
    if matched_title is None:
        return None

    idx = movies[movies['title'] == matched_title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))

    # return all similarity scores
    results = []
    for movie_index, score in sim_scores:
        movie_id = movies.iloc[movie_index]['movieId']
        results.append((movie_id, score))

    return results

In [15]:
get_content_scores('tangled')

[(1, 0.2188563713015968),
 (2, 0.14676735040937622),
 (3, 0.08561742775190856),
 (4, 0.3068424252559366),
 (5, 0.0337698485368292),
 (6, 0.0),
 (7, 0.14077286002725056),
 (8, 0.28743172115252286),
 (9, 0.0),
 (10, 0.0),
 (11, 0.10155870981520966),
 (12, 0.10089958083139666),
 (13, 0.4625655262553989),
 (14, 0.0),
 (15, 0.1623142219961944),
 (16, 0.0),
 (17, 0.063561750709783),
 (18, 0.19804616111291484),
 (19, 0.19804616111291484),
 (20, 0.06975766627350208),
 (21, 0.039095867828468786),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.088517835031905),
 (26, 0.0),
 (27, 0.3349009291720822),
 (28, 0.05462739113285821),
 (29, 0.11190416588901447),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (34, 0.056724394745227585),
 (36, 0.0),
 (38, 0.1904250324932225),
 (39, 0.043323680389173336),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.23496920423630674),
 (45, 0.045881937121859236),
 (46, 0.08994263660858764),
 (47, 0.0),
 (48, 0.7291225727259073),
 (49, 0.23963650754419386),
 (50, 0.0),
 (52,

## Collaborative Filtering

In [16]:
ratings = ratings.dropna(subset=["userId", "movieId", "rating"])

In [17]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
user_enc = LabelEncoder()
item_enc = LabelEncoder()

ratings['userId_idx'] = user_enc.fit_transform(ratings['userId'])
ratings['movieId_idx'] = item_enc.fit_transform(ratings['movieId'])

In [18]:
num_users = ratings['userId_idx'].nunique()
num_items = ratings['movieId_idx'].nunique()

In [19]:
print("sample mapped rows:", ratings[["userId","movieId","userId_idx","movieId_idx","rating"]].head(5).to_dict(orient="records"))

sample mapped rows: [{'userId': 1, 'movieId': 1, 'userId_idx': 0, 'movieId_idx': 0, 'rating': 4.0}, {'userId': 1, 'movieId': 3, 'userId_idx': 0, 'movieId_idx': 2, 'rating': 4.0}, {'userId': 1, 'movieId': 6, 'userId_idx': 0, 'movieId_idx': 5, 'rating': 4.0}, {'userId': 1, 'movieId': 47, 'userId_idx': 0, 'movieId_idx': 43, 'rating': 5.0}, {'userId': 1, 'movieId': 50, 'userId_idx': 0, 'movieId_idx': 46, 'rating': 5.0}]


In [20]:
train_df, test_df = train_test_split(ratings[["userId_idx","movieId_idx","rating","userId","movieId"]], test_size=0.2,random_state=42)

In [21]:
print("train rows:", len(train_df), "test rows:", len(test_df))

train rows: 80668 test rows: 20168


In [22]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Dense, Concatenate, Input
from tensorflow.keras.models import Model

EMBED_DIM = 32


#Define Input
user_input = Input(shape=(1,), dtype="int32", name="user")
movie_input = Input(shape=(1,), dtype="int32", name="item")

#Embedding
mlp_user_embedding = Embedding(input_dim=num_users, output_dim=EMBED_DIM)(user_input)
mlp_user_embedding = Flatten()(mlp_user_embedding)

mlp_movie_embedding = Embedding(input_dim=num_items, output_dim=EMBED_DIM)(movie_input)
mlp_movie_embedding = Flatten()(mlp_movie_embedding)

#Combine both embeddings
combined = Concatenate()([mlp_user_embedding,mlp_movie_embedding])


#Neural Network
mlp_vector = Dense(64, activation="relu")(combined)
mlp_vector = Dense(32, activation ="relu")(mlp_vector)
mlp_vector = Dense(16, activation="relu")(mlp_vector)

output = Dense(1, activation="sigmoid")(mlp_vector)

model = Model(inputs = [user_input,movie_input], outputs = output)

#Compiling the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()



In [23]:
ratings["label"] = ratings['rating'].apply(lambda x:1 if x>=3.5 else 0)


In [24]:
import numpy as np
from tensorflow.keras.layers import Multiply

EMBED_DIM_GMF = 16



#gmf enbedding
gmf_user_emb = Embedding(num_users, EMBED_DIM_GMF)(user_input)
gmf_item_emb = Embedding(num_items,EMBED_DIM_GMF)(movie_input)

gmf_user_vec = Flatten()(gmf_user_emb)
gmf_item_vec = Flatten()(gmf_item_emb)

gmf_vector = Multiply()([gmf_user_vec,gmf_item_vec])


In [25]:
#NeuMF (fusion)

neu_vector = Concatenate()([gmf_vector, mlp_vector])
output = Dense(1, activation="sigmoid")(neu_vector)

neu_model = Model(inputs = [user_input,movie_input], outputs = output)

neu_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

neu_model.summary()


In [26]:
user_ids = ratings['userId_idx'].values
item_ids = ratings['movieId_idx'].values
labels = ratings['label'].values


In [27]:
history = neu_model.fit([user_ids,item_ids],labels,batch_size=256,epochs=5,validation_split=0.1)

Epoch 1/5
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.6660 - loss: 0.6108 - val_accuracy: 0.6239 - val_loss: 0.6453
Epoch 2/5
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.7640 - loss: 0.4933 - val_accuracy: 0.6280 - val_loss: 0.6424
Epoch 3/5
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.7860 - loss: 0.4523 - val_accuracy: 0.6225 - val_loss: 0.6613
Epoch 4/5
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.8217 - loss: 0.3961 - val_accuracy: 0.6213 - val_loss: 0.6728
Epoch 5/5
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.8654 - loss: 0.3269 - val_accuracy: 0.6289 - val_loss: 0.6836


In [29]:
neu_model.save("neu_model.keras")
import pickle
pickle.dump(user_enc, open("user_enc.pkl", "wb"))
pickle.dump(item_enc, open("movie_enc.pkl", "wb"))


In [30]:

from tensorflow.keras.models import load_model
import pickle

final_model = load_model("neu_model.keras")
user_enc = pickle.load(open("user_enc.pkl", "rb"))
movie_enc = pickle.load(open("movie_enc.pkl", "rb"))


  saveable.load_own_variables(weights_store.get(inner_path))


In [31]:
def predict_user_movie_rating(user_id, movie_id):
    # Convert original IDs → encoded IDs used in training
    u = user_enc.transform([user_id])[0]
    m = movie_enc.transform([movie_id])[0]

    # Reshape because Keras expects 2D input
    u = u.reshape(1, 1)
    m = m.reshape(1, 1)

    # Make prediction
    pred = final_model.predict([u, m], verbose=0)[0][0]
    return float(pred)

In [32]:
print(predict_user_movie_rating(1, 50))


0.9797450304031372


In [33]:
import numpy as np

def predict_all_movies_for_user(user_id, movies):
    # Only keep movies that were present during training
    valid_movie_ids = movie_enc.classes_.astype(int)
    movies = movies[movies['movieId'].isin(valid_movie_ids)]

    movie_ids = movies['movieId'].values

    # Encode user (one scalar repeated)
    u = user_enc.transform([user_id])[0]
    users = np.full(len(movie_ids), u)

    # Encode movies
    m = movie_enc.transform(movie_ids)

    # Reshape for Keras
    users = users.reshape(-1, 1)
    m = m.reshape(-1, 1)

    # Predict
    preds = final_model.predict([users, m], verbose=0).flatten()

    return movie_ids, preds, movies


In [34]:
def recommend_movies_tf(user_id, movies, top_k=10):
    movie_ids, preds, movies = predict_all_movies_for_user(user_id, movies)

    # Top-k
    top_idx = preds.argsort()[-top_k:][::-1]

    recommendations = []
    for idx in top_idx:
        mid = movie_ids[idx]
        title = movies.loc[movies['movieId'] == mid, 'title'].values[0]
        score = preds[idx]
        recommendations.append((mid, title, float(score)))

    return recommendations


In [35]:
recommendations = recommend_movies_tf(1, movies, top_k=10)
for mid, title, score in recommendations:
    print(f"{mid} | {title} | {score:.4f}")


33660 | Cinderella Man (2005) | 0.9995
55442 | Persepolis (2007) | 0.9994
44555 | Lives of Others, The (Das leben der Anderen) (2006) | 0.9992
7156 | Fog of War: Eleven Lessons from the Life of Robert S. McNamara, The (2003) | 0.9990
27846 | Corporation, The (2003) | 0.9990
951 | His Girl Friday (1940) | 0.9989
1950 | In the Heat of the Night (1967) | 0.9989
2176 | Rope (1948) | 0.9988
96829 | Hunt, The (Jagten) (2012) | 0.9988
1446 | Kolya (Kolja) (1996) | 0.9987


In [None]:
#hybrid recommendation

In [36]:
def hybrid_recommend(user_id, movie_title, top_k=10, alpha=0.7):
    content_scores = get_content_scores(movie_title)
    if content_scores is None:
        return None

    results = []

    for movie_id, content_sim in content_scores:
        # Skip the movie itself
        if content_sim == 1.0:
            continue

        # Only score movies learned by NeuMF
        if movie_id not in movie_enc.classes_:
            continue

        # NeuMF score
        try:
            cf_score = predict_user_movie_rating(user_id, movie_id)
        except:
            continue

        # Weighted hybrid
        final_score = alpha * cf_score + (1 - alpha) * content_sim

        # Get title
        row = movies[movies['movieId'] == movie_id]
        if len(row) == 0:
            continue

        title = row.iloc[0]['title']
        results.append((title, final_score))

    # Sort by hybrid score
    results = sorted(results, key=lambda x: x[1], reverse=True)

    return results[:top_k]


In [37]:
import numpy as np

def hybrid_recommend_fast(user_id, movie_title, top_k=10, alpha=0.7):
    content_scores = get_content_scores(movie_title)
    if content_scores is None:
        return None

    # Filter out movies not in NeuMF
    valid_scores = [(mid, sim) for mid, sim in content_scores if mid in movie_enc.classes_ and sim < 1.0]
    if not valid_scores:
        return None

    movie_ids, content_sims = zip(*valid_scores)
    movie_ids = np.array(movie_ids)
    content_sims = np.array(content_sims)

    # Vectorize user ID
    u = user_enc.transform([user_id])[0]
    users = np.full(len(movie_ids), u)

    # Encode movies
    m = movie_enc.transform(movie_ids)

    # Reshape for Keras
    users = users.reshape(-1, 1)
    m = m.reshape(-1, 1)

    # Predict all at once
    cf_scores = final_model.predict([users, m], verbose=0).flatten()

    # Hybrid score
    hybrid_scores = alpha * cf_scores + (1 - alpha) * content_sims

    # Build final list with titles
    results = []
    for mid, score in zip(movie_ids, hybrid_scores):
        row = movies[movies['movieId'] == mid]
        if len(row) == 0:
            continue
        results.append((row.iloc[0]['title'], float(score)))

    # Sort and pick top_k
    results = sorted(results, key=lambda x: x[1], reverse=True)[:top_k]
    return results


In [46]:
hybrid = hybrid_recommend_fast(1, "tangeled", top_k=10)
for title, score in hybrid:
    print(title, score)

Princess and the Frog, The (2009) 0.9452669037780099
Toy Story 3 (2010) 0.9359240409594903
Christmas Carol, A (2009) 0.9244358592041166
Shrek Forever After (a.k.a. Shrek: The Final Chapter) (2010) 0.9228269931537042
Beauty and the Beast: The Enchanted Christmas (1997) 0.9184545914082941
How to Train Your Dragon (2010) 0.9157177174668014
Despicable Me 2 (2013) 0.9153122790291461
Fantasia 2000 (1999) 0.914229651577074
Aladdin and the King of Thieves (1996) 0.9135341117103077
Rudolph, the Red-Nosed Reindeer (1964) 0.9073793632430062


In [47]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive
