# Content Based Recommendation System using Neural Networks

## Importing libraries and Loading the data

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [16]:
links_df = pd.read_csv('links.csv')
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
tags_df = pd.read_csv('tags.csv')

In [17]:
links_df.info()
links_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [18]:
movies_df.info()
movies_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [19]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [20]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## 1. Data Preprocessing
### 1.1. Movies Data:
Parse the genres column into a one-hot encoding format.
Use the tags dataset to augment movie features (e.g., count tags or create embeddings).
### 1.2. Ratings Data:
Normalize ratings if needed.
Create a training dataset by combining ratings_df with processed movie features.

In [21]:
# 1. Data Preprocessing
# One-hot encode genres
movies_df['genres'] = movies_df['genres'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(movies_df['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
movies_df = pd.concat([movies_df, genres_df], axis=1).drop('genres', axis=1)

## 2. Feature Representation
### 2.1. Movie Features: Combine one-hot encoded genres and aggregated tags.
### 2.2. User Preferences: Represent users as the average feature vector of movies they've rated highly.

In [22]:
# Merge movie features with ratings
ratings_with_features = ratings_df.merge(movies_df, on='movieId')

# Normalize ratings
ratings_with_features['rating'] = ratings_with_features['rating'] / 5.0

# Split into training and testing sets
X = ratings_with_features.drop(columns=['userId', 'movieId', 'rating', 'title'])
y = ratings_with_features['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 3. Neural Network Model
### Input: Movie features and user preferences.
### Output: Predicted rating.

### Architecture:
Fully connected layers to process input features.
Output layer with one neuron for regression

In [28]:
# 2. Neural Network Model
model = Sequential([
    Dense(128, activation='relu', input_dim=X_train.shape[1]),
    Dense(64, activation='relu'),
    Dense(1)  # Single output for regression
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 151524248387584.0000 - mae: 2987651.2500 - val_loss: 267.9388 - val_mae: 12.8322
Epoch 2/10
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 113711030272.0000 - mae: 8437.0127 - val_loss: 51447910400.0000 - val_mae: 223233.0625
Epoch 3/10
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 22614351872.0000 - mae: 11769.9697 - val_loss: 96093568.0000 - val_mae: 9647.6719
Epoch 4/10
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 446891524096.0000 - mae: 120164.7188 - val_loss: 191.1581 - val_mae: 10.8218
Epoch 5/10
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 628511408128.0000 - mae: 226355.6562 - val_loss: 1064921858048.0000 - val_mae: 1015624.7500
Epoch 6/10
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 57231

In [31]:
from tensorflow.keras.layers import Embedding, Flatten

model = Sequential([
    Dense(128, activation='relu', input_dim=X_train.shape[1]),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Regression output
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## 4. Recommendation Logic
For a given user, predict ratings for all movies.
Recommend movies with the highest predicted ratings that the user hasn’t rated yet.

In [32]:
# Save the feature columns used during training
feature_columns = X_train.columns

def recommend_movies(user_id, num_recommendations=5):
    # Get movies the user hasn't rated
    user_rated_movies = ratings_df[ratings_df['userId'] == user_id]['movieId']
    unrated_movies = movies_df[~movies_df['movieId'].isin(user_rated_movies)].copy()  # Use .copy() to avoid warnings

    # Ensure the feature columns match the training data
    movie_features = unrated_movies.drop(columns=['movieId', 'title'])
    movie_features = movie_features.reindex(columns=feature_columns, fill_value=0)

    # Predict ratings
    predicted_ratings = model.predict(movie_features)

    # Add predictions to movies safely
    unrated_movies.loc[:, 'predicted_rating'] = predicted_ratings.flatten()

    # Sort by predicted ratings
    recommendations = unrated_movies.sort_values(by='predicted_rating', ascending=False)

    return recommendations[['title', 'predicted_rating']].head(num_recommendations)


# Example: Recommend movies for user 1
recommendations = recommend_movies(user_id=1)
print(recommendations)

[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
                                 title  predicted_rating
3797  Watcher in the Woods, The (1980)          0.065843
7467                    Tangled (2010)          0.062067
3855                 Scooby-Doo (2002)          0.058466
7961                 ParaNorman (2012)          0.057789
8189        Monsters University (2013)          0.057789
