In [42]:
# Importing Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Reads the File
movies = pd.read_csv('movies.dat', sep='::', header=None, names=['movieid', 'title', 'genres'], encoding='ISO-8859-1')

In [3]:
movies.head(10)

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
# Create a DataFrame
movies = pd.DataFrame(movies)

# Convert the Genres column into a list format
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

In [5]:
# Display the updated DataFrame
movies.head()

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [6]:
movies.shape

(3883, 3)

In [39]:
movies.isnull().sum()

movieid    0
title      0
genres     0
dtype: int64

In [7]:
ratings = pd.read_csv('ratings.dat', sep='::', header=None, names=['userid', 'movieid', 'rating', 'timestamp'], encoding='ISO-8859-1')

In [8]:
ratings.head(10)

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [40]:
ratings.isnull().sum()

userid       0
movieid      0
rating       0
timestamp    0
dtype: int64

In [9]:
users = pd.read_csv('users.dat', sep='::', header=None, names=['userid', 'gender', 'age', 'occupation', 'zipcode'], encoding='ISO-8859-1')

In [10]:
users.head(10)

Unnamed: 0,userid,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


In [11]:
# Merge ratings with movies on MovieID
ratings_movies = pd.merge(ratings, movies, on='movieid')

# Merge the resulting dataframe with users on UserID
data = pd.merge(ratings_movies, users, on='userid')

# Display the merged dataset
data.head()

Unnamed: 0,userid,movieid,rating,timestamp,title,genres,gender,age,occupation,zipcode
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama],F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]",F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),"[Musical, Romance]",F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),[Drama],F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",F,1,10,48067


In [12]:
# Converts the Gender Column into 0 and 1
labelencoder = LabelEncoder()

data['gender'] = labelencoder.fit_transform(data['gender'])

In [13]:
data.tail()

Unnamed: 0,userid,movieid,rating,timestamp,title,genres,gender,age,occupation,zipcode
1000204,4211,3791,2,965319075,Footloose (1984),[Drama],1,45,5,77662
1000205,4211,3806,3,965319138,MacKenna's Gold (1969),[Western],1,45,5,77662
1000206,4211,3840,4,965319197,Pumpkinhead (1988),[Horror],1,45,5,77662
1000207,4211,3766,2,965319138,Missing in Action (1984),"[Action, War]",1,45,5,77662
1000208,4211,3834,2,965318885,Bronco Billy (1980),"[Adventure, Drama, Romance]",1,45,5,77662


In [14]:
data.shape

(1000209, 10)

In [15]:
# Checks for Null Values
data.isnull().sum()

userid        0
movieid       0
rating        0
timestamp     0
title         0
genres        0
gender        0
age           0
occupation    0
zipcode       0
dtype: int64

In [16]:
# Drop Unnecessary Columns
data.drop(columns=['timestamp','zipcode'], inplace=True)

In [87]:
# Prepare the data for collaborative filtering (user-item matrix)
user_item_matrix = data.pivot(index='userid', columns='movieid', values='rating').fillna(0)

In [88]:
# Split the data into training and testing sets
X_train, X_test = train_test_split(user_item_matrix, test_size=0.25, random_state=42)

In [89]:
# Train the model using K-Nearest Neighbors for user-based collaborative filtering
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(X_train)

NearestNeighbors(algorithm='brute', metric='cosine')

In [90]:
# Function to recommend movies based on the nearest neighbors
def recommend_movies_knn(user_id, model, user_item_matrix, n=10):
    distances, indices = model.kneighbors([user_item_matrix.loc[user_id]], n_neighbors=n+1)
    recommendations = []

    for i in range(1, len(distances.flatten())):
        idx = user_item_matrix.index[indices.flatten()[i]]
        recommendations.append(idx)
    
    return recommendations

In [91]:
# Example: Recommend top 10 movies for User 1
recommended_users = recommend_movies_knn(user_id=1, model=model, user_item_matrix=user_item_matrix, n=10)
print("Recommended Users Similar to User 1:", recommended_users)

Recommended Users Similar to User 1: [853, 98, 4354, 479, 3745, 2601, 830, 3491, 2169, 1410]
