In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("netflix-inc/netflix-prize-data")

print("Path to dataset files:", path)

Path to dataset files: /Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2


In [3]:
files = [
    "/Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2/combined_data_2.txt",
    "/Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2/combined_data_3.txt",
    "/Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2/combined_data_1.txt",
    "/Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2/combined_data_4.txt"
]

data = []

for file in files:
    print(f"Opening file: {file}")
    with open(file) as f:
        for line in f:
            line = line.strip()
            if line.endswith(':'): 
                movie_id = int(line.replace(':', ''))
            else:  
                user_id, rating, date = line.split(',')
                data.append((movie_id, int(user_id), int(rating), date))

df = pd.DataFrame(data, columns=['movie_id', 'user_id', 'rating', 'date'])

df['date'] = pd.to_datetime(df['date'])

print(df.head())

Opening file: /Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2/combined_data_2.txt
Opening file: /Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2/combined_data_3.txt
Opening file: /Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2/combined_data_1.txt
Opening file: /Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2/combined_data_4.txt
   movie_id  user_id  rating       date
0      4500  2532865       4 2005-07-26
1      4500   573364       3 2005-06-20
2      4500  1696725       3 2004-02-27
3      4500  1253431       3 2004-03-31
4      4500  1265574       2 2003-09-01


In [4]:
pathnya = "/Users/shyraalexandria/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2/movie_titles.csv"
movie_title = pd.read_csv(
    pathnya,  
    encoding='latin1', 
    on_bad_lines='skip',  
    header=None 
)
movie_title


Unnamed: 0,0,1,2
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17429,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17430,17767,2004.0,Fidel Castro: American Experience
17431,17768,2000.0,Epoch
17432,17769,2003.0,The Company


In [None]:
movie_title = movie_title.set_axis(['movie_id', 'year', 'name'], axis=1)

movie_title



Unnamed: 0,movie_id,year,name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17429,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17430,17767,2004.0,Fidel Castro: American Experience
17431,17768,2000.0,Epoch
17432,17769,2003.0,The Company


In [None]:
movie_title.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17434 entries, 0 to 17433
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   movie_id  17434 non-null  int64  
 1   year      17427 non-null  float64
 2   name      17434 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 408.7+ KB


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100480507 entries, 0 to 100480506
Data columns (total 4 columns):
 #   Column    Dtype         
---  ------    -----         
 0   movie_id  int64         
 1   user_id   int64         
 2   rating    int64         
 3   date      datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 3.0 GB


In [48]:
numerical = []
categorical = []

for i in df.columns:
  if 'int' in str(df[i].dtype) or 'float' in str(df[i].dtype):
    numerical.append(i)
  else:
    categorical.append(i)

print(f"Categorical : {categorical}")
print(f"numerical : {numerical}")

Categorical : ['date']
numerical : ['movie_id', 'user_id', 'rating']


In [31]:
movie_title['movie_id'] = movie_title['movie_id'].astype(int)
movie_title['year'] = pd.to_numeric(movie_title['year'], errors='coerce').astype('Int64') 
movie_title

Unnamed: 0,movie_id,year,name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17429,17766,2002,Where the Wild Things Are and Other Maurice Se...
17430,17767,2004,Fidel Castro: American Experience
17431,17768,2000,Epoch
17432,17769,2003,The Company


In [15]:
mergeddf = df.merge(movie_title[['movie_id', 'year', 'name']], on='movie_id', how='left')
mergeddf

Unnamed: 0,movie_id,user_id,rating,date,year,name
0,4500,2532865,4,2005-07-26,1945,Les Dames du Bois de Boulogne
1,4500,573364,3,2005-06-20,1945,Les Dames du Bois de Boulogne
2,4500,1696725,3,2004-02-27,1945,Les Dames du Bois de Boulogne
3,4500,1253431,3,2004-03-31,1945,Les Dames du Bois de Boulogne
4,4500,1265574,2,2003-09-01,1945,Les Dames du Bois de Boulogne
...,...,...,...,...,...,...
100480502,17770,1790158,4,2005-11-01,2003,Alien Hunter
100480503,17770,1608708,3,2005-07-19,2003,Alien Hunter
100480504,17770,234275,1,2004-08-07,2003,Alien Hunter
100480505,17770,255278,4,2004-05-28,2003,Alien Hunter


In [21]:
mergeddf = mergeddf.dropna()
mergeddf.isna().describe()

Unnamed: 0,movie_id,user_id,rating,date,year,name
count,98984986,98984986,98984986,98984986,98984986,98984986
unique,1,1,1,1,1,1
top,False,False,False,False,False,False
freq,98984986,98984986,98984986,98984986,98984986,98984986


In [24]:
mergeddf.head()

Unnamed: 0,movie_id,user_id,rating,date,year,name
0,4500,2532865,4,2005-07-26,1945,Les Dames du Bois de Boulogne
1,4500,573364,3,2005-06-20,1945,Les Dames du Bois de Boulogne
2,4500,1696725,3,2004-02-27,1945,Les Dames du Bois de Boulogne
3,4500,1253431,3,2004-03-31,1945,Les Dames du Bois de Boulogne
4,4500,1265574,2,2003-09-01,1945,Les Dames du Bois de Boulogne


In [41]:
user_rating = mergeddf.groupby('user_id')['rating'].size()
mov_rating = mergeddf.groupby('movie_id')['rating'].size()

In [42]:
top5000user = user_rating.sort_values(ascending=False).iloc[:5000]
top1000mov = mov_rating.sort_values(ascending=False).iloc[:1000]

In [43]:
top5000user = top5000user.rename('top 5000 users')
top1000mov = top1000mov.rename('top 1000 movies')

In [None]:
all_rating = mergeddf.merge(top5000user, left_on='user_id', right_index=True, how='inner')
all_rating = all_rating.merge(top1000mov, left_on='movie_id', right_index=True, how='inner')
all_rating['rating'].value_counts()


rating
4    1110518
3    1040353
5     680584
2     369472
1     148942
Name: count, dtype: int64

In [45]:
user_enc = LabelEncoder()
movie_enc = LabelEncoder()

all_rating['user'] = user_enc.fit_transform(all_rating['user_id'])
all_rating['movie'] = movie_enc.fit_transform(all_rating['movie_id'])

In [46]:
# Add separate columns for year, month, and day derived from the 'date' column
all_rating['year'] = all_rating['date'].dt.year
all_rating['month'] = all_rating['date'].dt.month
all_rating['day'] = all_rating['date'].dt.day

In [63]:
# Select features and target variable
features = all_rating[['user', 'movie', 'top 5000 users', 'top 1000 movies']]
target = all_rating['rating']

# Split dataset into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create a K-Fold cross-validator with 5 folds
cross_validator = KFold(n_splits=5, shuffle=True, random_state=179)


In [64]:
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [68]:
# Define the parameter grid for k-NN
param_grid_knn = {
    'n_neighbors': [5, 9, 17],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # p=1 for Manhattan, p=2 for Euclidean distance
}

# Initialize GridSearchCV with k-NN model and cross-validation
grid_search_knn = GridSearchCV(knn_model, param_grid=param_grid_knn, cv=cross_validator, scoring='neg_mean_squared_error', verbose=1)

# Perform grid search to find the best parameters based on cross-validation
grid_search_knn.fit(X_train, y_train)

# Get the best hyperparameters and model performance
best_params = grid_search_knn.best_params_
best_score = grid_search_knn.best_score_

print(f"Best parameters: {best_params}")
print(f"Best score (negative mean squared error): {best_score}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'n_neighbors': 17, 'p': 1, 'weights': 'uniform'}
Best score (negative mean squared error): -1.0715653058012764


In [76]:
# Output the best parameters and the best cross-validation score
print("Tuned Parameters: ", grid_search_knn.best_params_)
print("Best Cross-Validation Score: ", grid_search_knn.best_score_)

# Get the best k-NN model from the grid search
best_knn_model = grid_search_knn.best_estimator_

# Fit the best model to the training data
best_knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_knn_model.predict(X_test)

# Calculate and print regression metrics using y_pred
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2 Score: {r2}")


Tuned Parameters:  {'n_neighbors': 17, 'p': 1, 'weights': 'uniform'}
Best Cross-Validation Score:  -1.0715653058012764
Mean Squared Error (MSE): 1.0695405387987844
Mean Absolute Error (MAE): 0.8312078484520644
R^2 Score: 0.06207947475839404
