In [219]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.feature_selection import SelectFromModel
from gensim.models import Word2Vec
import math

In [220]:
train_df = pd.read_csv("movie_ratings_train.csv")
test_df = pd.read_csv("movie_ratings_test.csv")
movies_df = pd.read_csv("movies.csv")

In [221]:
# Check for duplicates in the training dataset
train_duplicates = train_df.duplicated()
print("Duplicate rows in training data:")
print(train_duplicates.value_counts())

Duplicate rows in training data:
False    90836
Name: count, dtype: int64


In [222]:
# Concatenating the datasets
full_df = pd.concat([train_df, test_df], ignore_index=True)

# View the combined data
full_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,81834,,1493846730
100832,610,87232,,1493845469
100833,610,91500,,1493845427
100834,610,91658,,1493845240


In [223]:
# Extract the movie name
movies_df['moviename'] = movies_df['title'].str.extract(r'^(.*?)(?:\s\((\d{4}(?:–\d{4})?)\))?$')[0]

# Extract the first year (only the first four digits if a range exists)
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})')[0]
movies_df.head()


Unnamed: 0,movieId,title,genres,moviename,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995


In [224]:
movies_df[movies_df["year"].isnull()]

Unnamed: 0,movieId,title,genres,moviename,year
6059,40697,Babylon 5,Sci-Fi,Babylon 5,
9031,140956,Ready Player One,Action|Sci-Fi|Thriller,Ready Player One,
9091,143410,Hyena Road,(no genres listed),Hyena Road,
9138,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),The Adventures of Sherlock Holmes and Doctor W...,
9179,149334,Nocturnal Animals,Drama|Thriller,Nocturnal Animals,
9259,156605,Paterson,(no genres listed),Paterson,
9367,162414,Moonlight,Drama,Moonlight,
9448,167570,The OA,(no genres listed),The OA,
9514,171495,Cosmos,(no genres listed),Cosmos,
9515,171631,Maria Bamford: Old Baby,(no genres listed),Maria Bamford: Old Baby,


In [225]:
movie_dict = {
    "Babylon 5":  1994,
    "Ready Player One": 2018,
    "Hyena Road": 2015,
    "The Adventures of Sherlock Holmes and Doctor Watson": 1979,
    "Nocturnal Animals": 2016,
    "Paterson": 2016,
    "Moonlight": 2016,
    "The OA": 1990,
    "Cosmos": 2019,
    "Maria Bamford: Old Baby": 2017,
    "Generation Iron 2": 2017,
    "Black Mirror": 2018,
}

In [226]:
# Replace missing years using the dictionary
movies_df['year'] = movies_df['moviename'].map(movie_dict).fillna(movies_df['year'])

# Convert Year to integers (replace NaN first, if necessary)
movies_df['year'] = movies_df['year'].astype(int)

In [227]:
# index = movies_df[movies_df["genres"]=="(no genres listed)"].index
# movies_df.loc[index]

In [228]:
movies_df.head()

Unnamed: 0,movieId,title,genres,moviename,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995


In [229]:
merged_df = pd.merge(full_df, movies_df, on ='movieId', how='left') 

merged_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,moviename,year
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,Heat,1995
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,Seven (a.k.a. Se7en),1995
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,"Usual Suspects, The",1995


In [230]:

merged_df.drop(columns=['title', 'timestamp'], inplace= True, axis= 1 )


In [231]:
merged_df.head()

Unnamed: 0,userId,movieId,rating,genres,moviename,year
0,1,1,4.0,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,1,3,4.0,Comedy|Romance,Grumpier Old Men,1995
2,1,6,4.0,Action|Crime|Thriller,Heat,1995
3,1,47,5.0,Mystery|Thriller,Seven (a.k.a. Se7en),1995
4,1,50,5.0,Crime|Mystery|Thriller,"Usual Suspects, The",1995


In [232]:
merged_df.isnull().sum()

userId           0
movieId          0
rating       10000
genres           0
moviename        0
year             0
dtype: int64

In [233]:
# Word2Vec for moviename
merged_df['moviename_tokens'] = merged_df['moviename'].str.lower().str.split()
merged_df['genres_tokens'] = merged_df['genres'].str.lower().str.split()
sentences = merged_df['moviename_tokens'].tolist() + merged_df['genres_tokens'].tolist()
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Aggregate embeddings for each record
def aggregate_embeddings(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:  # Check if the list is not empty
        return np.mean(vectors, axis=0)  # Aggregate using mean
    else:
        return np.zeros(model.vector_size)  # Handle cases where no tokens are in the model

merged_df['moviename_embedding'] = merged_df['moviename_tokens'].apply(lambda tokens: aggregate_embeddings(tokens, model))
merged_df['genres_embedding'] = merged_df['genres_tokens'].apply(lambda tokens: aggregate_embeddings(tokens, model))

# Convert embeddings to separate DataFrame
embedding_columns = ['embedding_' + str(i) for i in range(model.vector_size)]
moviename_embeddings_df = pd.DataFrame(merged_df['moviename_embedding'].tolist(), columns=['moviename_' + col for col in embedding_columns])
genres_embeddings_df = pd.DataFrame(merged_df['genres_embedding'].tolist(), columns=['genres_' + col for col in embedding_columns])

# Combine embeddings with original numerical columns
combined_df = pd.concat([merged_df[['rating', 'year']], moviename_embeddings_df, genres_embeddings_df], axis=1)


display(combined_df)


Unnamed: 0,rating,year,moviename_embedding_0,moviename_embedding_1,moviename_embedding_2,moviename_embedding_3,moviename_embedding_4,moviename_embedding_5,moviename_embedding_6,moviename_embedding_7,...,genres_embedding_90,genres_embedding_91,genres_embedding_92,genres_embedding_93,genres_embedding_94,genres_embedding_95,genres_embedding_96,genres_embedding_97,genres_embedding_98,genres_embedding_99
0,4.0,1995,-0.755446,-0.791135,0.346317,0.292695,-0.862608,-1.727786,-0.949643,1.299842,...,0.002450,0.000901,-0.009923,0.002991,0.007183,0.003989,0.002472,-0.003548,0.002340,-0.007633
1,4.0,1995,0.253690,0.126798,1.068116,-0.101448,-0.315592,-1.095394,-0.204204,0.598742,...,0.009071,0.008938,-0.008208,-0.003012,0.009887,0.005104,-0.001588,-0.008692,0.002962,-0.006676
2,4.0,1995,-0.026210,0.003969,0.151767,-0.014253,0.076049,-0.202009,0.007247,0.178698,...,0.007012,0.004829,0.008683,0.007094,-0.005694,0.007241,-0.009295,-0.002588,-0.007757,0.004193
3,5.0,1995,-0.156285,-0.150410,1.089430,-0.599133,0.433864,-1.665902,0.066529,0.881334,...,-0.006826,-0.003619,0.009937,0.005676,0.001596,-0.003494,0.004417,0.002532,0.008358,-0.000002
4,5.0,1995,-0.025828,-0.256255,0.407373,0.051606,-0.418241,-0.615699,0.197824,0.805265,...,0.000727,-0.001657,0.006248,-0.003234,-0.004053,-0.008478,-0.003468,0.002827,-0.001695,0.000807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,,2010,-1.216333,-0.442747,0.048823,0.021069,-0.066570,-1.395004,-1.137117,0.717975,...,0.003759,-0.002840,0.007913,-0.006181,0.005231,-0.001977,0.007500,-0.001282,0.008403,0.008658
100832,,2011,-0.906493,-0.828217,0.253282,0.239716,-0.465846,-1.121098,-0.410370,0.063499,...,-0.001945,-0.001657,0.006611,-0.009630,0.004056,-0.000957,0.007996,-0.002234,-0.007999,-0.003678
100833,,2012,-0.198941,-0.292509,0.134355,-0.316059,-0.464739,-0.659486,-0.063459,0.773165,...,0.003249,-0.002830,-0.008570,-0.006258,-0.009462,0.008438,-0.009379,0.007231,0.009262,-0.006261
100834,,2011,-0.167588,0.175141,0.621650,-0.106964,-0.556411,-1.091580,-0.441783,1.888779,...,-0.000959,0.001310,-0.008595,0.008749,-0.009208,-0.009625,-0.008512,0.007313,0.005466,0.009249


In [234]:
condition = combined_df['rating'].isnull()

In [235]:
train_data = combined_df[~condition]

test_data = combined_df[condition]

In [236]:
X = train_data.drop(columns=['rating'])  
y = train_data['rating']  

In [237]:
# Step 1: Split the data into training and testing sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [240]:
test_data.drop(columns="rating", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.drop(columns="rating", axis=1, inplace=True)


# Model One: Random Forest

In [242]:
# Initialize and train the model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Step 3: Predict on the training set
y_train_pred = model.predict(X_train)

# Step 4: Predict on the validation set
y_val_pred = model.predict(X_valid)

# Step 5: Calculate the MSE
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_valid, y_val_pred)

print(f"Training MSE: {train_mse}")
print(f"Validation MSE: {val_mse}")

Training MSE: 0.7575535925824948
Validation MSE: 0.932731399773928


In [244]:
# Step 4: Predict on the validation set
y_pred_test = model.predict(test_data)

rounded_test_predictions = y_pred_test.tolist()

# Print the final binary predictions
print(sorted(rounded_test_predictions))

[2.1893048864630806, 2.1893048864630806, 2.1893048864630806, 2.1893048864630806, 2.1893048864630806, 2.1893048864630806, 2.1893048864630806, 2.1893048864630806, 2.1893048864630806, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.2272391876740154, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.3462677041929982, 2.394832722431278, 2.394832722431278, 2.394832722431278, 2.394832722431278, 2.394832722431278, 2.394832722431278, 2.394832722431278, 2.394832722431278, 2.4526719369532737, 2.4526719369532737, 2.4526719369532737, 2.45267

# Model Two: Neural Networks