In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectFromModel

In [86]:
train_df = pd.read_csv("movie_ratings_train.csv")
test_df = pd.read_csv("movie_ratings_test.csv")
movies_df = pd.read_csv("movies.csv")

In [87]:
# Check for duplicates in the training dataset
train_duplicates = train_df.duplicated()
print("Duplicate rows in training data:")
print(train_duplicates.value_counts())

Duplicate rows in training data:
False    90836
Name: count, dtype: int64


In [88]:
# Concatenating the datasets
full_df = pd.concat([train_df, test_df], ignore_index=True)

# View the combined data
full_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,81834,,1493846730
100832,610,87232,,1493845469
100833,610,91500,,1493845427
100834,610,91658,,1493845240


In [89]:
movies_df['moviename'] = movies_df['title'].str.extract(r'^(.*)\s\((\d{4})\)$')[0]
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)')
movies_df.head()


Unnamed: 0,movieId,title,genres,moviename,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995


In [90]:
movies_df.head()

Unnamed: 0,movieId,title,genres,moviename,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995


In [91]:
merged_df = pd.merge(full_df, movies_df, on ='movieId', how='left') 

merged_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,moviename,year
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,Heat,1995
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,Seven (a.k.a. Se7en),1995
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,"Usual Suspects, The",1995


In [92]:

merged_df.drop(columns=['title', 'timestamp'], inplace= True, axis= 1 )


In [93]:
merged_df.head()

Unnamed: 0,userId,movieId,rating,genres,moviename,year
0,1,1,4.0,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,1,3,4.0,Comedy|Romance,Grumpier Old Men,1995
2,1,6,4.0,Action|Crime|Thriller,Heat,1995
3,1,47,5.0,Mystery|Thriller,Seven (a.k.a. Se7en),1995
4,1,50,5.0,Crime|Mystery|Thriller,"Usual Suspects, The",1995


In [94]:
# One-hot encode the 'Category' column
merged_df_encoded = pd.get_dummies(merged_df, columns=['genres', 'moviename'])


In [95]:
merged_df_encoded.head()

Unnamed: 0,userId,movieId,rating,year,genres_(no genres listed),genres_Action,genres_Action|Adventure,genres_Action|Adventure|Animation,genres_Action|Adventure|Animation|Children,genres_Action|Adventure|Animation|Children|Comedy,...,moviename_Zulu,moviename_[REC],moviename_[REC]²,moviename_[REC]³ 3 Génesis,moviename_anohana: The Flower We Saw That Day - The Movie,moviename_eXistenZ,moviename_xXx,moviename_xXx: State of the Union,moviename_¡Three Amigos!,moviename_À nous la liberté (Freedom for Us)
0,1,1,4.0,1995,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,3,4.0,1995,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,6,4.0,1995,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,47,5.0,1995,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1,50,5.0,1995,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [96]:
condition = merged_df_encoded['rating'].isnull()

In [97]:
train_data = merged_df_encoded[~condition]

test_data = merged_df_encoded[condition]

In [98]:
X = train_data.drop(columns=['rating'])  
y = train_data['rating']  

In [99]:
# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [100]:
# Step 2: Define the range of alpha values to test
alphas = np.logspace(-6, 6, 200)

best_alpha = None
best_score = float('inf')  # Start with an infinitely large score to minimize

# Step 3: Loop through alpha values and evaluate performance
for alpha in alphas:
    # Apply Lasso for feature selection with a specific alpha
    lasso = Lasso(alpha=alpha, random_state=42)  # Test a single alpha value
    
    lasso.fit(X_train, y_train)
    
    # Step 4: Select features with non-zero coefficients (feature selection)
    sfm = SelectFromModel(lasso, threshold=0.01)  # Select features with non-zero coefficients
    X_train_selected = sfm.transform(X_train)
    X_test_selected = sfm.transform(X_test)
    
    # Skip iteration if no features are selected
    if X_train_selected.shape[1] == 0:
        print(f"Warning: No features selected for alpha={alpha}. Skipping...")
        continue
    
    # Step 5: Apply Random Forest Regressor with selected features
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train_selected, y_train)
    
    # Step 6: Make predictions
    y_pred = rf.predict(X_test_selected)
    
    # Step 7: Evaluate the model (using Mean Squared Error)
    mse = mean_squared_error(y_test, y_pred)
    
    # Update best_alpha if we get a lower MSE
    if mse < best_score:
        best_score = mse
        best_alpha = alpha

print(f"Best alpha: {best_alpha} with MSE: {best_score:.2f}")


MemoryError: Unable to allocate 5.10 GiB for an array with shape (9422, 72668) and data type float64

In [None]:
# # Setting up cross-validated Lasso Model
# lassoCV = LassoCV(alphas=np.logspace(-6, 6, 200), # Creating range of alphas from 0.000001 to 1000000 for 200 evenly spaced samples  
#                   cv=3,
#                   random_state=1,
#                   max_iter= 12500,
#                   n_jobs=-1)

# # Fitting a lasso model on training X and Y values
# tuned_lasso = lassoCV.fit(X_train, y_train)

# # Displaying the optimum Lasso the minimizes the Root mean squared error
# print(f"Optimum Lasso: {tuned_lasso.alpha_}")

# Optimum_lasso = tuned_lasso.alpha_




# # # Obtaining the Training RMSE obtained from the optimum lasso
# # training_mse = np.min(tuned_lasso.mse_path_.mean(1))
# # print(f'Training RMSE: {math.sqrt(training_mse)}')

# # # Predicting on the validation dataset predictors
# # y_valid_pred = tuned_lasso.predict(x_valid)

# # # Obtaining the Test RMSE based on predictions obtained from the optimum lasso
# # validation_rmse = root_mean_squared_error(y_valid, y_valid_pred)

# # print("Validation RMSE:", validation_rmse)

MemoryError: Unable to allocate 5.10 GiB for an array with shape (9422, 72668) and data type float64

In [None]:
# Extracting feature names with non-zero coefficients
selected_features = [X_train.columns[i] for i in range(len(tuned_lasso.coef_)) if tuned_lasso.coef_[i] != 0]
print(len(selected_features))
print("Selected Features:", selected_features)

3
Selected Features: ['userId', 'movieId', 'timestamp']
