In [5]:
# Import necessary libraries
import pandas as pd

# Load dataset from a CSV file
file_path = "/content/imdb-videogames[1].csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Initial Dataset Preview:")
print(data.head())

# Remove commas from 'votes' column and convert to numeric
if 'votes' in data.columns:
    data['votes'] = data['votes'].str.replace(',', '').astype(float)

# Check for missing values
missing_values = data.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# Drop rows with missing target variable ('rating') values
data_cleaned = data.dropna(subset=['rating'])

# Handle missing values in feature columns
numeric_features = ['year', 'votes']
data_cleaned[numeric_features] = data_cleaned[numeric_features].fillna(data_cleaned[numeric_features].mean())

genre_features = ['Action', 'Adventure', 'Comedy', 'Crime', 'Family', 'Fantasy', 'Mystery', 'Sci-Fi', 'Thriller']
data_cleaned[genre_features] = data_cleaned[genre_features].fillna(0)

# Remove duplicates if any
data_cleaned = data_cleaned.drop_duplicates()
data_cleaned[numeric_features] = data_cleaned[numeric_features].apply(pd.to_numeric, errors='coerce')

# Display the cleaned dataset
print("\nCleaned Dataset Preview:")
print(data_cleaned.head())

# Save the cleaned dataset to a new file (optional)
cleaned_file_path = "cleaned_dataset.csv"
data_cleaned.to_csv(cleaned_file_path, index=False)
print(f"\nCleaned dataset saved to {cleaned_file_path}")


Initial Dataset Preview:
   Unnamed: 0                        name  \
0           0                  Spider-Man   
1           1      Red Dead Redemption II   
2           2          Grand Theft Auto V   
3           3                  God of War   
4           4  Uncharted 4: A Thief's End   

                                                 url    year certificate  \
0  https://www.imdb.com/title/tt5807780/?ref_=adv...  2018.0           T   
1  https://www.imdb.com/title/tt6161168/?ref_=adv...  2018.0           M   
2  https://www.imdb.com/title/tt2103188/?ref_=adv...  2013.0           M   
3  https://www.imdb.com/title/tt5838588/?ref_=adv...  2018.0           M   
4  https://www.imdb.com/title/tt3334704/?ref_=adv...  2016.0           T   

   rating   votes                                               plot  Action  \
0     9.2  20,759  When a new villain threatens New York City, Pe...    True   
1     9.7  35,703  Amidst the decline of the Wild West at the tur...    True   
2     9

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned[numeric_features] = data_cleaned[numeric_features].fillna(data_cleaned[numeric_features].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned[genre_features] = data_cleaned[genre_features].fillna(0)


In [9]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Use the cleaned dataset from your earlier process
cleaned_file_path = "/content/cleaned_dataset.csv"
data_cleaned = pd.read_csv(cleaned_file_path)

# Feature Selection
X = data_cleaned[['year', 'votes', 'Action', 'Adventure', 'Comedy', 'Crime', 'Family', 'Fantasy', 'Mystery', 'Sci-Fi', 'Thriller']]
y = data_cleaned['rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Display evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")


Mean Squared Error (MSE): 1.3762249524635954
Mean Absolute Error (MAE): 0.8559535528743287


In [11]:
# Ensure indices are reset for consistency
y_test = y_test.reset_index(drop=True)
y_pred = pd.Series(y_pred, name="Predicted Rating")
X_test = X_test.reset_index(drop=True)
game_names = data_cleaned.loc[X_test.index, 'name'].reset_index(drop=True)

error_analysis = pd.DataFrame({
    'Game': game_names,
    'Actual Rating': y_test,
    'Predicted Rating': y_pred,
    'Error': y_test - y_pred
})

# Sort by the largest absolute errors
error_analysis['Absolute Error'] = abs(error_analysis['Error'])
error_analysis = error_analysis.sort_values(by='Absolute Error', ascending=False)

# Display the top 5 samples with the largest errors
print("\nTop 5 Samples with Largest Errors:")
print(error_analysis.head())



Top 5 Samples with Largest Errors:
                                         Game  Actual Rating  \
1109  Ratchet & Clank Future: A Crack in Time            1.4   
2034                                    Rambo            1.3   
1841                               Drakengard            2.2   
169          Wolfenstein II: The New Colossus            1.8   
80                                  Minecraft            2.0   

      Predicted Rating     Error  Absolute Error  
1109          7.467000 -6.067000        6.067000  
2034          6.971700 -5.671700        5.671700  
1841          7.565833 -5.365833        5.365833  
169           6.900000 -5.100000        5.100000  
80            7.051000 -5.051000        5.051000  
