<a href="https://colab.research.google.com/github/ramyasri497/codsoft/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from sklearn.impute import SimpleImputer

# Loading the dataset
df = pd.read_csv('imdb.csv')  # Replace with your actual filename

# Storing movie names for final output
movie_names_all = df['Movie_name']

# Drop unnecessary columns
df = df.drop(columns=['Rank', 'Movie_name'])

# Handling missing values.
# Converting Metascore and Gross to numeric.
df['Metascore'] = pd.to_numeric(df['Metascore'], errors='coerce')
df['Gross_n_$_M'] = pd.to_numeric(df['Gross_n_$_M'], errors='coerce')

# Imputing missing numerical values with median
num_cols = ['Metascore', 'Gross_n_$_M', 'Runtime_n_mn']
imputer = SimpleImputer(strategy='median')
df[num_cols] = imputer.fit_transform(df[num_cols])


# Encoding categorical variables
le_certificate = LabelEncoder()
df['Certificate'] = le_certificate.fit_transform(df['Certificate'].astype(str))

# Simplifying Genre: using the first listed genre
df['Genre'] = df['Genre'].astype(str).apply(lambda x: x.split(',')[0])
le_genre = LabelEncoder()
df['Genre'] = le_genre.fit_transform(df['Genre'])

# Defining features and target
X = df.drop(columns=['Rating_from_10'])
y = df['Rating_from_10']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

movie_names_test = movie_names_all.iloc[X_test.index]

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))

# Create result DataFrame
results_df = pd.DataFrame({
    'Movie Name': movie_names_test.values,
    'Predicted Rating': y_pred
})

# Show top results
print(results_df.head(10))

Mean Squared Error: 0.05
R² Score: 0.18
MAE: 0.18462499999999976
MSE: 0.05301710499999992
RMSE: 0.2302544353535886
R² Score: 0.1762925703809597
                                Movie Name  Predicted Rating
0  Harry Potter and the Prsoner of Azkaban             8.173
1                       The Asphalt Jungle             7.937
2                                  Gaslght             7.950
3                                 The Fall             7.875
4                                  Ran Man             8.258
5                        Jont Securty Area             7.957
6                           The Best Offer             7.876
7                                    3-ron             7.768
8                                    Naked             8.009
9                               Ctzen Kane             8.225
