<a href="https://colab.research.google.com/github/prabhajeeva/CODSOFT/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Movie Rating Prediction with Python
# Google Colab Compatible

# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# Step 2: Load Dataset
# Upload the CSV through the Colab file picker.
import io

from google.colab import files

uploaded = files.upload()

# files.upload() returns a dict mapping each uploaded file's *original* name
# to its raw bytes. On disk, Colab may store the file under a de-duplicated
# name (e.g. "IMDb Movies India (5).csv"), so reading a hard-coded path can
# silently load a stale copy left behind by an earlier upload. Parse the bytes
# that were just uploaded instead.
# encoding='latin-1' handles potential encoding issues in the file.
if uploaded:
    uploaded_name = next(iter(uploaded))  # first (normally the only) upload
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]), encoding='latin-1')
else:
    # Nothing was selected: fall back to a copy already present in the
    # working directory, as the original cell did.
    df = pd.read_csv("IMDb Movies India.csv", encoding='latin-1')

# Step 3: Explore Data
# Print the frame's dimensions, a preview of the first rows, and the number
# of missing entries per column, in that order.
overview = (
    ("Dataset Shape:", df.shape),
    ("\nFirst 5 Rows:\n", df.head()),
    ("\nMissing Values:\n", df.isna().sum()),
)
for heading, summary in overview:
    print(heading, summary)

# Step 4: Preprocessing
# 'Rating' is the prediction target. Rows without it carry no label to learn
# from; filling them with the mean rating would fabricate targets and distort
# the evaluation metrics, so those rows are dropped instead. The index is
# reset so features and target stay row-aligned below.
df = df.dropna(subset=['Rating']).reset_index(drop=True)

# Missing categorical values become their own 'Unknown' category.
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
df[categorical_features] = df[categorical_features].fillna('Unknown')

# Year and Duration are stored as text such as "(2019)" and "109 min", which a
# regression model cannot consume; pull out the digits and convert to numbers.
# Votes is converted the same way; stripping commas is defensive in case the
# counts contain thousands separators. Anything unparseable becomes NaN.
numeric_features = ['Year', 'Duration', 'Votes']
df['Year'] = pd.to_numeric(
    df['Year'].astype(str).str.extract(r'(\d{4})', expand=False), errors='coerce'
)
df['Duration'] = pd.to_numeric(
    df['Duration'].astype(str).str.extract(r'(\d+)', expand=False), errors='coerce'
)
df['Votes'] = pd.to_numeric(
    df['Votes'].astype(str).str.replace(',', '', regex=False), errors='coerce'
)

# LinearRegression rejects NaN inputs, so fill remaining gaps with each
# column's median.
# NOTE(review): the medians are computed on the full data set, before the
# train/test split; move the imputation after the split for a strict evaluation.
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].median())

# Step 5: Feature Encoding
encoder = OneHotEncoder(handle_unknown='ignore')
# Convert to a dense array so it can be wrapped in a DataFrame.
# NOTE(review): one column is created per distinct genre/director/actor value,
# so this dense matrix can be very wide — watch memory on large inputs.
encoded_features = encoder.fit_transform(df[categorical_features]).toarray()

# Create DataFrame for encoded features
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
)

# Combine with numeric features
# Only the parsed numeric columns are kept alongside the one-hot columns; the
# free-text 'Name' column and the target 'Rating' are left out of the features.
numeric_df = df[numeric_features]
final_df = pd.concat([numeric_df.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)


# Step 6: Prepare Data for Modeling
X = final_df
y = df['Rating']

# Step 7: Split Data
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Step 8: Build and Train Model
# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)

# Step 9: Evaluate Model
# Score the held-out predictions with mean squared error and R-squared.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

for metric_label, metric_value in (("\nMean Squared Error:", mse), ("R-squared:", r2)):
    print(metric_label, metric_value)

# Step 10: Make Predictions (Example)
# As a stand-in for a new movie, use a synthetic "average" row: the
# column-wise mean of every feature column in final_df. To score a real new
# movie, build its row with the same preprocessing and the already-fitted
# encoder (encoder.transform) so its columns match final_df.
#
# The sample is kept as a one-row DataFrame carrying final_df's column labels.
# The model was fitted on a DataFrame, and passing a bare ndarray would drop
# the feature names, which makes scikit-learn emit a warning and skip its
# check that the columns match those seen during fit.
sample_data = final_df.mean().to_frame().T
predicted_rating = model.predict(sample_data)
print("\nPredicted Rating for a Sample Movie:", predicted_rating[0])

Saving IMDb Movies India.csv to IMDb Movies India (5).csv
Dataset Shape: (15509, 10)

First 5 Rows:
                                  Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

         