In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset, try specifying the encoding
df = pd.read_csv('/content/IMDb Movies India.csv', encoding='latin-1')


In [4]:
print(df.head())

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

In [10]:
# Separate features (X) and target variable (y)
X = df[['Genre', 'Director', 'Actor 1', 'Actor 2']] # Example features, adjust as per your dataset
y = df['Rating']  # Target variable

# Handle missing values in the target variable (y)
# Option 1: Remove rows with missing ratings
df_cleaned = df.dropna(subset=['Rating'])  # Create a new DataFrame without missing ratings
X = df_cleaned[['Genre', 'Director', 'Actor 1', 'Actor 2']]
y = df_cleaned['Rating']

# Perform one-hot encoding for categorical variables (genre, director, actors)
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize the linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R2): {r2}')

# Example to print coefficients (feature importances for linear regression)
print("Coefficients:", model.coef_)

# Example prediction for a new movie
# Suppose you have a new movie with genre 'Action', director 'John Doe', and actors 'Actor A, Actor B'
new_movie = [['Action', 'John Doe', 'Actor A', 'Actor B']] # Adjusted to match the number of features
new_movie_encoded = encoder.transform(new_movie)
predicted_rating = model.predict(new_movie_encoded)
print(f'Predicted rating for the new movie: {predicted_rating[0]}')

Mean Squared Error (MSE): 3.471247334121333
R-squared (R2): -0.8671201393784889
Coefficients: [-0.57985634 -0.06465365  0.44892296 ...  0.2828926   0.
  0.3505851 ]
Predicted rating for the new movie: 5.253091423102215


