In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('mov.csv', encoding='ISO-8859-1')

# Step 1: Explore the dataset
print(df.head())  # View the first few rows of the dataset
print(df.info())  # Check column data types and for missing values
print(df.describe())  # Statistical summary of numerical features

# Step 2: Drop rows with missing 'Rating' values (since it's the target variable)
df = df.dropna(subset=['Rating'])

# Step 3: Keep only relevant columns
df = df[['Name', 'Year', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Duration', 'Votes', 'Rating']]

# Step 4: Convert 'Year', 'Votes', and 'Rating' to numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Step 5: Handle 'Duration'
df['Duration'] = df['Duration'].astype(str)
df['Duration'] = df['Duration'].str.replace(' min', '', regex=False)  # Remove ' min' text
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')

# Step 6: One-hot encode categorical variables ('Genre', 'Director', 'Actor 1', 'Actor 2')
df = pd.get_dummies(df, columns=['Genre', 'Director', 'Actor 1', 'Actor 2'], drop_first=True)

# Step 7: Drop any remaining non-numeric columns that are not relevant (like 'Name')
df = df.drop('Name', axis=1)

# Step 8: Check for any remaining non-numeric columns
print("Remaining non-numeric columns:", df.select_dtypes(include=['object']).columns)

# Step 9: Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')  # Impute missing values with the mean
X = df.drop('Rating', axis=1)  # Features
y = df['Rating']  # Target

# Step 10: Perform imputation
X_imputed = imputer.fit_transform(X)  # Impute missing values

# Step 11: Convert back to DataFrame without reassigning the column names yet
X_imputed = pd.DataFrame(X_imputed)

# Step 12: Check the shapes before assigning column names
print("Shape of X_imputed:", X_imputed.shape)
print("Original number of columns in X:", X.shape[1])

# Step 13: Ensure we assign the correct number of column names
if X_imputed.shape[1] == X.shape[1]:
    X_imputed.columns = X.columns  # Assign column names if shapes match
else:
    print("Warning: The number of columns after imputation does not match the original columns.")

# Step 14: Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Step 15: Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)  # Train the model on the training data

# Step 16: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 17: Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Step 18: Output the results
print("Mean Squared Error:", mse)  # Lower MSE indicates better performance
print("R-squared:", r2)  # R-squared closer to 1 means a better model fit


                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    



Shape of X_imputed: (7919, 8993)
Original number of columns in X: 8994
Mean Squared Error: 1.5227435117029856e+16
R-squared: -8190557468669242.0
