### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Read the IMDb India movies CSV; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    "../input/imdb-india-movies/IMDb Movies India.csv",
    encoding='ISO-8859-1',
)
df.head()

### Data Preprocessing:

In [None]:
def dataoveriew(df, message):
    """Print a short textual summary of ``df`` under the heading ``message``.

    The report lists, in order: the row count, the column count, the column
    names, the total number of missing cells across the whole frame, and the
    number of distinct values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    message : str
        Heading printed first, followed by a colon and a blank line.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    row_count, feature_count = df.shape
    # Per-column null counts collapsed into one grand total.
    missing_total = df.isnull().sum().values.sum()

    # Each tuple holds the positional arguments of one print() call.
    report = [
        (f'{message}:\n',),
        ("Rows:", row_count),
        ("\nNumber of features:", feature_count),
        ("\nFeatures:",),
        (df.columns.tolist(),),
        ("\nMissing values:", missing_total),
        ("\nUnique values:",),
        (df.nunique(),),
    ]
    for parts in report:
        print(*parts)

In [None]:
# Summarise the frame exactly as loaded. No train/test split exists yet, so the
# heading describes it as the raw dataset.
dataoveriew(df, 'Overview of the raw dataset')

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Column dtypes and non-null counts of the frame as loaded.
df.info()

In [None]:
# Value counts for the categorical columns, starting with Genre
# (Director and Actor 1 follow in the next cells).
df['Genre'].value_counts()

In [None]:
# How many rows each director appears in.
df['Director'].value_counts()

In [None]:
# How many rows each first-billed actor appears in.
df['Actor 1'].value_counts()

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# Rating is what we predict, so rows missing it (or any of the other fields
# listed below) are removed in place; the per-column null counts are then
# displayed again to confirm the result.
df.dropna(
    subset=['Name', 'Year', 'Duration', 'Votes', 'Rating'],
    inplace=True,
)
df.isna().sum()

In [None]:
# Preview the first rows now that incomplete records have been removed.
df.head()

In [None]:
# Second summary, taken after the rows with missing key fields were dropped.
# The heading differs from the first report so the two outputs can be told apart.
dataoveriew(df, 'Overview of the dataset after dropping rows with missing key fields')

In [None]:
# Remove parentheses from 'Year' column and convert to integer.
# Casting to str first keeps this cell re-runnable: once the column is already
# int, calling .str on it directly raises AttributeError.
df['Year'] = df['Year'].astype(str).str.strip('()').astype(int)

In [None]:
# Remove commas from 'Votes' column and convert to integer.
# Casting to str first keeps this cell re-runnable: once the column is already
# int, calling .str on it directly raises AttributeError.
# regex=False makes the replacement a plain literal substitution.
df['Votes'] = df['Votes'].astype(str).str.replace(',', '', regex=False).astype(int)

In [None]:
# Remove the 'min' suffix from 'Duration' column and convert to integer.
# Casting to str first keeps this cell re-runnable: once the column is already
# int, calling .str on it directly raises AttributeError.
# The strip() discards the whitespace left behind after the suffix is removed.
df['Duration'] = (
    df['Duration']
    .astype(str)
    .str.replace('min', '', regex=False)
    .str.strip()
    .astype(int)
)

In [None]:
# Re-check the dtypes after the Year, Votes and Duration conversions.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Drop Genre column.
# errors='ignore' keeps the cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
df = df.drop(columns='Genre', errors='ignore')

In [None]:
# Preview the first rows after the Genre column was dropped.
df.head()

### Exploratory Data Analysis

In [None]:
# Four-panel overview: spread of Votes, distributions of Year and Rating, and
# Duration plotted against Rating.
fig, axes = plt.subplots(2, 2, figsize=(14, 7))

sns.boxplot(x='Votes', data=df, ax=axes[0, 0])
axes[0, 0].set_title('Votes')

# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is the replacement for the histogram-plus-curve plot.
sns.histplot(df['Year'], kde=True, stat='density', color='g', ax=axes[0, 1])
axes[0, 1].set_title('Year distribution')

sns.histplot(df['Rating'], kde=True, stat='density', color='g', ax=axes[1, 0])
axes[1, 0].set_title('Rating distribution')

# Columns are referenced by name through data=, so the Series are not passed again.
sns.scatterplot(x='Duration', y='Rating', data=df, ax=axes[1, 1])
axes[1, 1].set_title('Duration vs Rating')

fig.tight_layout()
plt.show()

In [None]:
# Histogram of every numeric column; the trailing semicolon keeps the returned
# array of axes from being echoed as cell output.
df.hist(figsize=(30, 15));

In [None]:
# Heatmap of pairwise correlations.
# The frame still holds text columns (Name, Director, Actor 1-3) at this point.
# On pandas >= 2.0 DataFrame.corr() raises ValueError on non-numeric columns, so
# the numeric columns are selected explicitly before correlating.
corrmat = df.select_dtypes(include='number').corr()
fig = plt.figure(figsize=(20, 5))

sns.heatmap(corrmat, vmax=.8, square=True, annot=True)
plt.show()

In [None]:
# Preview the frame before the text columns are removed.
df.head()

### Feature Engineering:

In [None]:
# Remove the free-text columns, leaving Year, Duration, Rating and Votes.
# errors='ignore' keeps the cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone.
df = df.drop(
    columns=['Name', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'],
    errors='ignore',
)
df.head()

In [None]:
# Target: the rating to predict. Features: the three numeric predictors.
y = df['Rating']
X = df[
    ['Year', 'Duration', 'Votes']
]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1000,
)

### Model Building:

In [None]:
# Two-step estimator: standardise the features, then fit a linear model with
# stochastic gradient descent (seeded for reproducibility).
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('sgd', SGDRegressor(max_iter=10000, random_state=1000)),
    ]
)

In [None]:
# Fit the scaler and the regressor on the training split only.
pipeline.fit(X_train, y_train)

In [None]:
# Predict ratings for the held-out test split with the fitted pipeline.
y_pred_pipeline = pipeline.predict(X_test)

### Model Evaluation:

In [None]:
# Score the test-set predictions with MAE, MSE and R-squared, in that order.
mae_pipeline, mse_pipeline, r2_pipeline = (
    metric(y_test, y_pred_pipeline)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Report the three evaluation scores, one per line.
print(
    f"Pipeline Mean Absolute Error: {mae_pipeline}",
    f"Pipeline Mean Squared Error: {mse_pipeline}",
    f"Pipeline R-squared: {r2_pipeline}",
    sep="\n",
)

### Model Deployment:

In [None]:
# One hypothetical movie to score. Edit the values below to try another one;
# the column names match the features the pipeline was fitted on.
new_input = pd.DataFrame(
    {
        'Year': [2023],       # release year
        'Duration': [120],    # running time in minutes
        'Votes': [10000],     # number of votes
    }
)

# The fitted pipeline scales the row and returns a one-element array.
predicted_rating = pipeline.predict(new_input)

print("Predicted Rating:", predicted_rating)
