In [None]:
# Import necessary libraries
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn import preprocessing

In [None]:
# Load the dataset.
# The file location can be overridden with the MOVIE_DATASET_PATH environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "MOVIE_DATASET_PATH",
    "D:/Documents/Data Sets/movie_dataset.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the top of the dataset (head() returns 5 rows by default)
df.head()

In [None]:
# Dataset dimensions, displayed as a (rows, columns) tuple
n_rows, n_cols = df.shape
(n_rows, n_cols)

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Remove, in place, every row that has a missing value in at least one column.
# NOTE(review): this considers all columns, including ones the model never
# uses, so it may discard more rows than necessary -- confirm this is intended.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-count missing values per column; every count is expected to be zero now
df.isna().sum()

In [None]:
# Print a concise summary of the dataset (columns, non-null counts, dtypes)
# as it stands before any categorical encoding is applied
df.info()

In [None]:
# Pearson correlation coefficient between the budget and revenue columns
cor = df['budget'].corr(
    other=df['revenue'],
    method='pearson',
)
cor

In [None]:
# Encode categorical (text) columns as integer codes using Label Encoding.
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the
# codes of any column can later be mapped back to the original values with
# label_encoders[column].inverse_transform(...). Previously a single encoder
# was refit for every column, which kept only the last column's mapping.
# A dedicated name is used instead of `lr`, which a later cell binds to the
# LinearRegression model.
# NOTE(review): the resulting integers are category identifiers, not
# quantities -- interpret correlations involving these columns with care.
CATEGORICAL_COLUMNS = [
    'title',
    'original_title',
    'original_language',
    'status',
    'spoken_languages',
    'production_countries',
    'production_companies',
    'genres',
    'overview',
    'release_date',
]

label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    encoder = preprocessing.LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder


In [None]:
# Display information about the dataset after encoding; the label-encoded
# columns should now report a numeric dtype instead of object
df.info()

In [None]:
# Draw an annotated heatmap of the pairwise correlations between all
# numeric columns of the dataset
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .8},
    ax=ax,
)
ax.set_title('Heatmap of Feature Correlations', fontsize=20)
# Slant the x tick labels so long column names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

In [None]:
# Pick the predictor columns (X) and the regression target (Y)
feature_columns = ['budget', 'popularity', 'runtime']
target_column = 'revenue'
X = df[feature_columns]
Y = df[target_column]


In [None]:
# Split the data into training (60%) and testing (40%) sets.
# A fixed random_state makes the shuffle -- and therefore the error reported
# at the end of the notebook -- reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Instantiate an ordinary least-squares regressor (intercept fitted, the default)
lr = LinearRegression(fit_intercept=True)

In [None]:
# Train the regressor on the training features and targets
lr.fit(X=x_train, y=y_train)

In [None]:
# Predict revenue for the held-out test features
pred = lr.predict(X=x_test)

In [None]:
# Report the Mean Absolute Error of the predictions on the test set
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=pred)
print(mae)