In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

##### **Load the Dataset**

In [81]:
data=pd.read_csv('Rotten_Tomatoes_Movies3.csv')

##### **Basic Checks**

In [82]:
data.columns

Index(['movie_title', 'movie_info', 'critics_consensus', 'rating', 'genre',
       'directors', 'writers', 'cast', 'in_theaters_date', 'on_streaming_date',
       'runtime_in_minutes', 'studio_name', 'tomatometer_status',
       'tomatometer_rating', 'tomatometer_count', 'audience_rating'],
      dtype='object')

In [83]:
data.shape

(16638, 16)

In [84]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16638 entries, 0 to 16637
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_title         16638 non-null  object 
 1   movie_info          16614 non-null  object 
 2   critics_consensus   8309 non-null   object 
 3   rating              16638 non-null  object 
 4   genre               16621 non-null  object 
 5   directors           16524 non-null  object 
 6   writers             15289 non-null  object 
 7   cast                16354 non-null  object 
 8   in_theaters_date    15823 non-null  object 
 9   on_streaming_date   16636 non-null  object 
 10  runtime_in_minutes  16483 non-null  float64
 11  studio_name         16222 non-null  object 
 12  tomatometer_status  16638 non-null  object 
 13  tomatometer_rating  16638 non-null  int64  
 14  tomatometer_count   16638 non-null  int64  
 15  audience_rating     16386 non-null  float64
dtypes: f

In [85]:
data.describe()

Unnamed: 0,runtime_in_minutes,tomatometer_rating,tomatometer_count,audience_rating
count,16483.0,16638.0,16638.0,16386.0
mean,102.391494,60.466522,56.607104,60.470829
std,25.028011,28.58723,66.3838,20.462368
min,1.0,0.0,5.0,0.0
25%,90.0,38.0,12.0,45.0
50%,99.0,66.0,28.0,62.0
75%,111.0,86.0,76.0,77.0
max,2000.0,100.0,497.0,100.0


In [86]:
data.isnull().sum()

movie_title              0
movie_info              24
critics_consensus     8329
rating                   0
genre                   17
directors              114
writers               1349
cast                   284
in_theaters_date       815
on_streaming_date        2
runtime_in_minutes     155
studio_name            416
tomatometer_status       0
tomatometer_rating       0
tomatometer_count        0
audience_rating        252
dtype: int64

In [87]:
data=data.dropna() # drop rows with missing values

In [88]:
datetime_cols = data.select_dtypes(include=['datetime64']).columns
for col in datetime_cols:
    data[col] = data[col].astype(np.int64)

##### Missing value percentage of given dataset

In [89]:
missing_data = data.isnull().sum()*100/len(data)

missing_data = missing_data[missing_data > 0]

print("Missing Data: \n", missing_data)

Missing Data: 
 Series([], dtype: float64)


* If we observe the above output, we can see that the missing value percentage in **critics_consensus** feature is very high. So,  we can drop  that feature.
* Some features the missing value percentage is very low so we can impute those missing values with most frequent value.
* In numerical feature, we can impute those missing value with median value.

In [90]:
data=data.drop("critics_consensus",axis=1)

data.head()

Unnamed: 0,movie_title,movie_info,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,tomatometer_status,tomatometer_rating,tomatometer_count,audience_rating
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",12-02-2010,29-06-2010,83.0,20th Century Fox,Rotten,49,144,53.0
1,Please Give,Kate has a lot on her mind. There's the ethics...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",30-04-2010,19-10-2010,90.0,Sony Pictures Classics,Certified Fresh,86,140,64.0
3,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",13-04-1957,06-03-2001,95.0,Criterion Collection,Certified Fresh,100,51,97.0
4,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",01-01-1954,20-05-2003,127.0,Disney,Fresh,89,27,74.0
5,"10,000 B.C.",A young outcast from a primitive tribe is forc...,PG-13,"Action & Adventure, Classics, Drama",Roland Emmerich,"Roland Emmerich, Harald Kloser","Steven Strait, Camilla Belle, Cliff Curtis, Jo...",07-03-2008,24-06-2008,109.0,Warner Bros. Pictures,Rotten,8,148,37.0


##### Categorical feature

In [91]:
# Separate the categorical featured from the dataset

categorical_features = data.select_dtypes(include=['object'])

categorical_features.columns

Index(['movie_title', 'movie_info', 'rating', 'genre', 'directors', 'writers',
       'cast', 'in_theaters_date', 'on_streaming_date', 'studio_name',
       'tomatometer_status'],
      dtype='object')

##### Numerical Feature

In [92]:
# Separate the numerical featured from the dataset

numerical_features = data.select_dtypes(include=['int64', 'float64'])

numerical_features.columns

Index(['runtime_in_minutes', 'tomatometer_rating', 'tomatometer_count',
       'audience_rating'],
      dtype='object')

##### Creating a Pipeline

In [93]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn import set_config
set_config(display='diagram')

* ##### Numerical Processing Pipeline

In [94]:
# numeric_processor = Pipeline(
#     steps=[("imputation_median",SimpleImputer(missing_values=np.nan, strategy='median')),
#            ("scaler", StandardScaler())]
#     )

In [95]:
# numeric_processor

* ##### Categorical Processing Pipeline


In [96]:
# categorical_processor = Pipeline(
#     steps=[("imputation_mode",SimpleImputer(fill_value='missing', strategy='most_frequent')),
#               ("onehot", OneHotEncoder(handle_unknown='ignore'))]
#      )

In [97]:
# categorical_processor

##### Combine Processing Techniques

In [98]:
# preprocessor=ColumnTransformer(
#     [('numerical', numeric_processor, numerical_features.columns),
#      ('categorical', categorical_processor, categorical_features.columns),
#      ]
# )

In [99]:
# preprocessor

In [100]:
label_encoders = {}
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

##### Split the data X and y

In [101]:
X = data.drop('audience_rating', axis=1) # input data (features) 

y=data['audience_rating'] # target data (labels)

##### Split the data into train and test sets 


In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

##### Using the Linear Regression Algorithm with Pipeline

In [105]:
# pipe=make_pipeline(preprocessor, LinearRegression())

In [106]:
# pipe

In [107]:
# pipe.fit(X_train, y_train)

In [108]:
model=LinearRegression()
model.fit(X_train, y_train)

In [109]:
y_predict=model.predict(X_test)

In [110]:
r2=model.score(X_test, y_test)

In [111]:
r2

0.5344745732926015