### Importing the libraries

In [2]:
import pandas as pd
import numpy as np

### Importing the dataset

In [3]:
# Load the raw movie table; the path is resolved relative to the notebook's
# working directory.
DATASET_CSV = 'Movies_RatingData.csv'
dataset = pd.read_csv(DATASET_CSV)

### Arranging the dataset

In [4]:
# Shorten three verbose source column names.
# The result is reassigned instead of using `inplace`: the original passed the
# string 'true' for a boolean flag, which pandas either rejects or only honours
# by accident (any non-empty string is truthy). Reassignment also keeps the
# cell safe to re-run, since renaming already-renamed columns is a no-op.
dataset = dataset.rename(columns={
    'num_critic_for_reviews': 'reviews',
    'num_voted_users': 'votes',
    'cast_total_facebook_likes': 'total_likes',
})

In [5]:
# Keep only the columns used downstream, in a fixed order. The order matters:
# later cells slice by position (the first two columns are skipped and the last
# column is taken as the target).
selected_columns = [
    # identifiers (not used as features)
    'movie_title', 'movie_imdb_link',
    # categorical / text descriptors
    'color', 'director_name',
    'actor_1_name', 'actor_2_name', 'actor_3_name',
    'genres', 'plot_keywords', 'country', 'language',
    # numeric measurements
    'director_facebook_likes',
    'actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes',
    'movie_facebook_likes', 'total_likes',
    'reviews', 'duration', 'gross', 'votes', 'budget',
    'num_user_for_reviews', 'facenumber_in_poster', 'aspect_ratio',
    # release year and rating category
    'title_year', 'content_rating',
    # target
    'imdb_score',
]
dataset = dataset[selected_columns]

### Dividing the dataset

In [6]:
# Features: every column except the two leading identifier columns
# (movie_title, movie_imdb_link) and the trailing target column.
# .copy() gives X its own data: later cells assign into X, and writing into a
# bare slice of `dataset` triggers SettingWithCopyWarning and leaves it unclear
# whether `dataset` is modified as well.
X = dataset.iloc[:, 2:-1].copy()
# Target: the last selected column (imdb_score).
Y = dataset.iloc[:, -1]

## Taking care of missing values

In [7]:
from sklearn.impute import SimpleImputer
# NOTE: the `verbose` argument was dropped on purpose. It was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises TypeError; its
# old default (0) is what the original requested, so behaviour is unchanged.
imputer_freq=SimpleImputer(missing_values=np.nan,strategy='most_frequent',copy=True)
imputer_mean=SimpleImputer(missing_values=np.nan,strategy='mean',copy=True)
# Positions refer to X's column order (dataset columns 2..-1):
#   0-8  -> color, names, genres, plot_keywords, country, language
#   23   -> title_year, 24 -> content_rating
# These are filled with the most frequent value (note that title_year is
# therefore imputed with its mode, not its mean).
freq_positions=[0,1,2,3,4,5,6,7,8,23,24]
X.iloc[:,freq_positions]=imputer_freq.fit_transform(X.iloc[:,freq_positions])
# Positions 9-22 are the numeric measurement columns; fill them with the mean.
X.iloc[:,9:23]=imputer_mean.fit_transform(X.iloc[:,9:23])

###### Replacing the release year with the movie's age (in years), for efficiency

In [8]:
import datetime as current
# Turn the release year into "years since release", measured against the
# calendar year at execution time. The column keeps its 'title_year' name.
# NOTE(review): the result depends on when the notebook is run, and re-running
# this cell alone applies the subtraction a second time.
this_year = current.datetime.now().year
X['title_year'] = this_year - X['title_year']

## Encoding the categorical columns using OneHotEncoder

In [9]:
from sklearn.preprocessing import OneHotEncoder
oh=OneHotEncoder()
categorical_columns=['color','country','language','content_rating']
# Dense 0/1 indicator matrix, one column per (source column, category) pair.
one_hot_matrix=oh.fit_transform(X.loc[:,categorical_columns]).toarray()
# Give every indicator a descriptive, unique string label. Without this the
# frame gets integer labels 0..k-1, which clash with the integer labels other
# encoded frames receive and leave X with mixed int/str column names.
# `categories_` lists the categories per input column, in output-column order.
one_hot_names=[
    '{}_{}'.format(column, category)
    for column, categories in zip(categorical_columns, oh.categories_)
    for category in categories
]
# Reuse X's index so the column-wise concat aligns row by row even if X does
# not carry a default 0..n-1 index.
x_one_hot=pd.DataFrame(one_hot_matrix, columns=one_hot_names, index=X.index)
X=pd.concat([ x_one_hot, X.drop(columns=categorical_columns) ],axis=1)

## Encoding the keywords using Vectorizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv=CountVectorizer()
# Keep only keyword terms that occur in at least ~1% of the rows. The count is
# clamped to 1 because an integer min_df of 0 (which round() yields for frames
# with fewer than 50 rows) is not a valid document count.
tf=TfidfVectorizer(min_df=max(1, round(dataset.shape[0]/100)))
# NOTE(review): both vectorizers use the default tokenizer, so multi-part
# values are split on punctuation (presumably e.g. a hyphenated genre becomes
# two tokens) -- confirm this is intended for these columns.
genre_matrix=cv.fit_transform(X['genres']).toarray()
plot_matrix=tf.fit_transform(X['plot_keywords']).toarray()
# `vocabulary_` maps each term to its column index; sorting terms by that index
# yields labels in output-column order. Prefixed string labels keep the two
# frames from both being labelled 0..n (duplicate column names in X).
genre_names=['genre_' + term for term in sorted(cv.vocabulary_, key=cv.vocabulary_.get)]
plot_names=['keyword_' + term for term in sorted(tf.vocabulary_, key=tf.vocabulary_.get)]
# Reuse X's index so the column-wise concat aligns row by row.
x_genres=pd.DataFrame(genre_matrix, columns=genre_names, index=X.index)
x_plot=pd.DataFrame(plot_matrix, columns=plot_names, index=X.index)
X=pd.concat([ x_genres,x_plot, X.drop(columns=['genres','plot_keywords'])],axis=1)

## Encoding the names using TargetEncoder

In [11]:
from category_encoders.target_encoder import TargetEncoder
# High-cardinality name columns are encoded against the target instead of
# being one-hot encoded.
target_columns = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
]
te = TargetEncoder(smoothing=9, drop_invariant=False, verbose=0)
# NOTE(review): the encoder is fitted on all rows of X together with Y; if a
# train/test split happens later, this may leak target information -- confirm.
x_target = te.fit_transform(X[target_columns], Y)
# The encoded name columns are appended after the remaining features.
remaining_features = X.drop(columns=target_columns)
X = pd.concat([remaining_features, x_target], axis=1)