#### Importing Libraries

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
import warnings
warnings.filterwarnings("ignore")

#### Reading in data

In [2]:
X= pd.read_csv('added_data.csv')

In [3]:
X.head()

Unnamed: 0,movieId,rating,genres,imdbId,directors,cast,writer
0,1,3.92093,Adventure|Animation|Children|Comedy|Fantasy,114709,John Lasseter,Tom Hanks,John Lasseter
1,2,3.431818,Adventure|Children|Fantasy,113497,Joe Johnston,Robin Williams,Jonathan Hensleigh
2,3,3.259615,Comedy|Romance,113228,Howard Deutch,Walter Matthau,Mark Steven Johnson
3,4,2.357143,Comedy|Drama|Romance,114885,Forest Whitaker,Whitney Houston,Terry McMillan
4,5,3.071429,Comedy,113041,Charles Shyer,Steve Martin,Albert Hackett


In [4]:
X.shape

(9447, 7)

#### Get sum of all the movies director/cast/writer directed and total score

In [5]:
c= X['directors'].value_counts()
d= X['cast'].value_counts()
e= X['writer'].value_counts()

f= X.drop(columns= ['movieId', 'genres', 'imdbId', 'cast', 'writer'])
f= f.groupby(['directors']).sum()
g= X.drop(columns= ['movieId', 'genres', 'imdbId', 'directors', 'writer'])
g= g.groupby(['cast']).sum()
h= X.drop(columns= ['movieId', 'genres', 'imdbId', 'cast', 'directors'])
h= h.groupby(['writer']).sum()

#### Calculating Attribute Ranks

In [6]:
directors= []
cast= []
writer= []
    
for i, row in X.iterrows():
    
    row_director= X['directors'].loc[i]
    row_cast= X['cast'].loc[i]
    row_writer= X['writer'].loc[i]
    
    directors.append(f['rating'][row_director]*(c[row_director]))
    cast.append(g['rating'][row_cast]*(d[row_cast]))
    writer.append(h['rating'][row_writer]*(e[row_writer]))

#### Adding coluns to table

In [7]:
directors= pd.DataFrame(directors)
X= pd.concat([X, directors], axis=1) 
X= X.rename(columns={0: "director"})

cast= pd.DataFrame(cast)
X= pd.concat([X, cast], axis=1) 
X= X.rename(columns={0: "casts"})

writer= pd.DataFrame(writer)
X= pd.concat([X, writer], axis=1) 
X= X.rename(columns={0: "writers"})

#### Filtering low frequency instances of data

In [8]:
for i, row in X.iterrows():
    row_director= X['directors'].loc[i]
    row_cast= X['cast'].loc[i]
    row_writer= X['writer'].loc[i]
    if(c[row_director]<=10 and d[row_cast]<=10 and e[row_writer]<=10):
        X= X.drop(i)

In [9]:
X= X.drop(columns= ['directors', 'cast', 'writer', 'movieId', 'imdbId'])

#### Categorical Variables

In [10]:
genres =X['genres'].str.get_dummies()
X = pd.concat([X, genres], axis=1)
del X['genres']

In [11]:
train, test = train_test_split(X, test_size=0.3)

In [12]:
train_y= train[['rating']]
train_x= train.drop(columns= ['rating'])
test_y= test[['rating']]
test_x= test.drop(columns= ['rating'])

#### Cross Validation to find best parameters

In [13]:
param_grid = {
              "max_depth": [1,2,3,4,5,6,7,8,9,10]
              }
dt = DecisionTreeRegressor()
dt.fit(train_x,train_y)
grid_search = GridSearchCV(dt, param_grid, cv=5)
grid_search.fit(train_x, train_y)
best_no_leaves= grid_search.best_params_

In [14]:
dt = DecisionTreeRegressor(max_depth= 15)
dt= dt.fit(train_x,train_y)

#### Evaluating scores

In [15]:
y_pred= dt.predict(train_x)
print("Mean Squared Error: ", mean_squared_error(train_y, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(train_y, y_pred))
print("R2 Score: ", r2_score(train_y, y_pred))

Mean Squared Error:  0.16681206731376366
Mean Absolute Error:  0.20366557588378895
R2 Score:  0.7502022971640457


In [16]:
y_pred= dt.predict(test_x)
print("Mean Squared Error: ", mean_squared_error(test_y, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(test_y, y_pred))
print("R2 Score: ", r2_score(test_y, y_pred))

Mean Squared Error:  0.3233722446724285
Mean Absolute Error:  0.3050904606437739
R2 Score:  0.50854334442355
