## Bagging using SVM for IMDB movie ratings

The dataset is obtained from https://www.kaggle.com/karrrimba/movie-metadatacsv/home

In [1]:
# Importing libraries 
from random import seed
from random import randrange
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statistics as stat
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
# Load a CSV file
df = pd.read_csv("movie_metadata.csv")
print(df.shape)

(5043, 28)


In [3]:
print(len(df[round(df['imdb_score'])==1.0]))
print(len(df[round(df['imdb_score'])==2.0]))
print(len(df[round(df['imdb_score'])==3.0]))
print(len(df[round(df['imdb_score'])==9.0]))

0
20
70
48


In [4]:
df=df[round(df['imdb_score'])!=1.0]
df=df[round(df['imdb_score'])!=2.0]
df=df[round(df['imdb_score'])!=3.0]
df=df[round(df['imdb_score'])!=9.0]

In [5]:
df.shape

(4905, 28)

In [6]:
df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [7]:
df = df.drop(['movie_title','actor_3_name','content_rating','director_name','genres','language','country','title_year','actor_2_name','actor_1_name','plot_keywords','movie_imdb_link'], axis = 1)
df.head()

Unnamed: 0,color,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,936.0,7.9,1.78,33000
1,Color,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,5000.0,7.1,2.35,0
2,Color,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,393.0,6.8,2.35,85000
3,Color,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,23000.0,8.5,2.35,164000
4,,,,131.0,,131.0,,8,143,0.0,,,12.0,7.1,,0


In [8]:
df =df.dropna()
print(df.shape)

(3725, 16)


In [9]:
columnsToEncode = list(df.select_dtypes(include=['category','object']))
le = LabelEncoder()
for feature in columnsToEncode:
    try:
        df[feature] = le.fit_transform(df[feature])
    except:
        print('Error encoding ' + feature)
df.head()

Unnamed: 0,color,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,1,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,936.0,7.9,1.78,33000
1,1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,5000.0,7.1,2.35,0
2,1,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,393.0,6.8,2.35,85000
3,1,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,23000.0,8.5,2.35,164000
5,1,462.0,132.0,475.0,530.0,640.0,73058679.0,212204,1873,1.0,738.0,263700000.0,632.0,6.6,2.35,24000


In [10]:
X=df
y=round(X['imdb_score'])
#y.apply(np.round)
X = X.drop(['imdb_score'], axis = 1)
scaler=StandardScaler()
X = scaler.fit_transform(X)
y = np.array(y).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state =1)

In [11]:
clf = svm.SVC()
clf.fit(X_train, y_train)  
clf.predict(X_test)


prediction = clf.predict(X_test)
print(accuracy_score(y_test,prediction))

0.5


In [20]:
# Create a random subsample from the dataset with replacement
def subsample(X_train,y_train,ratio):
    X_sample = list()
    y_sample = list()
    n_sample = round(len(X_train) * ratio)

    while len(X_sample) < n_sample:
        index = randrange(len(X_train))
        X_sample.append(X_train[index])
        y_sample.append(y_train[index])
    X_sample_np = np.asarray(X_sample)
    y_sample_np = np.asarray(y_sample) 
    return (X_sample_np,y_sample_np)
 
# Make a prediction with a list of bagged trees
def bagging_predict_soft_voting(models, row):
    #Lets find out what each model predicts 
    pred = list()
    for i in range(len(models)):
        pred.append(models[i].predict_proba(row.reshape(1,-1)))
    finalprob=np.zeros((5,))
    for i in range(len(pred)):
        finalprob=finalprob+pred[i][0]
    final_class = finalprob.argmax(axis=-1) 
    return final_class
 
# Bootstrap Aggregation Algorithm
def bagging(X_train,y_train,X_test,sample_size,n_estimators):
    models = list()
    for i in range(n_estimators):
        X_sample_np,y_sample_np = subsample(X_train,y_train,sample_size)
        model = svm.SVC(probability=True)
        model.fit(X_sample_np, y_sample_np)  
        models.append(model)
    predictions = [bagging_predict_soft_voting(models, row) for row in X_test]
    return(predictions)

In [21]:
predictions=bagging(X_train,y_train,X_test,0.7,1)

total=np.sum([y_test[i]==(predictions[i]+4) for i in range(len(predictions))])

print("Accuracy:",total,"/",len(predictions),"* 100 =","{0:.3f}".format(total/len(predictions)*100),"%")

Accuracy: 559 / 1118 * 100 = 50.000 %


In [22]:
predictions=bagging(X_train,y_train,X_test,0.7,2)

total=np.sum([y_test[i]==(predictions[i]+4) for i in range(len(predictions))])

print("Accuracy:",total,"/",len(predictions),"* 100 =","{0:.3f}".format(total/len(predictions)*100),"%")

Accuracy: 552 / 1118 * 100 = 49.374 %


In [23]:
predictions=bagging(X_train,y_train,X_test,0.7,5)

total=np.sum([y_test[i]==(predictions[i]+4) for i in range(len(predictions))])

print("Accuracy:",total,"/",len(predictions),"* 100 =","{0:.3f}".format(total/len(predictions)*100),"%")

Accuracy: 565 / 1118 * 100 = 50.537 %


In [24]:
predictions=bagging(X_train,y_train,X_test,0.7,10)

total=np.sum([y_test[i]==(predictions[i]+4) for i in range(len(predictions))])

print("Accuracy:",total,"/",len(predictions),"* 100 =","{0:.3f}".format(total/len(predictions)*100),"%")

Accuracy: 567 / 1118 * 100 = 50.716 %


In [25]:
predictions=bagging(X_train,y_train,X_test,0.7,15)

total=np.sum([y_test[i]==(predictions[i]+4) for i in range(len(predictions))])

print("Accuracy:",total,"/",len(predictions),"* 100 =","{0:.3f}".format(total/len(predictions)*100),"%")

Accuracy: 560 / 1118 * 100 = 50.089 %


In [19]:
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(svm.SVC(),n_estimators=20,random_state=1)
model.fit(X_train, y_train)
model.score(X_test,y_test)

0.5044722719141324