# SVM - classification

SHUMBUL ARIFA \
181CO152

## Task
Performing Kernel SVM on a classification dataset.
1. Linear Kernel
2. Polynomial Kernel
3. Radial Basis Function (RBF) kernel

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt

In [2]:
df = pd.read_csv("Movie_classification.csv")
df.head()

Unnamed: 0,Marketing expense,Production expense,Multiplex coverage,Budget,Movie_length,Lead_ Actor_Rating,Lead_Actress_rating,Director_rating,Producer_rating,Critic_rating,Trailer_views,3D_available,Time_taken,Twitter_hastags,Genre,Avg_age_actors,Num_multiplex,Collection,Start_Tech_Oscar
0,20.1264,59.62,0.462,36524.125,138.7,7.825,8.095,7.91,7.995,7.94,527367,YES,109.6,223.84,Thriller,23,494,48000,1
1,20.5462,69.14,0.531,35668.655,152.4,7.505,7.65,7.44,7.47,7.44,494055,NO,146.64,243.456,Drama,42,462,43200,0
2,20.5458,69.14,0.531,39912.675,134.6,7.485,7.57,7.495,7.515,7.44,547051,NO,147.88,2022.4,Comedy,38,458,69400,1
3,20.6474,59.36,0.542,38873.89,119.3,6.895,7.035,6.92,7.02,8.26,516279,YES,185.36,225.344,Drama,45,472,66800,1
4,21.381,59.36,0.542,39701.585,127.7,6.92,7.07,6.815,7.07,8.26,531448,NO,176.48,225.792,Drama,55,395,72400,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Marketing expense    506 non-null    float64
 1   Production expense   506 non-null    float64
 2   Multiplex coverage   506 non-null    float64
 3   Budget               506 non-null    float64
 4   Movie_length         506 non-null    float64
 5   Lead_ Actor_Rating   506 non-null    float64
 6   Lead_Actress_rating  506 non-null    float64
 7   Director_rating      506 non-null    float64
 8   Producer_rating      506 non-null    float64
 9   Critic_rating        506 non-null    float64
 10  Trailer_views        506 non-null    int64  
 11  3D_available         506 non-null    object 
 12  Time_taken           494 non-null    float64
 13  Twitter_hastags      506 non-null    float64
 14  Genre                506 non-null    object 
 15  Avg_age_actors       506 non-null    int

# Data Cleaning and preprocessing
1. time_taken has some missing values
2. 3D_available and Genre -> object type (string)

In [4]:
mean = df['Time_taken'].mean()
mean

157.39149797570855

In [5]:
df['Time_taken'].fillna(value = mean, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Marketing expense    506 non-null    float64
 1   Production expense   506 non-null    float64
 2   Multiplex coverage   506 non-null    float64
 3   Budget               506 non-null    float64
 4   Movie_length         506 non-null    float64
 5   Lead_ Actor_Rating   506 non-null    float64
 6   Lead_Actress_rating  506 non-null    float64
 7   Director_rating      506 non-null    float64
 8   Producer_rating      506 non-null    float64
 9   Critic_rating        506 non-null    float64
 10  Trailer_views        506 non-null    int64  
 11  3D_available         506 non-null    object 
 12  Time_taken           506 non-null    float64
 13  Twitter_hastags      506 non-null    float64
 14  Genre                506 non-null    object 
 15  Avg_age_actors       506 non-null    int

In [6]:
## 3D-available and Genre
# ### Using dummy variable creation

# df = pd.get_dummies(df, columns = ["3D_available", "Genre"])
# df.info()
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,3D_available,Genre
0,YES,Thriller
1,NO,Drama
2,NO,Comedy
3,YES,Drama
4,NO,Drama


In [7]:
## if any null value is present in those rows
obj_df[obj_df.isnull().any(axis=1)]

Unnamed: 0,3D_available,Genre


In [8]:
# ## if it was present in column "c"
# obj_df["c"].value_counts()
# obj_df = obj_df.fillna({"c": "NEW_NAME"})

In [9]:
## replace
cleanup_nums = {"3D_available":     {"YES": 1, "NO": 0},
                "Genre": {"Thriller": 0, "Drama": 1, "Comedy": 2, "Action": 3}}

In [10]:
## replace only once!
df = df.replace(cleanup_nums)
df.info()

## done

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Marketing expense    506 non-null    float64
 1   Production expense   506 non-null    float64
 2   Multiplex coverage   506 non-null    float64
 3   Budget               506 non-null    float64
 4   Movie_length         506 non-null    float64
 5   Lead_ Actor_Rating   506 non-null    float64
 6   Lead_Actress_rating  506 non-null    float64
 7   Director_rating      506 non-null    float64
 8   Producer_rating      506 non-null    float64
 9   Critic_rating        506 non-null    float64
 10  Trailer_views        506 non-null    int64  
 11  3D_available         506 non-null    int64  
 12  Time_taken           506 non-null    float64
 13  Twitter_hastags      506 non-null    float64
 14  Genre                506 non-null    int64  
 15  Avg_age_actors       506 non-null    int

# X_y split

In [11]:
X = df.loc[:,df.columns!="Start_Tech_Oscar"]
# All cols except collection

type(X)

pandas.core.frame.DataFrame

In [12]:
X.head()

Unnamed: 0,Marketing expense,Production expense,Multiplex coverage,Budget,Movie_length,Lead_ Actor_Rating,Lead_Actress_rating,Director_rating,Producer_rating,Critic_rating,Trailer_views,3D_available,Time_taken,Twitter_hastags,Genre,Avg_age_actors,Num_multiplex,Collection
0,20.1264,59.62,0.462,36524.125,138.7,7.825,8.095,7.91,7.995,7.94,527367,1,109.6,223.84,0,23,494,48000
1,20.5462,69.14,0.531,35668.655,152.4,7.505,7.65,7.44,7.47,7.44,494055,0,146.64,243.456,1,42,462,43200
2,20.5458,69.14,0.531,39912.675,134.6,7.485,7.57,7.495,7.515,7.44,547051,0,147.88,2022.4,2,38,458,69400
3,20.6474,59.36,0.542,38873.89,119.3,6.895,7.035,6.92,7.02,8.26,516279,1,185.36,225.344,1,45,472,66800
4,21.381,59.36,0.542,39701.585,127.7,6.92,7.07,6.815,7.07,8.26,531448,0,176.48,225.792,1,55,395,72400


In [13]:
X.shape

(506, 18)

In [14]:
y = df["Start_Tech_Oscar"]
type(y)

pandas.core.series.Series

In [15]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: Start_Tech_Oscar, dtype: int64

In [16]:
y.shape

(506,)

# Test-Train Split

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [19]:
X_train.head()

Unnamed: 0,Marketing expense,Production expense,Multiplex coverage,Budget,Movie_length,Lead_ Actor_Rating,Lead_Actress_rating,Director_rating,Producer_rating,Critic_rating,Trailer_views,3D_available,Time_taken,Twitter_hastags,Genre,Avg_age_actors,Num_multiplex,Collection
220,27.1618,67.4,0.493,38612.805,162.0,8.485,8.64,8.485,8.67,8.52,480270,0,174.68,224.272,0,23,536,53400
71,23.1752,76.62,0.587,33113.355,91.0,7.28,7.4,7.29,7.455,8.16,491978,0,200.68,263.472,3,46,400,43400
240,22.2658,64.86,0.572,38312.835,127.8,6.755,6.935,6.8,6.84,8.68,470107,1,204.8,224.32,2,24,387,54000
6,21.7658,70.74,0.476,33396.66,140.1,7.065,7.265,7.15,7.4,8.96,459241,1,139.16,243.664,0,41,522,45800
417,538.812,91.2,0.321,29463.72,162.6,9.135,9.305,9.095,9.165,6.96,302776,1,172.16,301.664,3,60,589,20800


In [20]:
y_test.head()

329    0
371    1
219    0
403    0
78     0
Name: Start_Tech_Oscar, dtype: int64

In [21]:
X_train.shape

(404, 18)

# Standardizing Data

- Coverting mean and variance close to 0 and 1, for each variable.
- SVM only gives correct result when we standardize our data!
- Ways: StandardScaler, MinMax scaler

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [24]:
X_test_std

# here, we only need to std our X data, not y

array([[-4.08358690e-01, -1.12872913e+00,  8.33368826e-01, ...,
         1.12308956e+00, -8.87385815e-01,  1.15409837e-03],
       [ 7.19251107e-01,  9.98884403e-01, -6.52839787e-01, ...,
        -1.15123717e+00,  6.08961586e-01,  2.97217905e+00],
       [-4.02574884e-01,  3.96108293e-01,  5.11537670e-02, ...,
        -1.47614099e+00,  1.51479578e-01,  4.45267254e-02],
       ...,
       [-3.98260097e-01, -8.58124181e-01,  8.94207776e-01, ...,
        -7.45107395e-01, -1.01128719e+00, -4.21729015e-01],
       [-3.99342792e-01, -7.63765430e-02,  5.81321752e-01, ...,
        -2.93820817e+00, -9.92225442e-01,  5.97527720e-01],
       [-4.00880712e-01, -3.67026306e-01,  3.11892120e-01, ...,
         7.16959787e-01, -4.10842057e-01, -3.02454291e-01]])

All decimals, scales of values changed -> uniform scale

Now, we can perform SVM

# Performing SVM Classification

## Linear Kernel

In [25]:
from sklearn.svm import SVC ## C - classification
svc = SVC(kernel='linear', C=0.01)

In [26]:
svc.fit(X_train_std, y_train)

SVC(C=0.01, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Predict values using trained model

In [27]:
y_test_pred = svc.predict(X_test_std)
y_train_pred = svc.predict(X_train_std)

In [28]:
y_test_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## Model Performance

In [29]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [30]:
svc.score(X_test, y_test)

0.43137254901960786

In [31]:
confusion_matrix(y_test, y_test_pred)

array([[ 7, 37],
       [ 1, 57]])

In [32]:
accuracy_score(y_test, y_test_pred)

0.6274509803921569

In [33]:
svc.n_support_

## What's this??

### number of support vectors 
### for 1st class (0) and 2nd class (1) RESPECTIVELY

array([186, 191], dtype=int32)

## Grid Search

To choose the ost accurate parameter C

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
params = {'C':(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 150, 500, 1000)}

In [36]:
svc_op = SVC(kernel='linear')

In [37]:
svm_grid_lin = GridSearchCV(svc_op, params, n_jobs=-1,
                           cv=10, verbose=1, scoring='accuracy')

In [38]:
svm_grid_lin.fit(X_train_std, y_train)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.3min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': (0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50,
                               100, 150, 500, 1000)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [39]:
svm_grid_lin.best_params_

## What's this??

### get the best parameter!!!

{'C': 0.5}

In [40]:
linsvm_clf = svm_grid_lin.best_estimator_
linsvm_clf

SVC(C=0.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [41]:
accuracy_score(y_test, linsvm_clf.predict(X_test_std))

0.6078431372549019

## Polynomial Kernel

In [51]:
poly_svm = SVC(kernel = 'poly', degree = 0.1, C=0.1)
poly_svm.fit(X_train_std, y_train)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=0.1, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [52]:
y_train_pred = poly_svm.predict(X_train_std)
y_test_pred = poly_svm.predict(X_test_std)

In [53]:
accuracy_score(y_test, y_test_pred)

0.5686274509803921

In [54]:
poly_svm.n_support_

array([186, 186], dtype=int32)

In [55]:
params = {'C':(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 150, 500, 1000)}
svm_grid_poly = GridSearchCV(poly_svm, params, n_jobs=-1,
                           cv=10, verbose=1, scoring='accuracy')

In [56]:
svm_grid_poly.fit(X_train_std, y_train)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    2.7s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=0.1, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=0.1,
                           gamma='scale', kernel='poly', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': (0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50,
                               100, 150, 500, 1000)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [57]:
svm_grid_poly.best_params_

{'C': 0.001}

In [58]:
polysvm_clf = svm_grid_poly.best_estimator_
polysvm_clf

SVC(C=0.001, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=0.1, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [59]:
accuracy_score(y_test, polysvm_clf.predict(X_test_std))

0.5686274509803921

## Radial Kernel

In [62]:
rad_svm = SVC(kernel = 'rbf', gamma = 0.1, C=10)
rad_svm.fit(X_train_std, y_train)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [63]:
y_train_pred = rad_svm.predict(X_train_std)
y_test_pred = rad_svm.predict(X_test_std)

In [64]:
accuracy_score(y_test, y_test_pred)

0.5294117647058824

In [65]:
rad_svm.n_support_

array([150, 171], dtype=int32)

In [67]:
params = {'C':(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50),
         'gamma': (0.001, 0.01, 0.1, 0.5, 1)}
svm_grid_rad = GridSearchCV(rad_svm, params, n_jobs=-1,
                           cv=10, verbose=1, scoring='accuracy')

In [68]:
svm_grid_rad.fit(X_train_std, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    4.5s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=10, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3, gamma=0.1,
                           kernel='rbf', max_iter=-1, probability=False,
                           random_state=None, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': (0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10,
                               50),
                         'gamma': (0.001, 0.01, 0.1, 0.5, 1)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [69]:
svm_grid_rad.best_params_

{'C': 50, 'gamma': 0.001}

In [70]:
radsvm_clf = svm_grid_rad.best_estimator_
radsvm_clf

SVC(C=50, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [72]:
accuracy_score(y_test, radsvm_clf.predict(X_test_std))

0.5980392156862745