## Preprocessing
Text already converted to TFIDF

In [1]:
%matplotlib notebook
import numpy as np
import scipy
import sklearn
import matplotlib.pyplot as plt

In [2]:
with np.load('dataMau.npz') as loaded:
    # action/adv = r | rom/com = b | drama/crime = 'g' | bio/doc = 'y'
    colors = loaded['colors']
    y = loaded['y']
    # tfidf vectors on title
    tfidf = loaded['Xtfidf']
    # imdbid, year, runtime, score
    extra = loaded['otherData']

In [3]:
# Put runtime, score, and title TFIDF into data MTX.
X = np.zeros([tfidf.shape[0], tfidf.shape[1] + 2])
X[:,0:2] = extra[:, 2:]
X[:, 2:] = tfidf

In [4]:
# generate random partition of data
np.random.seed(1)
testIdx = np.random.choice(y.shape[0], 200, replace=False)

## Train an SVM on raw vectorized data

In [5]:
y_test = y[testIdx]
y_train = np.delete(y, testIdx)
X_test = X[testIdx, :]
X_train = np.delete(X, testIdx, axis=0)

In [6]:
# train an SVM classifier
from sklearn import svm
clf = svm.LinearSVC(penalty='l2', dual=True, multi_class='ovr', max_iter=100000)
clf.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [8]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

y_pred = clf.predict(X_test)
a = confusion_matrix(y_test, y_pred)
print("Misflassification error %f" %((a.sum() - a.trace()) / a.sum()))

Misflassification error 0.405000


## Train SVM on standardized data

In [9]:
# Next, we standardize our data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
stdX = scaler.fit_transform(X)

stdX_test = stdX[testIdx, :]
stdX_train = np.delete(stdX, testIdx, axis=0)

In [16]:
stdClf = svm.LinearSVC(penalty='l2', dual=False, multi_class='ovr', max_iter=10000)
stdClf.fit(stdX_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [17]:
y_stdpred = stdClf.predict(stdX_test)
a = confusion_matrix(y_test, y_stdpred)
print("Misflassification error %f" %((a.sum() - a.trace()) / a.sum()))

Misflassification error 0.475000


## Feature Selection

In [21]:
# do a little feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif 
rX = SelectKBest(mutual_info_classif, k=1000).fit_transform(stdX, y)
rX_test = rX[testIdx, :]
rX_train = np.delete(rX, testIdx, axis=0)

In [22]:
otherCLF = svm.LinearSVC(penalty='l2', dual=False, fit_intercept=False, multi_class='ovr', max_iter=10000)
otherCLF.fit(rX_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=False,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [23]:
y_rpred = otherCLF.predict(rX_test)
a2 = confusion_matrix(y_test, y_rpred)
print("Misflassification error %f" %((a2.sum() - a2.trace()) / a2.sum()))

Misflassification error 0.545000


In [24]:
from concurrent.futures import ThreadPoolExecutor as PoolExecutor