In [30]:
import numpy as np
import pandas as pd
TRAIN_CSV_NAME = 'train_data.csv'
TEST_CSV_NAME = 'test_data.csv'
TRAIN_IMG_DIRECTORY = 'train_posters'
TEST_IMG_DIRECTORY = 'test_posters'
SEED = 11
np.random.seed(SEED) # Seed our randomness for reproducibilty

#First load and preprocess the TFIDF model.

In [31]:
from bg.build_tfidf import preprocess_nonimage_data
Xtfidf, Xother, y, vectorizer = preprocess_nonimage_data(TRAIN_CSV_NAME)
# Choose indices for our test data
testIdx = np.random.choice(y.shape[0], 200, replace=False)
X = np.zeros([Xtfidf.shape[0], Xtfidf.shape[1] + 2])
X[:,0:2], X[:, 2:]  = Xother[:, 2:], Xtfidf

X_test = X[testIdx, :]
X_train = np.delete(X, testIdx, axis=0)
y_test = y[testIdx]
y_train = np.delete(y, testIdx)



Now let's train an SVM on the TFIDF data vectors.

In [32]:
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV
clf = CalibratedClassifierCV(svm.LinearSVC(penalty='l2', dual=True, multi_class='ovr', max_iter=100000),
                                        method='sigmoid',
                                        cv=3)
clf.fit(X_train, y_train)

CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=300000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=3, method='sigmoid')

In [33]:
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(X_test)
a = confusion_matrix(y_test, y_pred)
print("Classification rate %f" %(a.trace() / a.sum()))

Classification rate 0.620000


#Now load and preprocess the color histogram model.

In [34]:
from bg.util import build_histograms, preprocess_images
from sklearn.ensemble import RandomForestClassifier
train_data_csv = pd.read_csv(TRAIN_CSV_NAME).values
genres = np.array(train_data_csv[:,-1]).astype(int)
test_data_csv = pd.read_csv(TEST_CSV_NAME).values
preprocessed_train = preprocess_images(train_data_csv, TRAIN_IMG_DIRECTORY)
histogram_processed_train = build_histograms(preprocessed_train, 12)

bgX_test = histogram_processed_train[testIdx]
bgX_train = np.delete(histogram_processed_train, testIdx, axis=0)
bgy_test = genres[testIdx]
bgy_train = np.delete(genres, testIdx)



And now fit/test it

In [35]:
bg_model = RandomForestClassifier(n_estimators=800, random_state=SEED)
bg_model.fit(bgX_train, bgy_train)
print(bg_model.score(bgX_test, bgy_test))
print(bg_model.score(bgX_test, y_test)) # This proves that we are using the same labels

0.495
0.495


# Finally let's try an ensemble model

In [36]:
from bg.util import trainModelStack, predictModelStack
from sklearn.linear_model import LogisticRegression
combiner = LogisticRegression() # Standard Logistic classifier. Worth trying some other things.
models = [bg_model, clf]
datas = [bgX_train, X_train]
trainModelStack(models, combiner, datas, y_train)
predictions = predictModelStack(models, combiner, [bgX_test, X_test])
cm = confusion_matrix(predictions, y_pred)
print("Classification rate %f" %(cm.trace() / cm.sum()))

Classification rate 0.710000


# Now let's make a Kaggle submission

In [37]:
# del preprocess_nonimage_data
from bg.build_tfidf import preprocess_nonimage_data
tfTest, otherTest, _, _ = preprocess_nonimage_data(TEST_CSV_NAME, True, csvSize=343, vectorizer=vectorizer)
print(tfTest.shape)
testXTf = np.zeros([tfTest.shape[0], tfTest.shape[1] + 2])
testXTf[:,0:2], testXTf[:, 2:]  = otherTest[:, 2:], tfTest
preprocessed_test = preprocess_images(test_data_csv, TEST_IMG_DIRECTORY)
histogram_processed_test = build_histograms(preprocessed_test, 12)

print(tfTest.shape, X_train.shape)
print(histogram_processed_test.shape)
print(testXTf.shape)
test_predictions_tfidf = clf.predict(testXTf)
test_predictions_hist = bg_model.predict(histogram_processed_test)
test_predictions = predictModelStack(models, combiner, [histogram_processed_test, testXTf])
print(test_predictions.shape)

(343, 3841)


(343, 3841) (2894, 3843)
(343, 1728)
(343, 3843)


(343,)


In [38]:
from bg.util import predictions_to_csv
predictions_to_csv(test_predictions, "stacked.csv") # Not working for some reason so I'm doing it manually below
labels = ['Id', 'Category']
df = pd.DataFrame.from_records(enumerate(test_predictions), columns=labels)
df.to_csv('stacked.csv', mode='w', index=False)