In [2]:
import numpy as np
import pandas as pd
TRAIN_CSV_NAME = 'train_data.csv'
TEST_CSV_NAME = 'test_data.csv'
TRAIN_IMG_DIRECTORY = 'train_posters'
TEST_IMG_DIRECTORY = 'test_posters'
TEST_SIZE = 200
SEED = 1
np.random.seed(SEED) # Seed our randomness for reproducibilty

#First load and preprocess the TFIDF model.

In [3]:
from bg.build_tfidf import preprocess_nonimage_data
Xtfidf, Xother, y, vectorizer = preprocess_nonimage_data(TRAIN_CSV_NAME)
# Choose indices for our test data
testIdx = np.random.choice(y.shape[0], TEST_SIZE, replace=False)
X = np.zeros([Xtfidf.shape[0], Xtfidf.shape[1] + 2])
X[:,0:2], X[:, 2:]  = Xother[:, 2:], Xtfidf

X_test = X[testIdx, :]
X_train = np.delete(X, testIdx, axis=0)
y_test = y[testIdx]
y_train = np.delete(y, testIdx)



Now let's train an SVM on the TFIDF data vectors.

In [10]:
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
clf = CalibratedClassifierCV(svm.LinearSVC(penalty='l2', dual=True, multi_class='ovr', max_iter=100000, class_weight='balanced'),
                                        cv=5)
y_predTF = cross_val_predict(clf, X_train, y_train, cv=5)
conf_mat = confusion_matrix(y_train, y_predTF)
print(conf_mat)
scoresTF = cross_val_score(clf, X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f) (95 percent confidence)" % (scores.mean(), scores.std() * 2))
clf.fit(X_train, y_train)

[[282  77 155  98]
 [ 64 250 108 135]
 [108 122 248 281]
 [ 41 105  54 766]]


Accuracy: 0.52 (+/- 0.01) (95 percent confidence)


CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=5, method='sigmoid')

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(X_test)
a = confusion_matrix(y_test, y_pred)
print("Classification rate %f" %(a.trace() / a.sum()))

Classification rate 0.585000


#Now load and preprocess the color histogram model.

In [4]:
from bg.util import build_histograms, preprocess_images, extract_texture_features
from sklearn.ensemble import RandomForestClassifier
train_data_csv = pd.read_csv(TRAIN_CSV_NAME).values
genres = np.array(train_data_csv[:,-1]).astype(int)
test_data_csv = pd.read_csv(TEST_CSV_NAME).values
preprocessed_train = preprocess_images(train_data_csv, TRAIN_IMG_DIRECTORY)
histogram_processed_train = build_histograms(preprocessed_train, 12)
# texture_processed_train = np.array(list(map(extract_texture_features, preprocessed_train)))

bgX_test = histogram_processed_train[testIdx]
bgX_train = np.delete(histogram_processed_train, testIdx, axis=0)

# bgX2_test = texture_processed_train[testIdx]
# bgX2_train = np.delete(texture_processed_train, testIdx, axis=0)

bgy_test = genres[testIdx]
bgy_train = np.delete(genres, testIdx)



And now fit/test it

In [9]:

from sklearn.model_selection import cross_val_score
bg_model = RandomForestClassifier(n_estimators=400, random_state=SEED)
y_pred = cross_val_predict(bg_model, bgX_train, bgy_train, cv=5)
conf_mat = confusion_matrix(bgy_train, y_pred)
print(conf_mat)
scores = cross_val_score(bg_model, bgX_train, bgy_train)
print("Accuracy: %0.2f (+/- %0.2f) (95 percent confidence)" % (scores.mean(), scores.std() * 2))
bg_model.fit(bgX_train, bgy_train)
print(bg_model.score(bgX_test, bgy_test))

[[267  54 146 145]
 [ 39 353  47 118]
 [125  77 273 284]
 [ 82 113 154 617]]


0.47


# Finally let's try an ensemble model

In [None]:
from bg.util import trainModelStack, predictModelStack
from sklearn.linear_model import LogisticRegressionCV
combiner = LogisticRegressionCV()  # Standard Logistic classifier. Worth trying some other things.
models = [clf, bg_model]
datas = [X_train, bgX_train]
trainModelStack(models, combiner, datas, y_train)
predictions = predictModelStack(models, combiner, [X_test, bgX_test])
cm = confusion_matrix(predictions, y_pred)
print("Classification rate %f" %(cm.trace() / cm.sum()))

Classification rate 0.650000


# Now let's make a Kaggle submission

In [None]:
# del preprocess_nonimage_data
from bg.build_tfidf import preprocess_nonimage_data
tfTest, otherTest, _, _ = preprocess_nonimage_data(TEST_CSV_NAME, True, csvSize=343, vectorizer=vectorizer)
print(tfTest.shape)
testXTf = np.zeros([tfTest.shape[0], tfTest.shape[1] + 2])
testXTf[:,0:2], testXTf[:, 2:]  = otherTest[:, 2:], tfTest
preprocessed_test = preprocess_images(test_data_csv, TEST_IMG_DIRECTORY)
histogram_processed_test = build_histograms(preprocessed_test, 12)
texture_processed_test = np.array(list(map(extract_texture_features, preprocessed_test)))

print(tfTest.shape, X_train.shape)
print(histogram_processed_test.shape)
print(testXTf.shape)
# test_predictions_tfidf = clf.predict(testXTf)
test_predictions_hist = bg_model.predict(histogram_processed_test)
test_predictions = predictModelStack(models, combiner, [testXTf, histogram_processed_test, texture_processed_test])
print(test_predictions.shape)

(343, 3841)


  .format(dtypeobj_in, dtypeobj_out))


(343, 3841) (3092, 3843)
(343, 1728)
(343, 3843)
(343,)


In [None]:
from bg.util import predictions_to_csv
predictions_to_csv(test_predictions, "stacked7.csv") # Not working for some reason so I'm doing it manually below
labels = ['Id', 'Category']
df = pd.DataFrame.from_records(enumerate(test_predictions), columns=labels)
df.to_csv('stacked7.csv', mode='w', index=False)