In [1]:
from tqdm import tqdm
from os import listdir
from os.path import join, exists
import pandas as pd

# input
train_docs = join('data', 'essays', 'train', 'tokenized')
dev_docs = join('data', 'essays', 'dev', 'tokenized')
train_labels = join('data', 'labels', 'train', 'labels.train.csv')
dev_labels = join('data', 'labels', 'dev', 'labels.dev.csv')

# output
train_csv_name = 'train_features.csv'
dev_csv_name = 'dev_features.csv'

def _load_docs(dir_name):
    """Read every ``*.txt`` file in ``dir_name`` into a pandas Series.

    Parameters
    ----------
    dir_name : str
        Directory that is listed (non-recursively) for ``.txt`` files.

    Returns
    -------
    pandas.Series
        File contents, indexed by the part of each file name before the
        first ``.``, sorted by that index. The index values are strings, so
        the sort is lexicographic.
    """
    print(dir_name)

    docs = []
    ids = []
    for fn in tqdm(listdir(dir_name)):
        # Match on the extension only: a substring test would also pick up
        # names such as "essay.txt.bak" or "old.txt.csv".
        if fn.endswith('.txt'):
            ids.append(fn.split('.')[0])
            # NOTE(review): no explicit encoding, so the platform default is
            # used when decoding -- confirm it matches the essay files.
            with open(join(dir_name, fn)) as f:
                docs.append(f.read())

    # Return a sorted copy instead of sorting in place.
    return pd.Series(docs, ids).sort_index()
    
    
def load_docs(orig_dir, out_name):
    """Load the documents of ``orig_dir`` as a DataFrame, cached as a CSV.

    If ``orig_dir/out_name`` already exists it is read back and returned.
    Otherwise the documents are loaded with ``_load_docs``, stored in a
    single column named ``'original'``, written to that CSV path and returned.

    Parameters
    ----------
    orig_dir : str
        Directory holding the documents; the cache CSV is stored there too.
    out_name : str
        File name of the cache CSV inside ``orig_dir``.

    Returns
    -------
    pandas.DataFrame
        Frame with the document text in the ``'original'`` column.
    """
    out_path = join(orig_dir, out_name)
    if exists(out_path):
        return pd.read_csv(out_path, encoding='utf-8', low_memory=False, index_col=0)

    docs = _load_docs(orig_dir)
    # The column name must be the literal 'original': that is the key the
    # callers index with. The previous `ORIG` name was never defined.
    data = pd.DataFrame({'original': docs})
    data.to_csv(out_path, encoding='utf-8')
    return data
    
    
# Load (or read back from the CSV cache) the train and dev essays and keep
# only the 'original' text column as a Series.
train_data = load_docs(train_docs, train_csv_name)['original']
dev_data = load_docs(dev_docs, dev_csv_name)['original']

In [2]:
from numpy import array, concatenate

def load_labels(csv_name):
    """Read the ``L1`` labels from a CSV, ordered by ``test_taker_id``.

    Parameters
    ----------
    csv_name : str
        Path to a CSV file with ``test_taker_id`` and ``L1`` columns.

    Returns
    -------
    numpy.ndarray
        Object-dtype array of the ``L1`` values, sorted by ``test_taker_id``.
    """
    df = pd.read_csv(csv_name)
    # Sort a fresh Series instead of sorting a column slice in place.
    labels = df.set_index('test_taker_id')['L1'].sort_index()
    # `pd.Series` is not a dtype; request an object array explicitly.
    return array(labels, dtype=object)


# Label arrays sorted by test_taker_id.
# NOTE(review): these are paired with train_data/dev_data purely by position,
# so both must end up in the same id order -- confirm the id formats agree.
y_train = load_labels(train_labels)
y_dev = load_labels(dev_labels)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, issparse
from sklearn.externals.joblib import Memory
mem = Memory('./mycache')

def tokenize(x):
    """Split ``x`` on whitespace and return the list of tokens."""
    # A named function (not a lambda bound to a name) keeps the vectorizer
    # that references it picklable.
    return x.split()


# Word 1- to 3-gram features over whitespace tokens, with binary term counts.
word_vect = TfidfVectorizer(tokenizer=tokenize, stop_words=None, ngram_range=(1, 3), binary=True)
# Character 4- to 5-gram features restricted to word boundaries ('char_wb').
char_vect = TfidfVectorizer(analyzer='char_wb', ngram_range=(4, 5), binary=True)

# Feature-set names accepted by get_X.
WORD = 'word'
CHAR = 'character'

# NOTE(review): this function fits the module-level vectorizers as a side
# effect. A memoized call that is served from the cache skips the body, so in
# a fresh kernel the vectorizers may still be unfitted when a later
# non-cached 'dev' call tries to transform -- confirm this is acceptable.
@mem.cache
def get_X(data, features, mode):
    """Vectorize ``data`` with each requested feature set and stack the results.

    Parameters
    ----------
    data : iterable of str
        Raw documents handed to the vectorizers.
    features : sequence
        Feature-set names; each must be ``WORD`` or ``CHAR``.
    mode : str
        ``'train'`` fits each vectorizer on ``data`` before transforming;
        any other value only transforms with the current vectorizer state.

    Returns
    -------
    scipy sparse matrix
        The per-feature matrices stacked horizontally, in ``features`` order.

    Raises
    ------
    ValueError
        If ``features`` contains a name other than ``WORD`` or ``CHAR``.
    """
    vectorizers = {WORD: word_vect, CHAR: char_vect}

    Xs = []
    for feat in features:
        # Fail loudly on an unknown name instead of reusing the previous
        # iteration's vectorizer (or hitting an unbound local).
        if feat not in vectorizers:
            raise ValueError('unknown feature set: %r' % (feat,))
        vect = vectorizers[feat]

        if mode == 'train':
            X = vect.fit_transform(data)
        else:
            X = vect.transform(data)

        if not issparse(X):
            X = csr_matrix(X)

        Xs.append(X)

    return hstack(tuple(Xs))


# Fit the vectorizers on the training essays, then reuse them for the dev set.
# The 'train' call must run first: only that mode fits the vectorizers.
X_train = get_X(train_data, [WORD, CHAR], 'train')
X_dev = get_X(dev_data, [WORD, CHAR], 'dev')

________________________________________________________________________________
[Memory] Calling __main__-C%3A-Users-sophia-Desktop-school-CSC 485E-project-NLI_Project_2017-nli-shared-task-2017-__ipython-input__.get_X...
get_X(1        Knowledge helps an inidvidual in making his ca...
3        Traveling around the world and discovering new...
5        In my opinion the best way to travel is in a g...
6        I disagree that in twenty years there will be ...
7        I strongly agree that successful people try ne...
8        On The Premium Art of Risk Taking\n\nSuccess i...
9        IDEAS AND FACTS\n\nI desagree with the stateme...
10       I agree with the statement .\nKnowing facts is...
11       The statement `` in twenty years there will be...
13       Today , most young people are very busy about ...
14       It is an increasingly busy life that we all , ...
15       In my opinion , I agree with that it is mo..., 
['word', 'character'], 'train')


In [None]:
# layer 1
import time
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier

# Layer 1a: a bag of 10 linear SVMs, each trained with bootstrapped features.
# NOTE(review): no random_state is set, so the bags differ between runs.
start = time.time()
svm = LinearSVC()
svm_bagging = BaggingClassifier(base_estimator=svm, n_estimators=10, bootstrap_features=True)
svm_bagging.fit(X_train, y_train)
# Last expression of the cell: elapsed wall-clock seconds for the fit.
time.time()-start

In [None]:
# Persist the fitted SVM bagging ensemble to 'svm_bagging.pkl' in the current
# working directory so it can be reloaded later.
from sklearn.externals import joblib

joblib.dump(svm_bagging, 'svm_bagging.pkl')

In [None]:
# still layer 1 
from sklearn.naive_bayes import BernoulliNB

# Layer 1b: a bag of 10 Bernoulli naive Bayes models with bootstrapped features.
# NOTE(review): no random_state is set, so the bags differ between runs.
start = time.time()
nb = BernoulliNB()
nb_bagging = BaggingClassifier(base_estimator=nb, n_estimators=10, bootstrap_features=True)
nb_bagging.fit(X_train, y_train)
# Last expression of the cell: elapsed wall-clock seconds for the fit.
time.time()-start

In [None]:
def encode(num, n_classes=11):
    """Return a one-hot vector of length ``n_classes`` with position ``num`` set.

    Parameters
    ----------
    num : int
        Index of the entry to set to 1. It must be usable as an array index.
        NOTE(review): callers pass sub-estimator predictions here, which are
        presumably integer-coded classes -- confirm.
    n_classes : int, optional
        Length of the returned vector. Defaults to 11, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Float array of zeros with a single 1 at index ``num``.

    Raises
    ------
    IndexError
        If ``num`` is out of range for ``n_classes``.
    """
    # Function-scope import: the notebook only imports `array` and
    # `concatenate` from numpy, so the bare name `np` is not defined.
    import numpy as np

    arr = np.zeros(n_classes)
    arr[num] = 1
    return arr


def get_X_meta(X, mode='train'):
    """Build layer-2 features from the layer-1 sub-estimator predictions.

    Every fitted sub-estimator of ``svm_bagging`` and ``nb_bagging`` predicts
    a class for each row of ``X``. Each prediction is one-hot encoded with
    ``encode`` and the encodings of one row are concatenated into one vector.

    Parameters
    ----------
    X : sparse matrix or array
        Feature matrix with the same columns the bagging ensembles were
        fitted on.
    mode : str, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    list of numpy.ndarray
        One concatenated one-hot vector per row of ``X``.
    """
    # Row/column fancy indexing needs CSR; hstack may hand back another
    # sparse format.
    X_indexable = X.tocsr() if issparse(X) else X

    # Pair each sub-estimator with the feature columns it was trained on.
    # With bootstrap_features=True every sub-estimator sees its own resampled
    # column set, so feeding it the full X would misalign the features.
    members = []
    for bag in (svm_bagging, nb_bagging):
        members.extend(zip(bag.estimators_, bag.estimators_features_))

    meta = []
    for clf, feats in tqdm(members):
        predictions = clf.predict(X_indexable[:, feats])
        meta.append(predictions)

    # Shape (n_samples, n_estimators): one row of predictions per sample.
    meta = array(meta).transpose()
    X_meta = []

    for row in meta:
        if len(set(row)) != 1:
            print('different predictions!')

        # `concatenate` is the name imported from numpy; `np` is not defined.
        X_meta.append(concatenate([encode(x) for x in row]))

    return X_meta
    
    
# One-hot encoded layer-1 predictions, used as the input of the layer-2 models.
X_train_meta = get_X_meta(X_train, mode='train')
X_dev_meta = get_X_meta(X_dev, mode='dev')

In [38]:
# layer 2
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import Ridge, RidgeClassifier


lda = LinearDiscriminantAnalysis()
svm = LinearSVC()
# Ridge is a regressor; RidgeClassifier is its classification counterpart.
ridge = RidgeClassifier()

# LinearSVC and RidgeClassifier only expose decision_function, so wrap them in
# a calibrator to obtain predict_proba. LDA provides predict_proba natively.
# The estimator is passed positionally because its keyword name differs
# between scikit-learn versions.
layer2_clfs = [lda, CalibratedClassifierCV(svm), CalibratedClassifierCV(ridge)]

probs = []

# For ensemble fusion, use the mean probability rule: collect each model's
# class-probability matrix for the dev set so they can be averaged.
for clf in layer2_clfs:
    clf.fit(X_train_meta, y_train)
    prediction = clf.predict_proba(X_dev_meta)
    probs.append(prediction)
    
    
# Experiment with different voting schemes

<1100x55 sparse matrix of type '<class 'numpy.float64'>'
	with 5500 stored elements in Compressed Sparse Row format>