In [1]:
import pickle
import pandas as pd
from scripts.Embedder import Embedder
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gradlab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the pickled first-stage classifiers. Later cells index this object by
# model name: clf1['SVM'], clf1['RF'], clf1['KNN'], each exposing .predict().
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artefacts produced by a trusted training run.
with open('first_dig_tfidf_clfs.pkl', 'rb') as f:
    clf1 = pickle.load(f)
    
# Load the second-stage classifiers. Later cells index this object first by the
# predicted first digit and then by model name, e.g. clf2[p_1]['SVM'].
with open('second_third_fourth_dig_tfidf_clfs.pkl', 'rb') as f2:
    clf2 = pickle.load(f2)

In [3]:
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed. Later cells read the 'input' (text) and 'code' (label) columns
# from these frames.
tfidf_test_set = pd.read_csv('./Data/tfidf_test_set.csv')
tfidf_train_set = pd.read_csv('./Data/tfidf_train_set.csv')

In [4]:
# Trial identifier handed to the Embedder as `d2v_trial_name`.
TRIAL_NAME = 'trial_11'

# Settings handed to the Embedder as `d2v_params`.
doc2vec_params = {
    'epochs': 6144,        # training cycles
    'vec_size': 64,        # specific to doc2vec, size of the output vector
    'alpha': 0.001,        # learning rate
    'window': 3,
    'min_count': 2,
    'min_alpha': 0.00025,
}

# Build the embedder on the training frame; the text is read from its 'input' column.
# NOTE(review): how `infer_params` is consumed is defined in scripts/Embedder - confirm there.
embedder = Embedder(
    d2v_trial_name = TRIAL_NAME,
    d2v_params = doc2vec_params,
    train_data = tfidf_train_set,
    corpus_column = 'input',
    infer_params = dict(steps = 2048, alpha = 0.03),
)

Defaulted doc2vec param: dm=1


In [5]:
# Train the TF-IDF model held by the embedder. The cell output below reports the
# training vector shape, and the returned sparse matrix is echoed as the cell result.
# NOTE(review): presumably fits on the `train_data` given to the constructor - confirm in scripts/Embedder.
embedder.train_tfidf()

TF-IDF training vector shape (37745, 2633)


<37745x2633 sparse matrix of type '<class 'numpy.float64'>'
	with 108442 stored elements in Compressed Sparse Row format>

In [6]:
# Embed the test-set text ('input' column) with the embedder's TF-IDF model.
# NOTE(review): the method is invoked through the class with the instance passed
# explicitly; if it is a plain instance method this is the same as
# embedder.get_tfidf_embeddings(...) - confirm against scripts/Embedder before changing.
tfidf_test_vectors = Embedder.get_tfidf_embeddings(embedder, tfidf_test_set['input'])

In [7]:
from sklearn.metrics import accuracy_score, f1_score
# One prediction column per first-stage classifier (svm_pred, rf_pred, knn_pred),
# followed by the ground-truth code rendered as a string.
tfidf_test_df = pd.DataFrame({
    **{
        '{}_pred'.format(model_name.lower()): clf1[model_name].predict(tfidf_test_vectors)
        for model_name in ('SVM', 'RF', 'KNN')
    },
    'code': tfidf_test_set['code'].astype(str),
})

In [8]:
# Store the ground-truth code as a string column.
# NOTE(review): the frame is already built with this exact 'code' column in the
# previous cell, so this assignment looks redundant - confirm before removing.
tfidf_test_df['code']=tfidf_test_set['code'].astype(str)

In [9]:
from collections import Counter
def ensemble_predict(row, predictor_cols, default_predictor):
    """Combine several classifiers' predictions for one row by majority vote.

    Parameters
    ----------
    row : pandas.Series
        A single row, e.g. as supplied by ``DataFrame.apply(..., axis=1)``.
    predictor_cols : list of str
        Labels of the entries in ``row`` holding each classifier's prediction.
    default_predictor : str
        Label of the entry whose prediction is used when the vote is not decisive.

    Returns
    -------
    The class predicted by a strict plurality of ``predictor_cols`` when at
    least two predictors agree on it; otherwise ``row[default_predictor]``.
    """
    # Fetch the two most frequent predictions: the runner-up is needed to tell
    # a real winner from a tie for first place (e.g. a 2-2 split of four votes).
    top_votes = Counter(row[predictor_cols]).most_common(2)

    # No predictors supplied: nothing to vote on, defer to the default.
    if not top_votes:
        return row[default_predictor]

    winning_class, highest_num_votes = top_votes[0]
    tied_for_first = len(top_votes) > 1 and top_votes[1][1] == highest_num_votes

    # Fall back to the default predictor when nobody agrees (every prediction
    # is unique) or when the leading classes are tied.
    if highest_num_votes < 2 or tied_for_first:
        return row[default_predictor]
    return winning_class

In [10]:
# Preview the first ten rows: the three first-stage predictions next to the true code.
tfidf_test_df.head(10)

Unnamed: 0,svm_pred,rf_pred,knn_pred,code
0,1,1,1,1221
1,0,0,1,423
2,2,9,7,7291
3,0,0,0,821
4,4,4,4,4032
5,4,4,4,4166
6,6,9,6,8222
7,3,0,3,311
8,7,9,7,1522
9,4,9,1,412


In [11]:
# Row-wise ensemble vote over the first-stage predictions; 'svm_pred' is the
# fallback column handed to ensemble_predict.
tfidf_test_df['p_all'] = tfidf_test_df.apply(
    ensemble_predict,
    axis = 1,
    predictor_cols = ['rf_pred','svm_pred','knn_pred'],
    default_predictor = 'svm_pred',
)

In [12]:
# Attach each row's TF-IDF vector to the frame as a plain Python list so that
# pipeline() can rebuild a per-row array from row['vectors'].
# NOTE(review): .toarray() densifies the whole matrix and .tolist() turns every
# value into a Python object - likely memory-heavy for large test sets.
tfidf_test_df['vectors'] = tfidf_test_vectors.toarray().tolist()

In [13]:
def pipeline(row):
    """Run the second-stage classifiers for one row.

    The classifier set is chosen by the row's ensemble first-stage prediction
    (``row['p_all']``), and each of its SVM, RF and KNN models predicts from
    the row's feature vector (``row['vectors']``).

    Parameters
    ----------
    row : pandas.Series
        A row containing at least ``'vectors'`` and ``'p_all'``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` extended with ``'svm_pred_234'``, ``'rf_pred_234'``
        and ``'knn_pred_234'``.

    Raises
    ------
    KeyError
        If ``clf2`` has no entry for ``row['p_all']`` or for a model name.
    """
    # Work on a copy: pandas documents mutating the object passed to a
    # DataFrame.apply UDF as unsupported, so the input row is left untouched.
    result = row.copy()

    # The classifiers expect a 2-D array holding a single sample.
    features = np.array(row['vectors']).reshape(1, -1)
    second_stage_models = clf2[row['p_all']]

    for model_name in ('SVM', 'RF', 'KNN'):
        column = '{}_pred_234'.format(model_name.lower())
        result[column] = second_stage_models[model_name].predict(features)[0]
    return result

In [14]:
import time
sample_size = 100
# Fixed seed so the evaluated subset, and therefore the metrics computed from
# it, are the same on every Restart & Run All.
sample_pipeline_df = tfidf_test_df.sample(sample_size, random_state = 42)

In [15]:
# Time the row-by-row second-stage prediction over the sampled rows.
start_time = time.time()
sample_pipeline_df = sample_pipeline_df.apply(pipeline, axis = 1)
elapsed_seconds = np.round(time.time() - start_time, 2)
print('time to predict on {} samples: {} seconds'.format(sample_size, elapsed_seconds))

time to predict on 100 samples: 23.62 seconds


In [16]:
# Row-wise ensemble vote over the second-stage predictions; 'knn_pred_234' is
# the fallback column handed to ensemble_predict.
sample_pipeline_df['p_all_234'] = sample_pipeline_df.apply(
    ensemble_predict,
    axis = 1,
    predictor_cols = ['svm_pred_234','rf_pred_234','knn_pred_234'],
    default_predictor = 'knn_pred_234',
)

In [17]:
from sklearn.metrics import accuracy_score, f1_score

# Ground-truth codes, converted once and reused for every metric below.
y_true = sample_pipeline_df['code'].astype(int)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and
# per-class F1 are symmetric in their arguments, so the reported numbers are
# unaffected, but the documented order keeps any warnings about undefined
# precision/recall attributed to the correct side.
for classifier in ['knn','svm', 'rf']:
    y_pred = sample_pipeline_df['{}_pred_234'.format(classifier)].astype(int)
    print('{} acc:{}, f1-macro:{}'.format(
        classifier.upper(),
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average = 'macro')
    ))

# Same metrics for the ensemble vote over the three second-stage classifiers.
y_pred_ensemble = sample_pipeline_df['p_all_234'].astype(int)
print('Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(
    accuracy_score(y_true, y_pred_ensemble),
    f1_score(y_true, y_pred_ensemble, average = 'macro')
))

KNN acc:0.64, f1-macro:0.4832798287854468
SVM acc:0.64, f1-macro:0.48161802355350736
RF acc:0.65, f1-macro:0.4881493506493506
Ensemble metrics total. Accuracy:0.67, f1:0.5097402597402598



In [18]:
# Display the sampled rows with first-stage votes, second-stage predictions and
# the final ensemble code.
# NOTE(review): the 'vectors' column holds full feature lists; consider dropping
# it (or using .head()) before display to keep the output readable.
sample_pipeline_df

Unnamed: 0,svm_pred,rf_pred,knn_pred,code,p_all,vectors,svm_pred_234,rf_pred_234,knn_pred_234,p_all_234
7626,2,2,2,2145,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2142,2131,2134,2134
17060,6,6,6,6232,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6232,6232,6232,6232
21816,4,4,4,4153,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4153,4153,4153,4153
17940,2,9,2,2261,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2261,2261,2261,2261
23759,7,7,7,7241,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7241,7241,7241,7241
...,...,...,...,...,...,...,...,...,...,...
27420,3,3,3,3012,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3012,3012,3012,3012
27083,6,9,0,6622,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6622,6421,6421,6421
22414,7,7,7,7312,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7312,7312,7312,7312
21990,4,4,4,4032,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4032,4032,4032,4032
