In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, recall_score, f1_score

from utils import *

# Training

In [2]:
# Cross-validation of a random-forest classifier over the folds in `fold_list`.
# For each fold, the CSV files of all other folds form the training set and the
# current fold is the held-out test set. Features are produced by the project
# helpers `add_padding` and `create_n_gram_df` and then one-hot encoded.
# F1 / precision / recall on the held-out fold are printed per fold.

fold_list = [1,2,3,4,5,6,7,8,9,10]
#input_list = ['article','encyclopedia','news','novel']
input_list = ['article']

DATA_COLUMNS = ['char','type','target']
N_GRAM_SIZE = 11     # value passed to both add_padding and create_n_gram_df
RANDOM_STATE = 42    # fixed seed so the per-fold scores are reproducible


def load_fold_csv(input_folder, number):
    """Read one cleaned fold file and return only DATA_COLUMNS, in that order.

    Parameters
    ----------
    input_folder : str
        Corpus name used in the file name (an entry of `input_list`).
    number : int
        Fold id used in the file name (an entry of `fold_list`).

    Returns
    -------
    pandas.DataFrame
        Frame with columns `DATA_COLUMNS`.
    """
    path = '../cleaned_data/df_best_{}_{}.csv'.format(input_folder, number)
    # `usecols` keeps the file's column order, so select again to pin the order.
    return pd.read_csv(path, usecols=DATA_COLUMNS)[DATA_COLUMNS]


for fold in fold_list:
    print(fold)
    # Every fold except the held-out one is used for training.
    train_fold = [number for number in fold_list if number != fold]

    # Collect the frames first and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated data on every call.
    train_frames = []
    test_frames = []
    for input_folder in input_list:
        for number in train_fold:
            train_frames.append(load_fold_csv(input_folder, number))
        test_frames.append(load_fold_csv(input_folder, fold))

    # ignore_index gives both frames a clean 0..n-1 index (the test frame used
    # to keep per-file indices, which repeat once several corpora are loaded).
    df_train = pd.concat(train_frames, ignore_index=True)
    df_test = pd.concat(test_frames, ignore_index=True)
    del train_frames, test_frames

    df_train = add_padding(df_train, N_GRAM_SIZE)
    df_test = add_padding(df_test, N_GRAM_SIZE)

    df_n_gram_train = create_n_gram_df(df_train, number = N_GRAM_SIZE)
    df_n_gram_test = create_n_gram_df(df_test, number = N_GRAM_SIZE)

    X_train = pd.get_dummies(df_n_gram_train.drop(['target'], axis=1), sparse=True)
    y_train = df_n_gram_train.target.astype(int)

    # Align the test design matrix with the training one: dummy columns seen
    # only in the test fold are dropped, columns missing from it are filled with 0.
    X_test = pd.get_dummies(df_n_gram_test.drop(['target'], axis=1))
    X_test = X_test.reindex(columns = X_train.columns, fill_value=0)
    y_test = df_n_gram_test.target.astype(int)

    sX_test = sp.sparse.csr_matrix(X_test.values)

    print('training...')

    # NOTE(review): the default n_estimators differs between scikit-learn
    # releases; pin it explicitly if the recorded scores must be reproduced.
    clf = RandomForestClassifier(class_weight="balanced", random_state=RANDOM_STATE)
    clf.fit(sp.sparse.csr_matrix(X_train), y_train)

    y_predict = clf.predict(sX_test)

    print('f1 score: ', f1_score(y_test, y_predict, average='binary'))
    print('precision score: ', precision_score(y_test, y_predict))
    print('recall score: ', recall_score(y_test, y_predict))

    # Release the large per-fold objects before the next iteration.
    del df_train,df_test,df_n_gram_train,df_n_gram_test,X_train,X_test,sX_test
    

1
training...
f1 score:  0.982952369854
precision score:  0.978399741493
recall score:  0.98754756429
2
training...
f1 score:  0.98548799493
precision score:  0.983366398392
recall score:  0.987618765879
3
training...
f1 score:  0.98578618677
precision score:  0.983271598587
recall score:  0.988313669387
4
training...
f1 score:  0.984390615164
precision score:  0.98003030861
recall score:  0.988789894472
5
training...
f1 score:  0.985496206871
precision score:  0.983514395071
recall score:  0.98748602162
6
training...
f1 score:  0.983669098744
precision score:  0.981308007024
recall score:  0.986041579752
7
training...
f1 score:  0.986571336908
precision score:  0.984826071706
recall score:  0.988322798855
8
training...
f1 score:  0.98342654912
precision score:  0.977780221334
recall score:  0.989138466672
9
training...
f1 score:  0.98602845943
precision score:  0.984795351796
recall score:  0.987264658998
10
training...
f1 score:  0.98318722828
precision score:  0.980664102765
recall 