In [1]:
import preprocessing
from ngramGenerator import *
from featureIdentifier import *

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


# Validation libraries
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, precision_recall_curve
from sklearn.model_selection import cross_val_score


#Bagging
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

#Naive bayes
from sklearn.naive_bayes import GaussianNB 

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from pandas import DataFrame


In [2]:

def _strip_labels(documents):
    """Remove label markup from every document and gather the label names.

    Each entry of ``documents`` is replaced in place by the first value
    returned from ``preprocessing.label_extraction_takeoff``; the running
    count and label set that the call also returns are fed into the next call.

    Args:
        documents: list of documents as produced by ``preprocessing.data_split``.

    Returns:
        The label set returned by the last call (an empty set when
        ``documents`` is empty).
    """
    label_count, labels = 0, set()
    for i in range(len(documents)):
        documents[i], label_count, labels = preprocessing.label_extraction_takeoff(
            paragraphs=documents[i], count=label_count, labels=labels)
    return labels


def _generate_all_ngrams(documents, n):
    """Run ``generate_ngrams`` on every document.

    Each document is indexed as ``document[0]`` (passed as ``filename``) and
    ``document[1]`` (passed as ``content``).

    Returns:
        Three parallel lists, one entry per document: the n-grams and the two
        single-gram sequences returned by ``generate_ngrams``.
    """
    ngram_sets, single_sets, single2_sets = [], [], []
    for document in documents:
        ngrams, singles, singles2 = generate_ngrams(
            filename=document[0], content=document[1], n=n)
        ngram_sets.append(ngrams)
        single_sets.append(singles)
        single2_sets.append(singles2)
    return ngram_sets, single_sets, single2_sets


def _append_features(ngram_sets, single_sets, single2_sets, lexicons):
    """Extend every n-gram tuple, in place, with the fourteen feature values.

    The features are appended in a fixed order; the trailing comments give the
    column name each one receives when the rows are later loaded into a
    DataFrame by this notebook.

    Args:
        ngram_sets: per-document lists of n-gram tuples (mutated in place).
        single_sets: per-document first single-gram sequence.
        single2_sets: per-document second single-gram sequence (used by the
            comma / full-stop look-behind features).
        lexicons: dict holding ``country_set``, ``conjunction_set``,
            ``prefix_set``, ``verb_set``, ``preposition_set``, ``organ_set``
            and ``month_set``.
    """
    for ngrams, singles, singles2 in zip(ngram_sets, single_sets, single2_sets):
        for position, ngram in enumerate(ngrams):
            ngrams[position] = ngram + (
                contains_country(ngram=ngram, country_set=lexicons['country_set']),                  # country
                contains_conjunction(ngram=ngram, conjunctions_set=lexicons['conjunction_set']),     # conjunction
                is_all_upper(ngram=ngram),                                                           # capitalised
                has_prefix_before_ngram(ngram=ngram, single_grams=singles,
                                        prefix_set=lexicons['prefix_set']),                          # prefix
                has_human_verb(ngram=ngram, single_grams=singles, verb_set=lexicons['verb_set']),    # verb
                contains_prefix(ngram=ngram, prefix_set=lexicons['prefix_set']),                     # prefix_in
                afterpreposition(ngram=ngram, single_grams=singles,
                                 preposition_set=lexicons['preposition_set']),                       # preposition
                contains_organization(ngram=ngram, organ_set=lexicons['organ_set']),                 # organ
                has_comma_before_ngram(ngram=ngram, single_grams2=singles2),                         # comma
                has_fullstop_before_ngram(ngram=ngram, single_grams2=singles2),                      # fullstop
                has_duplicate(ngram=ngram),                                                          # duplicate
                count_occurrences(ngram=ngram, single_grams=singles),                                # count
                no_more_than_one_lower(ngram=ngram),                                                 # no_more
                contains_month(ngram=ngram, month_set=lexicons['month_set']),                        # month
            )


def main(n=5):
    """Build feature-augmented n-gram rows for the train and test documents.

    Steps, in order:
      1. Load all articles and split them into train / test sets.
      2. Strip the label markup from both sets, collecting the label names.
      3. Generate n-grams for every document.
      4. Drop n-grams rejected by ``eliminate_all_lower`` (applied to BOTH the
         train and the test n-grams).
      5. Load the lexicons and append the fourteen features to every n-gram.

    Args:
        n: value forwarded to ``generate_ngrams`` as its ``n`` argument.
            Defaults to 5, the value this notebook has always used.

    Returns:
        A 4-tuple ``(train_ngram_result, test_ngram_result_without_all_lower,
        train_labels_set, test_labels_set)``. The first two are lists with one
        entry per document, each a list of feature-augmented n-gram tuples.
    """
    # --- Pre-processing: load, split, strip labels -------------------------
    articles = []
    preprocessing.read_data(articles)   # fills `articles` in place
    train_set, test_set = preprocessing.data_split(articles)

    train_labels_set = _strip_labels(train_set)
    test_labels_set = _strip_labels(test_set)

    # --- N-gram generation --------------------------------------------------
    # The single-gram sequences are kept because the context features below
    # look at the words surrounding each n-gram.
    train_ngram_result, train_single_gram, train_single_gram2 = _generate_all_ngrams(train_set, n)
    test_ngram_result, test_single_gram, test_single_gram2 = _generate_all_ngrams(test_set, n)

    # --- Filter n-grams through eliminate_all_lower -------------------------
    train_ngram_result = [eliminate_all_lower(ngrams) for ngrams in train_ngram_result]
    test_ngram_result_without_all_lower = [
        eliminate_all_lower(ngrams) for ngrams in test_ngram_result
    ]

    # --- Feature creation ---------------------------------------------------
    # Lexicons are loaded once and shared by the train and test passes.
    lexicons = {
        'country_set': load_country_file(),
        'conjunction_set': load_conjunction_file(),
        'prefix_set': load_prefix_library(),
        'verb_set': load_verb_file(),
        'preposition_set': load_preposition_file(),
        'organ_set': load_organ_library(),
        'month_set': load_month_file(),
    }

    _append_features(train_ngram_result, train_single_gram, train_single_gram2, lexicons)
    _append_features(test_ngram_result_without_all_lower, test_single_gram, test_single_gram2, lexicons)

    return train_ngram_result, test_ngram_result_without_all_lower, train_labels_set, test_labels_set

# NOTE(review): the block below is a bare string literal, not a comment. It holds
# a disabled experiment written with Python 2 `print` statements, and it refers
# to `labels_set` and `build_decision_tree`, neither of which is defined in the
# visible cells of this notebook. Because the string is the last expression of
# the cell, the notebook echoes its repr as the cell output.
'''
    new_train = [ngram[4:] for ngram in train_ngram_result[0]]
    label = [1 if ngram[0] in labels_set else 0 for ngram in train_ngram_result[0]]
    tree = build_decision_tree(new_train, label)
    print sum([1 if a == b else 0 for a, b in zip(tree.predict(new_train), label)])
    print len(label)
'''



'\n    new_train = [ngram[4:] for ngram in train_ngram_result[0]]\n    label = [1 if ngram[0] in labels_set else 0 for ngram in train_ngram_result[0]]\n    tree = build_decision_tree(new_train, label)\n    print sum([1 if a == b else 0 for a, b in zip(tree.predict(new_train), label)])\n    print len(label)\n'

In [3]:
if __name__ == "__main__":
    # main() hands back, in order: the per-document train n-gram rows, the
    # per-document filtered test n-gram rows, and the train / test label sets.
    # They are bound as notebook globals because later cells read them.
    (
        train_ngram_result,
        test_ngram_result_without_all_lower,
        train_labels_set,
        test_labels_set,
    ) = main()


In [4]:

# Column names for one feature-augmented n-gram tuple, in tuple order.
NGRAM_COLUMNS = [
    'elements', 'file', 'loc_start', 'loc_end', 'end_\'s',
    'country', 'conjunction', 'capitalised', 'prefix', 'verb',
    'prefix_in', 'preposition', 'organ', 'comma', 'fullstop',
    'duplicate', 'count', 'no_more', 'month',
]

# Start the test frame from the first document's n-gram rows.
df = pd.DataFrame(test_ngram_result_without_all_lower[0], columns=NGRAM_COLUMNS)

# 1 when the n-gram text is one of the extracted test labels, otherwise 0.
# Iterating the column directly avoids rebuilding a Python list of the whole
# column on every step, which made the original loop quadratic.
true_label = [1 if element in test_labels_set else 0 for element in df['elements']]
df['true_label'] = true_label


In [5]:
# Column names for one feature-augmented n-gram tuple, in tuple order.
NGRAM_COLUMNS = [
    'elements', 'file', 'loc_start', 'loc_end', 'end_\'s',
    'country', 'conjunction', 'capitalised', 'prefix', 'verb',
    'prefix_in', 'preposition', 'organ', 'comma', 'fullstop',
    'duplicate', 'count', 'no_more', 'month',
]

# Collect one labelled frame per remaining document and concatenate once.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the growing frame on every iteration.
frames = [df]
for doc_index in range(1, len(test_ngram_result_without_all_lower)):
    df_temp = pd.DataFrame(test_ngram_result_without_all_lower[doc_index], columns=NGRAM_COLUMNS)
    # 1 when the n-gram text is one of the extracted test labels, otherwise 0.
    true_label = [1 if element in test_labels_set else 0 for element in df_temp['elements']]
    df_temp['true_label'] = true_label
    frames.append(df_temp)

df = pd.concat(frames, ignore_index=True)

## random forest

In [6]:
# NOTE: this binds a second name to the SAME DataFrame object (no copy is made),
# so any in-place write through `test_df` is also visible through `df`.
test_df=df

In [7]:
# Inclusive span of each n-gram: loc_end - loc_start + 1.
length = [
    int(end) - int(start) + 1
    for start, end in zip(test_df['loc_start'], test_df['loc_end'])
]

# Flag every n-gram whose text contains a digit or a hyphen. A single vectorised
# match replaces the per-row loop that recompiled two regexes on each iteration
# and depended on `re` being in scope without an explicit import.
has_digit_or_hyphen = test_df['elements'].str.contains(r'[0-9]|-', regex=True, na=False)

# Mark flagged rows with the 'NULL' sentinel. Only `elements` is overwritten:
# the next cell filters on that column alone, and leaving the other columns
# untouched avoids pushing strings into numeric columns.
test_df.loc[has_digit_or_hyphen, 'elements'] = 'NULL'


In [8]:
# Attach the span lengths computed for every row (one value per row).
test_df['length'] = length

# Drop the rows marked with the 'NULL' sentinel. The mask is built from
# `test_df` itself, so the filter no longer relies on `df` and `test_df`
# happening to be the same object.
test_df = test_df.loc[test_df['elements'] != 'NULL', :]

In [9]:
# Final column layout: identity fields, then `length` and the feature columns,
# with `true_label` last.
ordered_columns = [
    'elements', 'file', 'loc_start', 'loc_end',
    'length', 'end_\'s', 'country', 'conjunction', 'capitalised',
    'prefix', 'verb', 'prefix_in', 'preposition', 'organ',
    'comma', 'fullstop', 'duplicate', 'count', 'no_more', 'month',
    'true_label',
]
test_df = test_df[ordered_columns]

In [10]:
# Preview only the first rows instead of rendering the whole frame.
test_df.head(10)

Unnamed: 0,elements,file,loc_start,loc_end,length,end_'s,country,conjunction,capitalised,prefix,...,prefix_in,preposition,organ,comma,fullstop,duplicate,count,no_more,month,true_label
0,John Locke,276,0,1,2,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,1
1,John Locke was,276,0,2,3,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,John Locke was an,276,0,3,4,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,John Locke was an English,276,0,4,5,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Locke was,276,1,2,2,0,0,0,0,0,...,0,0,0,0,0,0,2,1,0,0
5,Locke was an,276,1,3,3,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,Locke was an English,276,1,4,4,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,Locke was an English philosopher,276,1,5,5,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,was an English,276,2,4,3,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,was an English philosopher,276,2,5,4,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [11]:
# Select features and target by name rather than by position, so a change in
# column order cannot silently feed the wrong columns to the model.
FEATURE_COLUMNS = [
    'length', 'end_\'s', 'country', 'conjunction', 'capitalised',
    'prefix', 'verb', 'prefix_in', 'preposition', 'organ',
    'comma', 'fullstop', 'duplicate', 'count', 'no_more', 'month',
]
X_test = test_df[FEATURE_COLUMNS].values.astype('int')
y_test = test_df['true_label'].values.astype('int')

In [12]:
# Sanity check: expect (n_rows, 16) — X_test holds sixteen feature columns.
X_test.shape

(26693, 16)

In [13]:
# Sanity check: expect (n_rows,) — one label per row of X_test.
y_test.shape

(26693,)

In [14]:
# Persist the prepared test set; index=False keeps the row index out of the file.
# (DataFrame is already imported in the first cell, so no re-import is needed.)
test_df.to_csv("test.csv", index=False)