In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
import string
import re
import pandas as pd
import time
import os
import numpy as np
import pickle



In [3]:
# DEFINE DIRECTORY PATH
path_to_json = 'TrainingData/'

# CREATE LIST OF FILES FROM THE DIRECTORY
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
if not json_files:
    # Fail early with a readable message instead of an obscure error further down.
    raise ValueError('No .json files found in %r' % path_to_json)

# READ EVERY JSON FILE, THEN BUILD THE DATAFRAME WITH A SINGLE CONCAT
# Appending to a DataFrame inside the loop re-copies all accumulated rows on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
# NOTE(review): the original call passed 'r' as the second positional argument of
# pd.read_json. That slot is `orient`, not a file mode, so it is dropped here and
# the default orient is used - confirm the loaded frame is unchanged.
df = pd.concat([pd.read_json(os.path.join(path_to_json, js)) for js in json_files])

# READ THE TOPIC DICTIONARY: ONE TOPIC NAME PER LINE
# splitlines() handles both '\r\n' and '\n' endings (split('\r\n') yields a single
# element once newlines have been translated to '\n'); blank lines are skipped so
# no empty-named topic is created. The `with` block closes the file.
with open('./topicDictionary.txt', 'r') as topic_file:
    topics = [line for line in topic_file.read().splitlines() if line.strip()]

# SPLITTING THE ELEMENTS OF THE JSON INTO TEXT, PUBLICATION DATE AND TOPICS
df['text'] = df.TrainingData.apply(lambda x: x['bodyText'])
df['pubdate'] = df.TrainingData.apply(lambda x: x['webPublicationDate'])
df['topics'] = df.TrainingData.apply(lambda x: x['topics'])

# DISCARD THE ORIGINAL INDEX (REPLACED BY 0..N-1) AND THE RAW JSON COLUMN
df = df.reset_index(drop=True).drop('TrainingData', axis=1)

# DEFINE FUNCTION TO CREATE OUR DATAFRAME
def topic_col(x):
    """Return 1 if any element of ``x`` equals the current ``topic``, else 0.

    ``topic`` is not a parameter: it is looked up in module scope at call time,
    so the caller must bind a global named ``topic`` before calling.
    """
    return int(any(item == topic for item in x))

# RUN TOPIC COL FUNCTION ON ALL DATA
# Adds one 0/1 indicator column per topic. The loop variable must stay named
# `topic`, because topic_col reads that name from module scope.
for topic in topics:
    time1 = time.time()
    df[topic] = df['topics'].map(topic_col)
    # Parenthesised single-argument print produces the same output on Python 2 and 3.
    print(topic)
    time2 = time.time()
    time_in_s = (time2-time1)
    print('Function takes around %0.3f seconds to run' % (time_in_s))

activism
Function takes around 2.782 seconds to run
afghanistan
Function takes around 1.763 seconds to run
aid
Function takes around 1.217 seconds to run
algerianhostagecrisis
Function takes around 1.367 seconds to run
alqaida
Function takes around 1.213 seconds to run
alshabaab
Function takes around 1.244 seconds to run
antiwar
Function takes around 1.337 seconds to run
arabandmiddleeastprotests
Function takes around 1.425 seconds to run
armstrade
Function takes around 1.309 seconds to run
australianguncontrol
Function takes around 1.339 seconds to run
australiansecurityandcounterterrorism
Function takes around 1.518 seconds to run
bastilledaytruckattack
Function takes around 1.325 seconds to run
belgium
Function takes around 1.117 seconds to run
berlinchristmasmarketattack
Function takes around 1.308 seconds to run
bigdata
Function takes around 1.073 seconds to run
biometrics
Function takes around 1.141 seconds to run
bokoharam
Function takes around 1.158 seconds to run
bostonmaratho

undercoverpoliceandpolicing
Function takes around 1.336 seconds to run
unitednations
Function takes around 1.180 seconds to run
usguncontrol
Function takes around 1.149 seconds to run
values
Function takes around 1.072 seconds to run
warcrimes
Function takes around 1.163 seconds to run
warreporting
Function takes around 1.145 seconds to run
weaponstechnology
Function takes around 1.199 seconds to run
womeninbusiness
Function takes around 1.181 seconds to run
woolwichattack
Function takes around 1.173 seconds to run
worldmigration
Function takes around 1.169 seconds to run
zikavirus
Function takes around 1.141 seconds to run


In [4]:
# CREATE FINAL TEST DF
# The file location can be overridden with the TEST_DATA_PATH environment variable.
# The fallback is the original hard-coded absolute path, so existing runs keep working.
test_data_path = os.environ.get('TEST_DATA_PATH', '/Users/sudheerpamula/Downloads/TestData.json')

# NOTE(review): the original call passed 'r' as the second positional argument of
# pd.read_json. That slot is `orient`, not a file mode, so it is dropped here and
# the default orient is used - confirm the loaded frame is unchanged.
testdf = pd.read_json(test_data_path)

# SPLIT THE JSON RECORDS INTO TEXT, PUBLICATION DATE AND TOPICS
testdf['text'] = testdf.TestData.apply(lambda x: x['bodyText'])
testdf['pubdate'] = testdf.TestData.apply(lambda x: x['webPublicationDate'])
testdf['topics'] = testdf.TestData.apply(lambda x: x['topics'])

# DISCARD THE RAW JSON COLUMN
testdf = testdf.drop('TestData', axis=1)

In [5]:
def sampler(topic, df, random_state=None, max_positives=200, negative_ratio=9):
    """Draw a training sample for one topic with a fixed positive:negative ratio.

    Parameters
    ----------
    topic : str
        Name of the 0/1 indicator column in ``df`` to sample on.
    df : pandas.DataFrame
        Must contain a 'text' column and the ``topic`` column. Any index is
        accepted; rows are addressed by position internally.
    random_state : int or None, optional
        Forwarded to ``DataFrame.sample``. ``None`` (the default) keeps the
        original unseeded behaviour.
    max_positives : int, optional
        Upper bound on the number of distinct topic articles used (default 200).
    negative_ratio : int, optional
        Non-topic articles drawn per topic article (default 9).

    Returns
    -------
    (X, y) : tuple of pandas.Series
        Article texts and the matching 0/1 labels, both indexed 0..n-1.
        Row order is: topic articles, non-topic articles, then (only when the
        topic has fewer than ``max_positives`` articles) the topic articles again.

    Notes
    -----
    When a topic has fewer than ``max_positives`` articles, every topic article
    appears twice in the result. A caller that splits the result into train and
    test sets can therefore get copies of one article on both sides of the split.
    If fewer non-topic articles exist than ``negative_ratio`` requires, all of
    them are used instead of raising.
    """
    # Work on a positionally indexed two-column copy. The original code fed index
    # labels to .iloc, which is only correct when df carries a default 0..n-1 index.
    frame = df[['text', topic]].reset_index(drop=True)
    topic_rows = frame[frame[topic] == 1]
    other_rows = frame[frame[topic] == 0]

    # FIND COUNT OF TOPIC ARTICLES
    articlecount = len(topic_rows)

    if articlecount < max_positives:
        # Rare topic: keep every topic article and repeat them once more below.
        print(articlecount)
        topicindexes = topic_rows.index.tolist()
        repeat_positives = True
    else:
        # Frequent topic: down-sample the topic articles to the cap.
        topicindexes = topic_rows.sample(max_positives, random_state=random_state).index.tolist()
        repeat_positives = False

    # SELECT NON TOPIC INDICES (capped at however many non-topic articles exist)
    n_negatives = min(len(topicindexes) * negative_ratio, len(other_rows))
    nontopicarticlesindexes = other_rows.sample(n_negatives, random_state=random_state).index.tolist()

    # CREATE LIST OF COMBINED INDICES
    sampleindex = topicindexes + nontopicarticlesindexes
    if repeat_positives:
        sampleindex = sampleindex + topicindexes

    # CREATE NEW SERIES WITH A FRESH INDEX
    sampled = frame.iloc[sampleindex]
    X = sampled['text'].reset_index(drop=True)
    y = sampled[topic].reset_index(drop=True)

    return X, y

In [6]:
# EMPTY SUBMISSION FRAME: ONE ROW PER TEST ARTICLE, ONE COLUMN PER TOPIC
# No data is supplied here; the columns are assigned topic by topic later on.
submission = pd.DataFrame(columns=topics, index=testdf.index)

In [6]:
# LOAD THE PICKLED VOCABULARY DICTIONARY
# NOTE(review): pickle.load can execute arbitrary code while loading, so
# vocab.pickle should only ever come from a trusted source.
with open('vocab.pickle', 'rb') as vocab_file:
    modelvocab = pickle.load(vocab_file)

In [7]:
# HYPERPARAMETER GRID FOR THE PIPELINE (2 x 2 x 2 = 8 CANDIDATES)
# Keys use the '<step name>__<parameter>' form, so they must match the step names
# given to the Pipeline ('vectorizer', 'tfidf', 'clf').
parameters = {
    'vectorizer__max_df': (0.15, 0.25),
    'tfidf__norm': ('l2', 'l1'),
    'clf__penalty': ('l1', 'l2'),
}

# The grid searches both the 'l1' and 'l2' penalties, so the solver is pinned to
# 'liblinear', which supports both. Relying on the library default breaks on
# scikit-learn versions whose default solver rejects 'l1'.
# warm_start and n_jobs were dropped: liblinear ignores both.
clf = LogisticRegression(solver='liblinear')

In [8]:
# TOPICS THAT BYPASS MODEL FITTING
# The training loop checks membership in this list and, for these topics, assigns
# a constant prediction to every test article instead of fitting a classifier.
topics_0 = [
    'activism',
    'turkeycoupattempt',
    'tunisiaattack2015',
    'sanbernardinoshooting',
    'peaceandreconciliation',
    'parisattacks',
    'orlandoterrorattack',
    'munichshooting',
    'francetrainattack',
    'charliehebdoattack',
    'brusselsattacks',
    'berlinchristmasmarketattack',
    'bastilledaytruckattack',
    'zikavirus',
]

In [None]:
# PER-TOPIC RESULTS, IN THE SAME ORDER AS `topics`
f1_scores = []
grid_search_params = []

time1 = time.time()

for topic in topics:
    if topic in topics_0:
        # No model is fitted for these topics: every test article is labelled 1
        # and placeholder values keep the result lists aligned with `topics`.
        submission[topic] = 1
        f1_scores.append(0)
        grid_search_params.append("none")
    else:
        X, y = sampler(topic, df)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

        # N-GRAM RANGE OF (1,4): UNIGRAMS UP TO 4-GRAMS
        vectorizer = CountVectorizer(ngram_range=(1,4), stop_words='english')

        # PIPELINE OF VECTORIZER, TF-IDF TRANSFORMER, STANDARD SCALER, MODEL
        # with_mean=False because the scaler is applied to the vectorizer's sparse output.
        pipe = Pipeline([('vectorizer', vectorizer),
                         ('tfidf', TfidfTransformer()),
                         ('ss', StandardScaler(with_mean=False)),
                         ('clf', clf)])

        # GRIDSEARCH PIPELINE FIT
        grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)

        # PREDICT ON THE HELD-OUT SPLIT
        # NOTE(review): for topics with fewer than 200 articles sampler() returns
        # every topic article twice, so copies of one article can sit in both the
        # train and test split and inflate this score.
        preds = grid_search.best_estimator_.predict(X_test)
        topicf1score = f1_score(y_test, preds)
        # %-formatting keeps the printed line identical on Python 2 and 3.
        print("F1 score: %s" % topicf1score)
        f1_scores.append(topicf1score)
        grid_search_params.append(grid_search.best_params_)

        # FINAL PREDICTIONS
        preds_final = grid_search.best_estimator_.predict(testdf['text'])

        # STORE THIS TOPIC'S PREDICTIONS IN THE SUBMISSION FRAME
        submission[topic] = preds_final

    # CHECKPOINT: the CSV is rewritten after every topic, so an interrupted run
    # keeps the columns finished so far.
    submission.to_csv('submission_logregngram4.csv')
    print(topic)

time2 = time.time()
time_in_s = (time2-time1)
print('Function takes around %0.3f seconds to run' % (time_in_s))

scoredf = pd.DataFrame({'f1': f1_scores, 'params': grid_search_params}, index=topics)

activism
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.8min finished


F1 score: 0.891089108911
afghanistan
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.5min finished


F1 score: 0.854368932039
aid
58
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.1min finished


F1 score: 1.0
algerianhostagecrisis
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.3min finished


F1 score: 0.930693069307
alqaida
149
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.1min finished


F1 score: 0.986486486486
alshabaab
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.4min finished


F1 score: 0.862745098039
antiwar
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.2min finished


F1 score: 0.865384615385
arabandmiddleeastprotests
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.8min finished


F1 score: 0.882352941176
armstrade
3
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    3.0s finished


F1 score: 1.0
australianguncontrol
92
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.6min finished


F1 score: 1.0
australiansecurityandcounterterrorism
bastilledaytruckattack
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.9min finished


F1 score: 0.857142857143
belgium
berlinchristmasmarketattack
141
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.4min finished


F1 score: 0.965034965035
bigdata
170
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.1min finished


F1 score: 0.976470588235
biometrics
187
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.3min finished


F1 score: 1.0
bokoharam
178
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.2min finished


F1 score: 0.977528089888
bostonmarathonbombing
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.8min finished


F1 score: 0.810810810811
britisharmy
brusselsattacks
102
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.2min finished


F1 score: 0.94
cameroon
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.6min finished


F1 score: 0.873786407767
carers
charliehebdoattack
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.9min finished


F1 score: 0.96
chemicalweapons
109
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 1.0
clusterbombs
35
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   37.3s finished


F1 score: 1.0
cobra
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.7min finished


F1 score: 0.933333333333
conflictanddevelopment
173
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.1min finished


F1 score: 0.972067039106
controversy
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.6min finished


F1 score: 0.742857142857
criminaljustice
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.6min finished


F1 score: 0.831683168317
cybercrime
101
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.0min finished


F1 score: 0.990291262136
cyberwar
24
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   23.2s finished


F1 score: 1.0
darknet
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.6min finished


F1 score: 0.924528301887
dataprotection
18
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   22.3s finished


F1 score: 0.947368421053
debate
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.9min finished


F1 score: 0.833333333333
defence
86
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.0min finished


F1 score: 0.977272727273
deflation
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.938775510204
drones
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.9min finished


F1 score: 0.87619047619
drugs
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.924528301887
drugspolicy
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.941176470588
drugstrade
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.0min finished


F1 score: 0.929292929293
earthquakes
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.1min finished


F1 score: 0.990099009901
ebola
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.9min finished


F1 score: 0.704761904762
economy
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.930693069307
egypt
34
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   24.2s finished


F1 score: 0.838709677419
encryption
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.844036697248
energy
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.907216494845
espionage
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.711111111111
ethics
87
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   54.4s finished


F1 score: 1.0
europeanarrestwarrant
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.3min finished


F1 score: 0.94
europeancourtofhumanrights
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 150.4min finished


F1 score: 0.88679245283
events
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.970297029703
extradition
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.92
famine
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.9min finished


F1 score: 0.877551020408
farright
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.969696969697
firefighters
178
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.988636363636
forensicscience
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.814814814815
france
francetrainattack
Fitting 3 folds for each of 8 candidates, totalling 24 fits


# Resuming after an interrupted run

The script had to be interrupted, so the topic list is edited below to include only the topics that have not been run yet.

In [11]:
# TOPICS STILL TO BE PROCESSED AFTER THE INTERRUPTED RUN
# This rebinds `topics`, replacing the list read from topicDictionary.txt earlier
# in the notebook; every later cell that uses `topics` sees only these entries.
topics = ['freedomofspeech',
 'genevaconventions',
 'germany',
 'guncrime',
 'hacking',
 'hashtags',
 'helicoptercrashes',
 'humanitarianresponse',
 'humanrights',
 'humanrightsact',
 'humantrafficking',
 'immigration',
 'india',
 'indonesia',
 'internallydisplacedpeople',
 'internationalcourtofjustice',
 'internationalcriminaljustice',
 'internetsafety',
 'iraq',
 'isis',
 'israel',
 'jordan',
 'jubilee',
 'judiciary',
 'july7',
 'justiceandsecurity',
 'kenya',
 'knifecrime',
 'lebanon',
 'libya',
 'localgovernment',
 'logistics',
 'london',
 'londonriots',
 'malaysia',
 'mali',
 'malware',
 'metropolitanpolice',
 'middleeastpeacetalks',
 'migration',
 'military',
 'ministryofdefence',
 'morocco',
 'mrsa',
 'mumbaiterrorattacks',
 'munichshooting',
 'naturaldisasters',
 'nigeria',
 'nuclearweapons',
 'occupy',
 'organisedcrime',
 'orlandoterrorattack',
 'osamabinladen',
 'paris',
 'parisattacks',
 'peaceandreconciliation',
 'philippines',
 'piracy',
 'planecrashes',
 'police',
 'protest',
 'refugees',
 'religion',
 'retirementage',
 'rio20earthsummit',
 'royalairforce',
 'royalnavy',
 'russia',
 'sanbernardinoshooting',
 'saudiarabia',
 'september11',
 'slavery',
 'somalia',
 'southafrica',
 'southchinasea',
 'stopandsearch',
 'surveillance',
 'sydneysiege',
 'syria',
 'taliban',
 'terrorism',
 'thailand',
 'torture',
 'traincrashes',
 'transport',
 'tunisiaattack2015',
 'turkey',
 'turkeycoupattempt',
 'ukcrime',
 'uksecurity',
 'uksupremecourt',
 'undercoverpoliceandpolicing',
 'unitednations',
 'usguncontrol',
 'values',
 'warcrimes',
 'warreporting',
 'weaponstechnology',
 'womeninbusiness',
 'woolwichattack',
 'worldmigration',
 'zikavirus']

In [13]:
# RESUMED RUN OVER THE REMAINING TOPICS
# Restart the timer here. Without this, `time1` still holds the start time of the
# interrupted cell and the elapsed time printed at the end is wrong.
time1 = time.time()

# Per-topic scores and best parameters are deliberately not recorded in this
# resumed run (the original appends to f1_scores / grid_search_params and the
# scoredf construction were commented out).
for topic in topics:
    if topic in topics_0:
        # No model is fitted for these topics: every test article is labelled 1.
        submission[topic] = 1
    else:
        X, y = sampler(topic, df)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

        # N-GRAM RANGE OF (1,4): UNIGRAMS UP TO 4-GRAMS
        vectorizer = CountVectorizer(ngram_range=(1,4), stop_words='english')

        # PIPELINE OF VECTORIZER, TF-IDF TRANSFORMER, STANDARD SCALER, MODEL
        # with_mean=False because the scaler is applied to the vectorizer's sparse output.
        pipe = Pipeline([('vectorizer', vectorizer),
                         ('tfidf', TfidfTransformer()),
                         ('ss', StandardScaler(with_mean=False)),
                         ('clf', clf)])

        # GRIDSEARCH PIPELINE FIT
        grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)

        # PREDICT ON THE HELD-OUT SPLIT
        preds = grid_search.best_estimator_.predict(X_test)
        topicf1score = f1_score(y_test, preds)
        # %-formatting keeps the printed line identical on Python 2 and 3.
        print("F1 score: %s" % topicf1score)

        # FINAL PREDICTIONS
        preds_final = grid_search.best_estimator_.predict(testdf['text'])

        # STORE THIS TOPIC'S PREDICTIONS IN THE SUBMISSION FRAME
        submission[topic] = preds_final

    # CHECKPOINT: the CSV is rewritten after every topic, so an interrupted run
    # keeps the columns finished so far.
    submission.to_csv('submission_logregngram4.csv')
    print(topic)

time2 = time.time()
time_in_s = (time2-time1)
print('Function takes around %0.3f seconds to run' % (time_in_s))

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.725490196078
freedomofspeech
84
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.1min finished


F1 score: 0.988235294118
genevaconventions
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.903846153846
germany
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.848484848485
guncrime
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.94
hacking
20
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   10.1s finished


F1 score: 1.0
hashtags
188
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.979166666667
helicoptercrashes
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.0min finished


F1 score: 0.941176470588
humanitarianresponse
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.0min finished


F1 score: 0.694736842105
humanrights
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.92
humanrightsact
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.1min finished


F1 score: 0.9375
humantrafficking
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.1min finished


F1 score: 0.636363636364
immigration
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.0min finished


F1 score: 0.884615384615
india
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.2min finished


F1 score: 0.9375
indonesia
44
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   28.4s finished


F1 score: 0.916666666667
internallydisplacedpeople
70
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   49.7s finished


F1 score: 0.945945945946
internationalcourtofjustice
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.929292929293
internationalcriminaljustice
120
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.6min finished


F1 score: 0.896
internetsafety
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.9min finished


F1 score: 0.88
iraq
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.95145631068
isis
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.930693069307
israel
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.94
jordan
175
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.983240223464
jubilee
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.884615384615
judiciary
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.87619047619
july7
152
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.9min finished


F1 score: 0.987012987013
justiceandsecurity
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.948453608247
kenya
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.929292929293
knifecrime
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.949494949495
lebanon
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.897196261682
libya
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.9min finished


F1 score: 0.845360824742
localgovernment
151
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.0min finished


F1 score: 1.0
logistics
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.565217391304
london
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.901098901099
londonriots
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.959183673469
malaysia
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.2min finished


F1 score: 0.941176470588
mali
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.905660377358
malware
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.9min finished


F1 score: 0.914285714286
metropolitanpolice
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.900900900901
middleeastpeacetalks
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.905263157895
migration
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.857142857143
military
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.923076923077
ministryofdefence
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.952380952381
morocco
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.959183673469
mrsa
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.3min finished


F1 score: 0.95145631068
mumbaiterrorattacks
munichshooting
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.891089108911
naturaldisasters
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.901960784314
nigeria
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.92
nuclearweapons
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.938775510204
occupy
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.838709677419
organisedcrime
orlandoterrorattack
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.95145631068
osamabinladen
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.933333333333
paris
parisattacks
peaceandreconciliation
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.2min finished


F1 score: 0.969072164948
philippines
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 38.6min finished


F1 score: 0.891089108911
piracy
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.950495049505
planecrashes
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.824742268041
police
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.9min finished


F1 score: 0.651685393258
protest
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.857142857143
refugees
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.833333333333
religion
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.3min finished


F1 score: 0.970297029703
retirementage
199
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.9min finished


F1 score: 0.984924623116
rio20earthsummit
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.921568627451
royalairforce
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.930693069307
royalnavy
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.901960784314
russia
sanbernardinoshooting
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.9min finished


F1 score: 0.942307692308
saudiarabia
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.826923076923
september11
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.851485148515
slavery
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.0min finished


F1 score: 0.949494949495
somalia
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.941176470588
southafrica
34
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   21.8s finished


F1 score: 1.0
southchinasea
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.960784313725
stopandsearch
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.869565217391
surveillance
42
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   25.8s finished


F1 score: 0.976744186047
sydneysiege
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.938775510204
syria
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.92
taliban
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.792452830189
terrorism
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.980392156863
thailand
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.893203883495
torture
50
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   33.6s finished


F1 score: 0.980392156863
traincrashes
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.825688073394
transport
tunisiaattack2015
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.7min finished


F1 score: 0.948453608247
turkey
turkeycoupattempt
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


F1 score: 0.698113207547
ukcrime
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.814814814815
uksecurity
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.979591836735
uksupremecourt
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.950495049505
undercoverpoliceandpolicing
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.8min finished


F1 score: 0.778947368421
unitednations
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.927835051546
usguncontrol
18
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    9.5s finished


F1 score: 1.0
values
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.916666666667
warcrimes
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.811320754717
warreporting
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.834951456311
weaponstechnology
182
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.4min finished


F1 score: 0.977528089888
womeninbusiness
141
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.9min finished


F1 score: 1.0
woolwichattack
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


F1 score: 0.909090909091
worldmigration
zikavirus
Function takes around 21592.568 seconds to run


Scored: 0.13416
Our score went down after adding n-grams with range (1, 4).