In [1]:
import nltk
from nltk.corpus import stopwords
from stemming.porter2 import stem
import re

def splitText(text):
    """Tokenise ``text`` and mask every token fragment that contains a digit.

    Tokens are maximal runs of word characters and apostrophes, so whitespace
    and every other symbol acts as a separator. Inside each token, any run of
    word characters holding at least one digit is replaced by the placeholder
    ``'Numxyzabcd'``.

    Returns the processed tokens as a list, in their original order.
    """
    masked_tokens = []
    for token_match in re.finditer(r"[\w']+", text):
        masked_tokens.append(re.sub(r"\w*[\d]+\w*", 'Numxyzabcd', token_match.group()))
    return masked_tokens

_STOP_WORD_SETS = {}  # language name -> frozenset of stop words, filled lazily


def removeStopWords(word_list, language="english"):
    """Drop stop words from a list of tokens.

    The stop-word list is fetched via ``stopwords.words`` only the first time
    a given language is requested and is then reused, instead of being
    rebuilt into a set on every call.

    Args:
        word_list: iterable of tokens; a token is dropped only when it is
            exactly equal to an entry of the stop-word list.
        language: name handed to ``stopwords.words`` (default ``"english"``).

    Returns:
        A new list with the tokens of ``word_list`` that are not stop words,
        in their original order.
    """
    stop_words = _STOP_WORD_SETS.get(language)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(language))
        _STOP_WORD_SETS[language] = stop_words
    return [w for w in word_list if w not in stop_words]

def stemWords(word_list):
    """Run ``stem`` over every token and return the stemmed tokens as a list."""
    return list(map(stem, word_list))

def preProcessData(abstract):
    """Normalise one text into a single space-separated string of stems.

    The text is tokenised with ``splitText``, filtered with
    ``removeStopWords``, stemmed with ``stemWords`` and the surviving tokens
    are joined back together with single spaces.
    """
    stems = stemWords(removeStopWords(splitText(abstract)))
    return ' '.join(stems)


In [2]:
import pickle
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import ShuffleSplit
from sklearn import svm
from sklearn.feature_extraction.text import TfidfTransformer

# Load the pickled training features and labels. The files are opened with
# context managers so the handles are closed as soon as loading finishes.
# NOTE(review): pickle can execute arbitrary code while loading; only point
# these paths at files from a trusted source.
with open('../../data/x_train_freq.pkl', "rb") as x_train_file:
    x_train = pickle.load(x_train_file)
with open('../../data/y_train.pkl', "rb") as y_train_file:
    y_train = pickle.load(y_train_file)


# Model: TF-IDF re-weighting -> chi-squared selection of the 30000
# best-scoring features -> RBF-kernel SVM with C fixed at 5.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfTransformer(smooth_idf=True)),
    ('selectk', SelectKBest(score_func=chi2, k=30000)),
    ('svc', svm.SVC(kernel='rbf', C=5)),
])

# Search grid: only the SVC's gamma is tuned, over 4 log-spaced values
# between 1e-5 and 1e3. Earlier grids over 'pca__n_components' and 'svc__C'
# are disabled (the pipeline above has no 'pca' step).
parameters = dict(svc__gamma=np.logspace(-5, 3, 4))


if __name__ == "__main__":  # important for parallel processing with n_jobs
    # Alternative single-split CV that was tried and left disabled:
    # cv_split = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    gridSearchEst = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=64,
        verbose=3,
        error_score=0,  # a candidate whose fit raises is scored 0 instead of aborting the search
        cv=2,
        # Requested explicitly: 'mean_train_score' (printed below) only exists
        # in cv_results_ when train scores are computed, and newer
        # scikit-learn releases no longer compute them by default.
        return_train_score=True,
    )
    gridSearchEst.fit(x_train, y_train)

    print("Best score: %0.3f" % gridSearchEst.best_score_)
    print("Best parameters set:")
    best_parameters = gridSearchEst.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    print(gridSearchEst.cv_results_['mean_test_score'])
    print(gridSearchEst.cv_results_['mean_train_score'])

    # Persist the results inside the guard: gridSearchEst only exists when the
    # guard is true, so dumping outside it would raise NameError otherwise.
    # Context managers make sure the pickles are flushed and closed.
    with open('data/fittedGridSearchEstimator.pkl', "wb") as estimator_file:
        pickle.dump(gridSearchEst.best_estimator_, estimator_file)
    with open('data/cvResults.pkl', "wb") as results_file:
        pickle.dump(gridSearchEst.cv_results_, results_file)

Fitting 2 folds for each of 4 candidates, totalling 8 fits




[CV] svc__gamma=1e-05 ................................................
[CV] svc__gamma=1e-05 ................................................
[CV] svc__gamma=0.00464158883361 .....................................
[CV] svc__gamma=0.00464158883361 .....................................
[CV] svc__gamma=2.15443469003 ........................................
[CV] svc__gamma=2.15443469003 ........................................
[CV] svc__gamma=1000.0 ...............................................
[CV] svc__gamma=1000.0 ...............................................
[CV] ............ svc__gamma=0.00464158883361, score=0.884790 -38.9min
[CV] ............ svc__gamma=0.00464158883361, score=0.883539 -39.6min
[CV] ....................... svc__gamma=1e-05, score=0.318019 -55.9min
[CV] ....................... svc__gamma=1e-05, score=0.318018 -55.8min


[Parallel(n_jobs=64)]: Done   4 out of   8 | elapsed: 210.3min remaining: 210.3min


[CV] ............... svc__gamma=2.15443469003, score=0.906309 -24.8min
[CV] ............... svc__gamma=2.15443469003, score=0.905399 -25.3min
[CV] ...................... svc__gamma=1000.0, score=0.318950 -15.9min
[CV] ...................... svc__gamma=1000.0, score=0.319231 -16.1min


[Parallel(n_jobs=64)]: Done   8 out of   8 | elapsed: 396.8min finished


Best score: 0.906
Best parameters set:
	svc__gamma: 2.154434690031882
[ 0.31801836  0.88416466  0.90585382  0.31909013]
[ 0.31801836  0.89295033  0.99919618  0.99919618]


## Step 4: Run on test set

In [12]:
import pandas as pd
import pickle
import numpy as np

# Load the fitted estimator saved by the grid-search step. Only this one
# pickle is read here; no separate vectoriser or label encoder is loaded.
# NOTE(review): pickle can execute arbitrary code while loading; only use
# files from a trusted source.
with open('data/fittedGridSearchEstimator.pkl', "rb") as estimator_file:
    bestEstimator = pickle.load(estimator_file)

# Load the held-out test data; context managers close the handles promptly.
with open('../../data/x_test_freq.pkl', "rb") as x_test_file:
    x_test = pickle.load(x_test_file)
with open('../../data/y_test.pkl', "rb") as y_test_file:
    y_test = pickle.load(y_test_file)

# Predict labels for the test set with the fitted estimator.
y_test_pred = bestEstimator.predict(x_test)

# Accuracy = fraction of predictions equal to the true label.
accuracy = sum(y_test_pred == y_test) / float(len(y_test))
accuracy

In [3]:
1

1

17.75

16