<h1 align='center'>AutoSynthesis study group</h1>
<h2 align='center'> Session 6 - Putting it altogether </h2>
<h3 align='right'> 22nd may 2019 </h3>
<h3 align='right'> Kazeem </h3>

In [26]:
from __future__ import print_function

import logging
print(__doc__)
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import BernoulliNB
from nltk.corpus import stopwords


print ('Packages import successful')

Automatically created module for IPython interactive environment
Packages import successful


#### Load dataset

In [27]:
data = pd.read_csv('autosynthesis_session3.csv') #set the data path relative to your system and file location
print ('Dataset loaded successfully')
data.head(5) #view some samples

le = LabelEncoder()
data['labels'] = le.fit_transform(data['label'])
X = data[['Title', 'Abstract', 'Keywords']].apply(lambda x: '{} {} {}'.format(x[0], x[1], x[2]), axis=1)
y = data['labels']

Dataset loaded successfully


In [28]:
#split the data to keep part exclusively for testing/validation
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=19)

In [29]:
#optionally write custom preprocessing method.....WHY?
def preprocessor(text):
    #text = text.apply(lambda x: ' '.join(x.lower().replace('[^\w\s]','') for x in str(x).split() if not x in set(stopwords.words('english')) and not x.isdigit()))
    
    # split into words
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words and len(w) > 3]
    
    return ' '.join(words) #return the cleaned text string separated by spaces

In [30]:
X_train = X_train.apply(lambda x: preprocessor(x))
X_test = X_test.apply(lambda x: preprocessor(x))

#from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_encoder = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=3, ngram_range=(1, 1))
tfidf_train_data = tfidf_encoder.fit_transform(X_train)
tfidf_test_data = tfidf_encoder.transform(X_test)

#reduce data dimension
bestfeatures = SelectKBest(score_func=chi2, k=200)
tf_train = bestfeatures.fit_transform(tfidf_train_data,y_train)
tf_test = bestfeatures.fit_transform(tfidf_test_data,y_test)

### Select best parameters with grid search

In [31]:
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4, 'scale'],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score, iid=False)
    clf.fit(tf_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}

Grid scores on development set:

0.426 (+/-0.003) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.426 (+/-0.003) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.426 (+/-0.003) for {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
0.426 (+/-0.003) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.426 (+/-0.003) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.890 (+/-0.065) for {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
0.426 (+/-0.003) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.426 (+/-0.003) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.891 (+/-0.055) for {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
0.912 (+/-0.109) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.426 (+/-0.003) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.886 (+/-0.066) for {'C': 1000, 'gamma': 'scale', 'kernel': 'rbf'}
0.942 (+/-0.018) for {'C': 1, 'kernel': 'linear'}
0.856 (+/-0.084) for {'C': 10, 'kernel': 'linear'}
0.8

In [32]:
clf.best_estimator_

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### Model and Feature selection together

In [33]:
#combine preprocessing, dimensionality reduction with classification

pipe = Pipeline([
    ('vect', TfidfVectorizer()),
    ('feature_selection', SelectKBest(chi2)),
    ('clf', SVC(gamma='scale'))
])

#set different options for parameters
param_grid = {
    'vect__max_df': (0.6,0.7,0.8), 
    'vect__min_df': (2, 3, 4),
    'feature_selection__k': (20, 50, 100, 200, 250),
    'clf__kernel': ('rbf', 'linear'),
    'clf__C': (1, 10, 100, 1000),
}

In [34]:
#instanciate the grid search
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, iid=False)

#fit the search
grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'vect__max_df': (0.6, 0.7, 0.8), 'vect__min_df': (2, 3, 4), 'feature_selection__k': (20, 50, 100, 200, 250), 'clf__kernel': ('rbf', 'linear'), 'clf__C': (1, 10, 100, 1000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [36]:
grid.best_params_

{'clf__C': 1000,
 'clf__kernel': 'rbf',
 'feature_selection__k': 250,
 'vect__max_df': 0.6,
 'vect__min_df': 3}

### Multiple feature selection and classification methods

In [37]:
#Step 1: create a pipeline object combining all the steps in order
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('reduce_dim', None),
    ('clf', SVC(gamma='scale'))
])

#Step 2: set optional parameters for the different objects in the pipeline
FEATURE_SIZE = [50, 100, 250, 350, 500]
param_grid = [{
    'vect__max_df': (0.5, 0.7, 0.8),
    'vect__min_df': (2,3,5),
    'vect__binary': (True, False),
    'vect__ngram_range': [(1, 1), (1,2)],
    'reduce_dim': [TruncatedSVD(n_iter=5)],
    'reduce_dim__n_components': FEATURE_SIZE,
    'clf__C': [1, 10, 100] 
},
{
    'vect__max_df': (0.5, 0.7, 0.8),
    'vect__min_df': (2,3,5),
    'vect__binary': (True, False),
    'vect__ngram_range': [(1, 1), (1,2)],
    'reduce_dim': [SelectKBest(chi2)],
    'reduce_dim__k': FEATURE_SIZE,
    'clf__C': [1, 10, 100] 
}]

#### Run the pipeline to select best model 

In [38]:
#pass the pipeline to an instance of the GridSearch CV
grid = GridSearchCV(pipeline, cv=5, n_jobs=1, param_grid=param_grid, iid=False)

#fit the GridSearchCV instance
grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=False, n_jobs=1,
       param_grid=[{'vect__max_df': (0.5, 0.7, 0.8), 'vect__min_df': (2, 3, 5), 'vect__binary': (True, False), 'vect__ngram_range': [(1, 1), (1, 2)], 'reduce_dim': [TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)], 'reduce_dim__n_components': [50, 100, 250...ction chi2 at 0x7fd0678b7e18>)], 'reduce_dim__k': [50, 100, 250, 350, 500], 'clf__C': [1, 10, 100]}],
       pre_dispatch='2*n_jobs', refit=Tru

In [39]:
mean_scores = np.array(grid.cv_results_['mean_test_score'])
# scores are in the order of param_grid iteration, which is alphabetical
mean_scores = mean_scores.reshape(3, -1, len(FEATURE_SIZE))
# select score for best C
mean_scores = mean_scores.max(axis=0)

In [40]:
mean_scores

array([[0.88779762, 0.89217687, 0.89213435, 0.88796769, 0.89421769],
       [0.89613095, 0.89417517, 0.89009354, 0.88796769, 0.89213435],
       [0.89196429, 0.89613095, 0.88988095, 0.89005102, 0.89009354],
       [0.89005102, 0.89829932, 0.89613095, 0.90446429, 0.91488095],
       [0.91071429, 0.91492347, 0.90659014, 0.90659014, 0.90858844],
       [0.90654762, 0.92104592, 0.90867347, 0.90663265, 0.90667517],
       [0.90871599, 0.90046769, 0.91071429, 0.91483844, 0.90663265],
       [0.9107568 , 0.8962585 , 0.89621599, 0.8962585 , 0.89409014],
       [0.89834184, 0.90238095, 0.89421769, 0.89621599, 0.89009354],
       [0.89421769, 0.90042517, 0.8982568 , 0.88805272, 0.89621599],
       [0.89217687, 0.89409014, 0.89213435, 0.8982568 , 0.91071429],
       [0.9025085 , 0.90659014, 0.90871599, 0.90459184, 0.91488095],
       [0.90238095, 0.89829932, 0.90863095, 0.90454932, 0.89834184],
       [0.91084184, 0.90659014, 0.90246599, 0.90459184, 0.90242347],
       [0.90459184, 0.90042517, 0.

In [41]:
grid.best_params_

{'clf__C': 100,
 'reduce_dim': TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
        random_state=None, tol=0.0),
 'reduce_dim__n_components': 50,
 'vect__binary': False,
 'vect__max_df': 0.7,
 'vect__min_df': 3,
 'vect__ngram_range': (1, 1)}

In [42]:
grid.best_score_

0.921045918367347

In [43]:
best_estimator = grid.best_estimator_
print (best_estimator)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])


In [44]:
#use the best model to predict a future article
x = ['''The use of bibliography enriched features for automatic citation screening Citation screening (also called study selection) is a phase of systematic review process that has attracted 
a growing interest on the use of text mining (TM) methods to support it to reduce time and effort. Search results are usually imbalanced between the relevant and the irrelevant classes of returned citations. Class imbalance among other factors has been a persistent problem that impairs the performance of TM models, particularly in the context of automatic citation screening for systematic reviews. This has often caused the performance of classification models using the basic title and abstract data to ordinarily fall short of expectations.
In this study, we explore the effects of using full bibliography data in addition to title and abstract on text 
classification performance for automatic citation screening.
We experiment with binary and Word2vec feature representations and SVM models using 4 software engineering (SE) 
and 15 medical review datasets. We build and compare 3 types of models (binary-non-linear, Word2vec-linear and 
Word2vec-non-linear kernels) with each dataset using the two feature sets.

The bibliography enriched data exhibited consistent improved performance in terms of recall, work saved over 
sampling (WSS) and Matthews correlation coefficient (MCC) in 3 of the 4 SE datasets that are fairly large in size. For the medical datasets, the results vary, however in the majority of cases the performance is the same or better.
Inclusion of the bibliography data provides the potential of improving the performance of the models but to date 
results are inconclusive. Citation screening automation; Computing methodologies; Feature enrichment; Systematic reviews; Text mining''']

In [45]:
grid.best_estimator_.predict(x)

array([1])

#### Another example with Bernuoli NB

In [46]:
#Step 1: create a pipeline object combining all the steps in order
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('reduce_dim', None),
    ('clf', BernoulliNB())
])

#Step 2: set optional parameters for the different objects in the pipeline
FEATURE_SIZE = [50, 100, 250, 350, 500]
param_grid = [{
    'vect__max_df': (0.5, 0.7, 0.8),
    'vect__min_df': (2,3,5),
    'vect__binary': (True, False),
    'vect__ngram_range': [(1, 1), (1,2)],
    'reduce_dim': [TruncatedSVD(n_iter=5)],
    'reduce_dim__n_components': FEATURE_SIZE
},
{
    'vect__max_df': (0.5, 0.7, 0.8),
    'vect__min_df': (2,3,5),
    'vect__binary': (True, False),
    'vect__ngram_range': [(1, 1), (1,2)],
    'reduce_dim': [SelectKBest(chi2)],
    'reduce_dim__k': FEATURE_SIZE
}]

In [47]:
#pass the pipeline to an instance of the GridSearch CV
grid = GridSearchCV(pipeline, cv=5, n_jobs=1, param_grid=param_grid, iid=False)

#fit the GridSearchCV instance
grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...reduce_dim', None), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=False, n_jobs=1,
       param_grid=[{'vect__max_df': (0.5, 0.7, 0.8), 'vect__min_df': (2, 3, 5), 'vect__binary': (True, False), 'vect__ngram_range': [(1, 1), (1, 2)], 'reduce_dim': [TruncatedSVD(algorithm='randomized', n_components=500, n_iter=5,
       random_state=None, tol=0.0)], 'reduce_dim__n_components': [50, 100, 25...t(k=100, score_func=<function chi2 at 0x7fd0678b7e18>)], 'reduce_dim__k': [50, 100, 250, 350, 500]}],
       pre_dispatch='2*n_jobs', refit=Tru

In [48]:
grid.best_params_

{'reduce_dim': SelectKBest(k=100, score_func=<function chi2 at 0x7fd0678b7e18>),
 'reduce_dim__k': 100,
 'vect__binary': True,
 'vect__max_df': 0.5,
 'vect__min_df': 3,
 'vect__ngram_range': (1, 2)}

In [49]:
grid.best_score_

0.9065051020408165

In [50]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
 ... 0x7fd0678b7e18>)), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))])

In [51]:
grid.best_estimator_.predict(x)

array([1])