<h1 align='center'>AutoSynthesis study group</h1>
<h2 align='center'> Session 4 - Cross Validation </h2>
<h3 align='right'> 26th April 2019 </h3>
<h3 align='right'> Kazeem </h3>

### Load libraries

In [1]:
from __future__ import print_function

import logging
print(__doc__)
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB, BernoulliNB
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn_evaluation import plot


print ('Packages import successful')

Automatically created module for IPython interactive environment
Packages import successful


### Load your dataset

In [2]:
data = pd.read_csv('autosynthesis_session3.csv') #set the data path relative to your system and file location
print ('Dataset loaded successfully')
data.head(5) #view some samples

le = LabelEncoder()
data['labels'] = le.fit_transform(data['label'])
X = data[['Title', 'Abstract', 'Keywords']].apply(lambda x: '{} {} {}'.format(x[0], x[1], x[2]), axis=1)
y = data['labels']

Dataset loaded successfully


### preprocessing

In [3]:
#optionally write custom preprocessing method.....WHY?
def preprocessor(text):
    #text = text.apply(lambda x: ' '.join(x.lower().replace('[^\w\s]','') for x in str(x).split() if not x in set(stopwords.words('english')) and not x.isdigit()))
    
    # split into words
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    #from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words and len(w) > 3]
    
    return ' '.join(words) #return the cleaned text string separated by spaces

#### Build model with cross validation

In [39]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=None)
#skf = RepeatedStratifiedKFold(n_splits = 10, n_repeats=10, random_state=None)

svm_recall = []
svm_accuracy = []
svm_precision = []
bnb_recall = []
bnb_accuracy = []
bnb_precision = []

svm_clf = SVC(C = 10, kernel = 'linear', class_weight=None, gamma = 'scale', random_state=None)
bnb_clf = BernoulliNB()
print("---------------USING TFIDF REPRESENTATION---------------\n")
for train_index, test_index in skf.split(X, y):
    #print("Train:", train_index, "Validation:", test_index) 
    X_train, X_test = X[train_index], X[test_index] 
    y_train, y_test = y[train_index], y[test_index]
    
    #preprocess
    X_train = X_train.apply(lambda x: preprocessor(x))
    X_test = X_test.apply(lambda x: preprocessor(x))
    
    #feature representation
    tfidf_encoder = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=3, ngram_range=(1, 1))
    X_train = tfidf_encoder.fit_transform(X_train)
    X_test = tfidf_encoder.transform(X_test)
    
    #select features
    bestfeatures = SelectKBest(score_func=chi2, k=500)
    X_train = bestfeatures.fit_transform(X_train,y_train)
    X_test = bestfeatures.transform(X_test)
    
    svm_model = clf.fit(X_train, y_train)
    svm_predictions = svm_model.predict(X_test)
    
    svm_recall.append(recall_score(y_test, svm_predictions, average= 'micro'))
    svm_accuracy.append(accuracy_score(y_test, svm_predictions))
    svm_precision.append(precision_score(y_test, svm_predictions, average= 'micro'))
    

    
    ##Bernoulli
    bnb_model = bnb_clf.fit(X_train.toarray(), y_train)
    bnb_predictions = bnb_model.predict(X_test.toarray())
    
    bnb_recall.append(recall_score(y_test, svm_predictions, average= 'micro'))
    bnb_accuracy.append(accuracy_score(y_test, svm_predictions))
    bnb_precision.append(precision_score(y_test, svm_predictions, average= 'micro'))
    
print("---------------SVM PERFORMANCE-----------------")
print(f'SVM Mean Accuracy = {np.mean(svm_accuracy):.2f} +/- {np.std(svm_accuracy):.2f}')
print(f"SVM Mean Recall = {np.mean(svm_recall):.2f} +/- {np.std(svm_recall):.2f}")
print(f'SVM Mean Precision = {np.mean(svm_precision):.2f} +/- {np.std(svm_precision):.2f}')
    
print("\n")
print("--------------BNB PERFORMANCE-----------------")
print(f'SVM Mean Accuracy = {np.mean(bnb_accuracy):.2f} +/- {np.std(bnb_accuracy):.2f}')
print(f"SVM Mean Recall = {np.mean(bnb_recall):.2f} +/- {np.std(bnb_recall):.2f}")
print(f'SVM Mean Precision = {np.mean(bnb_precision):.2f} +/- {np.std(bnb_precision):.2f}')

---------------USING TFIDF REPRESENTATION---------------
---------------SVM PERFORMANCE-----------------
SVM Mean Accuracy = 0.90 +/- 0.04
SVM Mean Recall = 0.90 +/- 0.04
SVM Mean Precision = 0.90 +/- 0.04


--------------BNB PERFORMANCE-----------------
SVM Mean Accuracy = 0.90 +/- 0.04
SVM Mean Recall = 0.90 +/- 0.04
SVM Mean Precision = 0.90 +/- 0.04


In [43]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=None)
#skf = RepeatedStratifiedKFold(n_splits = 10, n_repeats=10, random_state=None)

svm_recall = []
svm_accuracy = []
svm_precision = []
bnb_recall = []
bnb_accuracy = []
bnb_precision = []

svm_clf = SVC(C = 10, kernel = 'linear', class_weight=None, gamma = 'scale', random_state=None)
bnb_clf = BernoulliNB()
print("-----------------USING BINARY REPRESENTATION---------------\n")
for train_index, test_index in skf.split(X, y):
    #print("Train:", train_index, "Validation:", test_index) 
    X_train, X_test = X[train_index], X[test_index] 
    y_train, y_test = y[train_index], y[test_index]
    
    #preprocess
    X_train = X_train.apply(lambda x: preprocessor(x))
    X_test = X_test.apply(lambda x: preprocessor(x))
    
    #feature representation
    binary_encoder = TfidfVectorizer(stop_words='english', binary = True, max_df=0.8, min_df=3, ngram_range=(1, 1))
    X_train = binary_encoder.fit_transform(X_train)
    X_test = binary_encoder.transform(X_test)
    
    #select features
    bestfeatures = SelectKBest(score_func=chi2, k=500)
    X_train = bestfeatures.fit_transform(X_train,y_train)
    X_test = bestfeatures.transform(X_test)
    
    svm_model = clf.fit(X_train, y_train)
    svm_predictions = svm_model.predict(X_test)
    
    svm_recall.append(recall_score(y_test, svm_predictions, average= 'micro'))
    svm_accuracy.append(accuracy_score(y_test, svm_predictions))
    svm_precision.append(precision_score(y_test, svm_predictions, average= 'micro'))
    

    
    ##Bernoulli
    bnb_model = bnb_clf.fit(X_train.toarray(), y_train)
    bnb_predictions = bnb_model.predict(X_test.toarray())
    
    bnb_recall.append(recall_score(y_test, svm_predictions, average= 'micro'))
    bnb_accuracy.append(accuracy_score(y_test, svm_predictions))
    bnb_precision.append(precision_score(y_test, svm_predictions, average= 'micro'))
    
print("---------------SVM PERFORMANCE-----------------")
print(f'SVM Mean Accuracy = {np.mean(svm_accuracy):.2f} +/- {np.std(svm_accuracy):.2f}')
print(f"SVM Mean Recall = {np.mean(svm_recall):.2f} +/- {np.std(svm_recall):.2f}")
print(f'SVM Mean Precision = {np.mean(svm_precision):.2f} +/- {np.std(svm_precision):.2f}')
    
print("\n")
print("---------------BNB PERFORMANCE-----------------")
print(f'SVM Mean Accuracy = {np.mean(bnb_accuracy):.2f} +/- {np.std(bnb_accuracy):.2f}')
print(f"SVM Mean Recall = {np.mean(bnb_recall):.2f} +/- {np.std(bnb_recall):.2f}")
print(f'SVM Mean Precision = {np.mean(bnb_precision):.2f} +/- {np.std(bnb_precision):.2f}')

-----------------USING BINARY REPRESENTATION---------------

---------------SVM PERFORMANCE-----------------
SVM Mean Accuracy = 0.90 +/- 0.03
SVM Mean Recall = 0.90 +/- 0.03
SVM Mean Precision = 0.90 +/- 0.03


---------------BNB PERFORMANCE-----------------
SVM Mean Accuracy = 0.90 +/- 0.03
SVM Mean Recall = 0.90 +/- 0.03
SVM Mean Precision = 0.90 +/- 0.03
