In [1]:
# import libraries
import numpy as np
import nltk
import os
import pandas as pd
import pickle
import re


nltk.download("stopwords")

from bs4 import BeautifulSoup
import joblib
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import cross_val_score, KFold, RandomizedSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier 

[nltk_data] Downloading package stopwords to /home/jorge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read data

In [2]:
# import data
impact_sentences = pd.read_csv("output/output_sentence_extractor.csv")

In [3]:
impact_sentences.head()

Unnamed: 0,sentence,source,label
0,Successful bids for European Union and Nationa...,ref,social_impact
1,The project has reached an international audie...,ref,social_impact
2,Research carried out by the University of Gree...,ref,social_impact
3,"Moreover, the research has informed the govern...",ref,social_impact
4,The findings have provided policy-makers in ed...,ref,social_impact


In [4]:
impact_sentences.shape

(1028, 3)

## Preparing data

Add a numeric column to represent the textual label

In [5]:
impact_sentences['num_label'] = np.where(impact_sentences['label'] == 'social_impact', 1, 0)

In [6]:
impact_sentences.head()

Unnamed: 0,sentence,source,label,num_label
0,Successful bids for European Union and Nationa...,ref,social_impact,1
1,The project has reached an international audie...,ref,social_impact,1
2,Research carried out by the University of Gree...,ref,social_impact,1
3,"Moreover, the research has informed the govern...",ref,social_impact,1
4,The findings have provided policy-makers in ed...,ref,social_impact,1


Store sentences and labels into separate variables

In [7]:
data, labels = impact_sentences['sentence'], impact_sentences['num_label']

## Processing data

Remove any html formatting and any non-alpha numeric characters that may appear in the sentences

In [8]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def sentence_to_words(sentence, steeming=False, lemmatization=False):
    text = BeautifulSoup(sentence, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", sentence.lower()) # Convert to lower case
    words = word_tokenize(text)   # Split string into words
    words = [word for word in words if word not in stopwords.words("english")] # Remove stopwords
    if steeming:    
        words = [stemmer.stem(word) for word in words]
    if lemmatization:
        words = [lemmatizer.lemmatize(word) for word in words]
    
    return words

In [9]:
cache_dir = "cache"  # directory to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(sentences, labels, cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # Try to read data from cache first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache
    else:
        cache_file = 'preprocessed_data.pkl'
    
    # If cache is missing, then do the processing
    if cache_data is None:
        # Preprocess data to obtain words for each sentence
        words = [sentence_to_words(sentence) for sentence in sentences]
        # Write to cache file for future use
        cache_data = dict(words=words, labels=labels)
        with open(os.path.join(cache_dir, cache_file), "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words, labels = (cache_data['words'], cache_data['labels'])
    
    return words, labels

In [10]:
# Preprocess data
data, labels = preprocess_data(data, labels)

Read preprocessed data from cache file: preprocessed_data.pkl


In [11]:
data[0]

['successful',
 'bids',
 'european',
 'union',
 'national',
 'heritage',
 'lottery',
 'funding',
 'ensured',
 'impact',
 'international',
 'scope',
 'results',
 'disseminated',
 'via',
 'websites',
 'print',
 'publications',
 'media',
 'widest',
 'constituencies']

### Transform to Bag-of-Words Representation

Transform each sentence into a Bag-of-Words feature representation.

In [12]:
def do_BoW_transformation(sentences, 
                          max_features=None,
                          transformation='tc',  # it can be either 'tc' (term_count), 'tf', or 'tfidf'
                          ngram_range=(1,1)):
    if transformation == 'tc':
        vectorizer = CountVectorizer(max_features=max_features, ngram_range=ngram_range, 
                                     preprocessor=lambda x: x, tokenizer=lambda x: x,
                                     lowercase=False)
    elif transformation == 'tf':
        vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range, 
                                     preprocessor=lambda x: x, tokenizer=lambda x: x,
                                     use_idf=False, lowercase=False)
    elif transformation == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range, 
                                     preprocessor=lambda x: x, tokenizer=lambda x: x,
                                     lowercase=False)
    bow_features = vectorizer.fit_transform(sentences).toarray()
    
    return bow_features, vectorizer.vocabulary_

In [13]:
def extract_BoW_features(sentences, 
                         vocabulary_size=None,
                         transformation='tc',  # it can be either 'tc' (term_count), 'tf', or 'tfidf'
                         ngram_range=(1,1),
                         cache_dir=cache_dir, 
                         cache_file=None):
    """Extract Bag-of-Words for a given set of documents, already preprocessed into words."""
    
    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = joblib.load(f)
            print("Read features from cache file:", cache_file)
        except:
            pass
    else:
        cache_file = 'bow_features.pkl'
    
    # If cache is missing, then do the processing
    if cache_data is None:
        features, vocabulary = do_BoW_transformation(sentences, 
                                                     vocabulary_size, 
                                                     transformation, 
                                                     ngram_range)
        cache_data = dict(features=features, vocabulary=vocabulary)
        with open(os.path.join(cache_dir, cache_file), "wb") as f:
            joblib.dump(cache_data, f)  # joblib is an enhanced version of pickle that is more efficient for storing NumPy arrays
        print("Wrote features to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        features, vocabulary = (cache_data['features'], cache_data['vocabulary'])
    
    # Return both the extracted features as well as the vocabulary
    return features, vocabulary

#### Text transformations

In [14]:
transformations = ['tc', 'tf', 'tfidf']
transformed_txt = dict()
max_features=[100, 200, 400, 500, 800]
for transformation in transformations:
    transformed_txt[transformation] = dict()
    for max_feature in max_features:
        cache_file_name = 'bow_features_{0}_{1}.pkl'.format(transformation, max_feature)
        data_transformed, _ = extract_BoW_features(data, 
                                                   vocabulary_size=max_feature, 
                                                   cache_file=cache_file_name, 
                                                   transformation=transformation) 
        transformed_txt[transformation][str(max_feature)] = data_transformed

Read features from cache file: bow_features_tc_100.pkl
Read features from cache file: bow_features_tc_200.pkl
Read features from cache file: bow_features_tc_400.pkl
Read features from cache file: bow_features_tc_500.pkl
Read features from cache file: bow_features_tc_800.pkl
Read features from cache file: bow_features_tf_100.pkl
Read features from cache file: bow_features_tf_200.pkl
Read features from cache file: bow_features_tf_400.pkl
Read features from cache file: bow_features_tf_500.pkl
Read features from cache file: bow_features_tf_800.pkl
Read features from cache file: bow_features_tfidf_100.pkl
Read features from cache file: bow_features_tfidf_200.pkl
Read features from cache file: bow_features_tfidf_400.pkl
Read features from cache file: bow_features_tfidf_500.pkl
Read features from cache file: bow_features_tfidf_800.pkl


Let's do some sanity check

In [15]:
transformed_txt['tf']['200'][0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.40824829, 0.        ,
       0.        , 0.        , 0.40824829, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

## Train and test ML algorithms with the differente text transformations

Trainining, testing, and validating machine learning algorithms that are reported to perform well on unbalanced, small, and textual datasets.

In [16]:
def get_classifier(algorithm_name):
    if algorithm_name == 'NB':
        classifier = GaussianNB()
    elif algorithm_name == 'SVMR':
        classifier = SVC(kernel='rbf', gamma='auto')
    elif algorithm_name == 'SVML':
        classifier = SVC(kernel='linear', gamma='auto')
    elif algorithm_name == 'LR1':
        classifier = LogisticRegression(penalty='l1', solver='liblinear')
    elif algorithm_name == 'LR2':
        classifier = LogisticRegression(penalty='l2', solver='liblinear')
    elif algorithm_name == 'RF':
        params = {
            'n_estimators': 600,
            'min_samples_split': 5,
            'min_samples_leaf': 1,
            'max_features': 'auto',
            'max_depth': 50,
            'bootstrap': True
        }
        classifier = RandomForestClassifier(**params)
    elif algorithm_name == 'XGB':
        classifier = XGBClassifier(objective="binary:logistic", eta=0.2, gamma=4, min_child_weight=6)
    else:
        print("Unknown algorithm: {0}",format(algorithm_name))

    return classifier

In [22]:
def do_cross_validation(kfold, classifier, data, labels):
    scores = {
        'balanced_accuracy': [],
        'f1': [],
        'recall': [],
        'precision': []
    }
    for train_index, test_index in kfold.split(data):   
        kf_X_train, kf_X_test = data[train_index], data[test_index]
        kf_y_train, kf_y_test = labels[train_index], labels[test_index]        
        classifier.fit(kf_X_train, kf_y_train)
        y_pred = classifier.predict(kf_X_test)
        scores['balanced_accuracy'].append(metrics.balanced_accuracy_score(kf_y_test, y_pred))
        scores['f1'].append(metrics.f1_score(kf_y_test, y_pred))
        scores['recall'].append(metrics.recall_score(kf_y_test, y_pred))
        scores['precision'].append(metrics.precision_score(kf_y_test, y_pred))
    return scores

In [23]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
algorithms = ['NB', 'SVMR', 'SVML', 'LR1', 'LR2', 'RF', 'XGB']
outputs = [] 

for transformation in transformations:
    for max_feature in max_features:
        t_data = transformed_txt[transformation][str(max_feature)]
        for algorithm in algorithms:
            print('Transformation: {0}, Max Features: {1}, Algorithm: {2}'.format(transformation, max_feature, 
                                                                                  algorithm))
            classifier = get_classifier(algorithm)
            scores = do_cross_validation(kfold, classifier, t_data, labels)
            outputs.append(
                {
                    'algorithm': algorithm,
                    'transformation': transformation,
                    'max_features': max_feature,
                    'metrics': scores
                }
            )

Transformation: tc, Max Features: 100, Algorithm: NB
Transformation: tc, Max Features: 100, Algorithm: SVMR
Transformation: tc, Max Features: 100, Algorithm: SVML
Transformation: tc, Max Features: 100, Algorithm: LR1
Transformation: tc, Max Features: 100, Algorithm: LR2
Transformation: tc, Max Features: 100, Algorithm: RF
Transformation: tc, Max Features: 100, Algorithm: XGB
Transformation: tc, Max Features: 200, Algorithm: NB
Transformation: tc, Max Features: 200, Algorithm: SVMR
Transformation: tc, Max Features: 200, Algorithm: SVML
Transformation: tc, Max Features: 200, Algorithm: LR1
Transformation: tc, Max Features: 200, Algorithm: LR2
Transformation: tc, Max Features: 200, Algorithm: RF
Transformation: tc, Max Features: 200, Algorithm: XGB
Transformation: tc, Max Features: 400, Algorithm: NB
Transformation: tc, Max Features: 400, Algorithm: SVMR
Transformation: tc, Max Features: 400, Algorithm: SVML
Transformation: tc, Max Features: 400, Algorithm: LR1
Transformation: tc, Max Fea

Save results into a dataframe

In [24]:
output_df = pd.DataFrame(columns=['algorithm', 'transformation', 'max_features', 'balanced_accuracy', 'f1'])
for output in outputs:
    row = {
        'algorithm': output['algorithm'],
        'transformation': output['transformation'],
        'max_features': output['max_features'],
        'mean_balanced_accuracy': round(np.array(output['metrics']['balanced_accuracy']).mean(), 2),
        'mean_recall': round(np.array(output['metrics']['recall']).mean(), 2),
        'mean_precision': round(np.array(output['metrics']['precision']).mean(), 2),
        'mean_f1': round(np.array(output['metrics']['f1']).mean(), 2),
    }
    output_df = output_df.append(row, ignore_index=True)

In [25]:
output_df.head()

Unnamed: 0,algorithm,transformation,max_features,balanced_accuracy,f1,recall
0,NB,tc,100,0.65,0.51,0.55
1,SVMR,tc,100,0.51,0.04,0.02
2,SVML,tc,100,0.63,0.45,0.36
3,LR1,tc,100,0.63,0.45,0.35
4,LR2,tc,100,0.63,0.45,0.36


Look at the maximum Balanced Accuracy

In [26]:
max_balanced_accuracy = output_df['mean_balanced_accuracy'].max()
output_df[output_df['mean_balanced_accuracy'] == max_balanced_accuracy]

Unnamed: 0,algorithm,transformation,max_features,balanced_accuracy,f1,recall
9,SVML,tc,200,0.66,0.5,0.45
17,LR1,tc,400,0.66,0.5,0.43
24,LR1,tc,500,0.66,0.5,0.43


Inspect the maximum Recall

In [28]:
max_recall = output_df['mean_recall'].max()
output_df[output_df['mean_recall'] == max_recall]

Unnamed: 0,algorithm,transformation,max_features,balanced_accuracy,f1,recall
21,NB,tc,500,0.51,0.42,0.74


Inspect the maximum Precision

In [28]:
max_recall = output_df['mean_precision'].max()
output_df[output_df['mean_precision'] == max_recall]

Unnamed: 0,algorithm,transformation,max_features,balanced_accuracy,f1,recall
21,NB,tc,500,0.51,0.42,0.74


Inspect the maximun F1

In [27]:
max_f1 = output_df['mean_f1'].max()
output_df[output_df['mean_f1'] == max_f1]

Unnamed: 0,algorithm,transformation,max_features,balanced_accuracy,f1,recall
0,NB,tc,100,0.65,0.51,0.55


Save results to a csv

In [29]:
output_df.to_csv('./experiments/e_06022020.csv', index=False)

### Split data

Split the data in train and test datasets.

In [170]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(data_tc, labels, test_size=0.20)

Check size of datasets

In [171]:
print("Train: {0} records".format(len(X_train)))

Train: 822 records


In [172]:
print("Test: {0} records".format(len(X_test)))

Test: 206 records


Check the percentage of true cases in the datasets

In [173]:
print("Percentage of true cases in train: {0}%".format(round(100*len(Y_train[Y_train==1])/len(Y_train),0)))

Percentage of true cases in train: 29.0%


In [212]:
print("Percentage of true cases in test: {0}%".format(round(100*len(Y_test[Y_test==1])/len(Y_test), 0)))

Percentage of true cases in test: 30.0%


## Train, Test, and Validate ML Models

Trainining, testing, and validating machine learning algorithms that are reported to perform well on unbalanced, small, and textual datasets.

### Naive-Bayes

#### Train

In [144]:
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

#### Test

In [145]:
pred = nb_classifier.predict(X_test)

Confusion Matrix

In [146]:
conf_matrix = metrics.confusion_matrix(Y_test, pred)
print(conf_matrix)

[[85 65]
 [17 39]]


F1-Score

In [147]:
f1_score = metrics.f1_score(Y_test, pred)
print(f1_score)

0.48750000000000004


Balanced Accuracy

In [148]:
b_accuracy = metrics.balanced_accuracy_score(Y_test, pred)
print(b_accuracy)

0.631547619047619


### Support Vector Machine (RBF Kernel)

#### Train

In [139]:
svm_rbf_classifier = SVC(kernel='rbf', gamma='auto')
svm_rbf_classifier.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

#### Test

In [140]:
pred = svm_rbf_classifier.predict(X_test)

Confusion Matrix

In [141]:
conf_matrix = metrics.confusion_matrix(Y_test, pred)
print(conf_matrix)

[[150   0]
 [ 56   0]]


F1-Score

In [142]:
f1_score = metrics.f1_score(Y_test, pred)
print(f1_score)

0.0


Balanced Accuracy

In [143]:
b_accuracy = metrics.balanced_accuracy_score(Y_test, pred)
print(b_accuracy)

0.5


### Support Vector Machine (Linear Kernel)

#### Train

In [134]:
svm_linear_classifier = SVC(kernel='linear', gamma='auto')
svm_linear_classifier.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

#### Test

In [135]:
pred = svm_linear_classifier.predict(X_test)

Confusion Matrix

In [136]:
conf_matrix = metrics.confusion_matrix(Y_test, pred)
print(conf_matrix)

[[131  19]
 [ 30  26]]


F1-Score

In [137]:
f1_score = metrics.f1_score(Y_test, pred)
print(f1_score)

0.5148514851485149


Balanced Accuracy

In [138]:
b_accuracy = metrics.balanced_accuracy_score(Y_test, pred)
print(b_accuracy)

0.6688095238095237


### XGBoost

#### Train

In [129]:
xgb_classifier = XGBClassifier(objective="binary:logistic", eta=0.2, gamma=4, min_child_weight=6)
xgb_classifier.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.2, gamma=4,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

#### Test

In [130]:
pred = xgb_classifier.predict(X_test)

Confusion Matrix

In [131]:
conf_matrix = metrics.confusion_matrix(Y_test, pred)
print(conf_matrix)

[[147   3]
 [ 49   7]]


F1-Score

In [132]:
f1_score = metrics.f1_score(Y_test, pred)
print(f1_score)

0.21212121212121213


Balanced Accuracy

In [133]:
b_accuracy = metrics.balanced_accuracy_score(Y_test, pred)
print(b_accuracy)

0.5525


### Random Forest

#### Train

In [175]:
rf_classifier = RandomForestClassifier(n_estimators=1000)
rf_classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

#### Test

In [176]:
pred = rf_classifier.predict(X_test)

Confusion Matrix

In [177]:
conf_matrix = metrics.confusion_matrix(Y_test, pred)
print(conf_matrix)

[[129  15]
 [ 38  24]]


F1-Score

In [178]:
f1_score = metrics.f1_score(Y_test, pred)
print(f1_score)

0.47524752475247517


Balanced Accuracy

In [179]:
b_accuracy = metrics.balanced_accuracy_score(Y_test, pred)
print(b_accuracy)

0.6414650537634409


Cohen-kappa

In [181]:
cohen_kappa = metrics.cohen_kappa_score(Y_test, pred)
print(cohen_kappa)

0.3163431433938635


#### Validation

In [160]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = []

for train_index, test_index in kfold.split(data_tc):   
    kf_X_train, kf_X_test = data_tc[train_index], data_tc[test_index]
    kf_y_train, kf_y_test = labels[train_index], labels[test_index]
    rf_classifier = RandomForestClassifier(n_estimators=1000)
    rf_classifier.fit(kf_X_train, kf_y_train)
    y_pred = rf_classifier.predict(kf_X_test)
    scores.append(metrics.balanced_accuracy_score(kf_y_test, y_pred))
    
print(scores)

[0.5698839662447257, 0.6281453867660765, 0.6195378151260504, 0.6764964788732395, 0.7157534246575342, 0.5307125307125308, 0.5481735159817351, 0.5743589743589743, 0.5970713273500237, 0.6589513462446859]


In [162]:
np.array(scores).mean()

0.6119084766315577

In [167]:
from sklearn.model_selection import cross_val_score
rf_classifier = RandomForestClassifier(n_estimators=1000)
scores = cross_val_score(rf_classifier,data_tc, labels, scoring="balanced_accuracy", cv=10)
print(scores)

[0.67853881 0.51392694 0.60205479 0.64908676 0.56187215 0.62853881
 0.6109589  0.57945205 0.69027778 0.57222222]


In [168]:
np.array(scores).mean()

0.6086929223744293

#### Hyperparameter tuning

In [186]:
# Set the parameters
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 7, 10]
min_samples_leaf = [1, 2, 3]
bootstrap = [True, False]
# Set parameters grid 
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'min_samples_split': [2, 5, 7, 10], 'min_samples_leaf': [1, 2, 3], 'bootstrap': [True, False]}


In [193]:
rf_classifier = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf_classifier, param_distributions = param_grid, 
                               n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, Y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.1min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [194]:
rf_random.best_params_

{'n_estimators': 600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': True}

In [195]:
pred = rf_random.predict(X_test)

Confusion Matrix

In [196]:
conf_matrix = metrics.confusion_matrix(Y_test, pred)
print(conf_matrix)

[[131  13]
 [ 38  24]]


F1-Score

In [197]:
f1_score = metrics.f1_score(Y_test, pred)
print(f1_score)

0.48484848484848475


Balanced Accuracy

In [198]:
b_accuracy = metrics.balanced_accuracy_score(Y_test, pred)
print(b_accuracy)

0.6484094982078853


Cohen-kappa

In [199]:
cohen_kappa = metrics.cohen_kappa_score(Y_test, pred)
print(cohen_kappa)

0.3353157028976338


### Logistic Regression

#### Train

In [154]:
lor_classifier = LogisticRegression(penalty='l2')
lor_classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Test

In [155]:
pred = lor_classifier.predict(X_test)

Confusion Matrix

In [156]:
conf_matrix = metrics.confusion_matrix(Y_test.values, pred)
print(conf_matrix)

[[133  17]
 [ 32  24]]


F1-Score

In [157]:
f1_score = metrics.f1_score(Y_test, pred)
print(f1_score)

0.4948453608247423


Balanced Accuracy

In [159]:
b_accuracy = metrics.balanced_accuracy_score(Y_test, pred)
print(b_accuracy)

0.6576190476190477
