# Machine Learning

---

In [None]:
import pandas as pd
import numpy as np

import re

from plotnine import *
import matplotlib.pyplot as plt
from matplotlib import gridspec

from IPython.display import display

# Show full cell contents (no truncation) when displaying text columns.
# The full option name is used: abbreviated keys are resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', None)

## Datasets

---

In [None]:
# Load the 100mentors dataset and collapse it to one row per text,
# collecting all the tags of a text into a frozenset.
hm = pd.read_csv('./100mentors_finallized.csv')
hmf = (
    hm.groupby(by='text')['tag']
    .apply(frozenset)
    .reset_index(name='tags')
)
hmf.head(3)

In [None]:
# Load the Wikipedia dataset and collapse it to one row per text,
# collecting all the tags of a text into a frozenset.
wk = pd.read_csv('./wikipedia_finallized.csv')
wkf = (
    wk.groupby(by='text')['tag']
    .apply(frozenset)
    .reset_index(name='tags')
)
wkf.head(1)

In [None]:
import itertools

# Tag vocabulary of each dataset
HM_CLASSES = frozenset(tag for tags in hmf['tags'] for tag in tags)
WK_CLASSES = frozenset(tag for tags in wkf['tags'] for tag in tags)

# Union of both vocabularies, and the tags each dataset is missing from it
CLASSES = HM_CLASSES.union(WK_CLASSES)
NOT_HM_CLASSES = CLASSES.difference(HM_CLASSES)
NOT_WK_CLASSES = CLASSES.difference(WK_CLASSES)

# Header, blank line, then one class per line in alphabetical order
print(
    f'*** All Classes ({len(CLASSES)}) ***',
    '',
    '\n'.join(sorted(CLASSES)),
    sep='\n',
)

## Preprocessing

---

In [None]:
import nltk

# Download the NLTK data packages requested by this notebook:
# 'popular' and 'tagsets' (presumably the latter backs the tag-set help
# printed in the next cell -- TODO confirm).
nltk.download('popular')
nltk.download('tagsets')

In [None]:
# Reference only: print NLTK's description of the Penn Treebank POS tags
# (the tag prefixes NN/VB/JJ/RB are what `get_wn_pos` matches on).
nltk.help.upenn_tagset()

In [None]:
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words as a set, for fast membership tests in `validate`
STOPWORDS = set(stopwords.words('english'))
# Shared lemmatizer instance used by `preprocess`
wnl = WordNetLemmatizer()

# https://wordnet.princeton.edu/documentation/wndb5wn
def get_wn_pos(tag):
    """
    Map a POS tag to the single-letter WordNet part of speech.

    tag [str]: POS tag, matched by its prefix (`NN`, `VB`, `JJ`, `RB`)

    Returns 'n' (noun), 'v' (verb), 'a' (adjective) or 'r' (adverb);
    any other tag falls back to 'n', the default of lemmatize().
    """
    prefix_to_wn_pos = (
        ('NN', 'n'),  # noun
        ('VB', 'v'),  # verb
        ('JJ', 'a'),  # adjective
        ('RB', 'r'),  # adverb
    )
    for prefix, wn_pos in prefix_to_wn_pos:
        if tag.startswith(prefix):
            return wn_pos
    return 'n'  # noun => default of lemmatize()

def validate(token):
    """
    Tell whether a token should be kept.

    A token is kept when it is longer than one character, purely
    alphabetic and not one of the STOPWORDS.
    """
    if len(token) <= 1:
        return False
    if not token.isalpha():
        return False
    return token not in STOPWORDS

def preprocess(text):
    """
    Normalize a raw text into a space-separated string of lemmas.

    Steps: drop non-ASCII characters, strip hashtags, lowercase, tokenize,
    POS-tag, discard the tokens rejected by `validate` and lemmatize the
    remaining ones with the WordNet part of speech given by `get_wn_pos`.
    """
    ascii_only = text.encode('ascii', 'ignore').decode('utf-8')  # remove non-english chars
    without_hashtags = re.sub(r'#[A-Za-z0-9_]+', '', ascii_only)  # remove hashtags
    tagged_words = pos_tag(word_tokenize(without_hashtags.lower()))

    lemmas = [
        wnl.lemmatize(word, get_wn_pos(pos))
        for word, pos in tagged_words
        if validate(word)
    ]

    return ' '.join(lemmas)

In [None]:
# Show the first 100mentors text before and after preprocessing,
# each under an underlined heading and separated by a blank line.
ex = hmf['text'].values[0]

print('Before', '=' * (len('Before') + 1), ex, '', sep='\n')
print('After', '=' * (len('After') + 1), preprocess(ex), sep='\n')

## Model training with "scikit-learn" & "scikit-multilearn"

---

In [None]:
import warnings

from tempfile import mkdtemp
from shutil import rmtree
from joblib import load, dump
from time import time

import itertools
import collections

# from imblearn.pipeline import Pipeline as ImbPipeline
# from imblearn.over_sampling import SMOTE

# https://github.com/scikit-learn-contrib/imbalanced-learn/issues/340
# from skmultilearn.problem_transform import LabelPowerset

from scipy.sparse import csr_matrix

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.ensemble import RakelD, RakelO

from sklearn.base import BaseEstimator
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# https://scikit-learn.org/stable/modules/model_evaluation.html#multiclass-and-multilabel-classification
from sklearn import metrics

In [None]:
# Transformers
# Feature-extraction steps of the pipeline, in the order `execute` chains them.
# Each entry provides:
#   'id'             -> pipeline step name, also the prefix of its parameter keys
#   'name'           -> human-readable label
#   'model'          -> estimator instance placed in the pipeline
#   'parameters'     -> single-valued settings used by `execute(..., pgs=False)`
#   'parameter_grid' -> list of grids combined by `execute(..., pgs=True)`
transformers = [
    {
        'id': 'cv',
        'name': 'Count Vectorizer',
        'model': CountVectorizer(),
        'parameters': {
            'cv__preprocessor': [preprocess],
            'cv__max_df': [.6],
            'cv__min_df': [2],
            'cv__binary': [False],
        },
        'parameter_grid': [
            # term counts: vary both document-frequency cut-offs
            {
                'cv__preprocessor': [preprocess],
                'cv__max_df': [.5, .6, .7],
                'cv__min_df': [1, 2, 5],
                'cv__binary': [False],
            },
            # binary presence: only the upper cut-off is varied
            {
                'cv__preprocessor': [preprocess],
                'cv__max_df': [.5, .6, .7],
                'cv__min_df': [1],
                'cv__binary': [True],
            },
        ],
    },
    {
        'id': 'tfidf',
        'name': 'TF-IDF Transformer',
        'model': TfidfTransformer(),
        'parameters': {
            'tfidf__norm': ['l2'],
            'tfidf__use_idf': [True],
            'tfidf__smooth_idf': [True],
        },
        # same single setting as 'parameters': nothing is searched for tf-idf
        'parameter_grid': [
            {
                'tfidf__norm': ['l2'],
                'tfidf__use_idf': [True],
                'tfidf__smooth_idf': [True],
            },
        ],
    },
]

In [None]:
# Approaches
# Multi-label strategies that wrap a base classifier; `execute` uses each
# model as the last pipeline step. Each approach provides:
#   'models' -> entries with the same keys as the transformers
#               ('id', 'name', 'model', 'parameters', 'parameter_grid')
#   'prefix' -> text inserted between the step id and the base classifier's
#               parameter name when `execute` builds the parameter keys,
#               e.g. 'br__classifier' vs. 'rakeld__base_classifier'
approaches = [
    {
        # Problem Transformers
        'id': 'pt',
        'name': 'Problem Transformers',
        'models': [
            {
                'id': 'br',
                'name': 'Binary Relevance',
                'model': BinaryRelevance(),
                'parameters': {},
                'parameter_grid': [{}],
            },
            {
                'id': 'cc',
                'name': 'Classifier Chain',
                'model': ClassifierChain(),
                'parameters': {},
                'parameter_grid': [{}],
            },
        ],
        'prefix': '',
    },
    {
        # Ensembles
        'id': 'ens',
        'name': 'Ensembles',
        'models': [
            {
                'id': 'rakeld',
                'name': 'RakelD',
                'model': RakelD(),
                'parameters': {
                    'rakeld__labelset_size': [3]
                },
                'parameter_grid': [
                    {
                        'rakeld__labelset_size': [3],
                    },
                ],
            },
            {
                # `execute` additionally sets 'rakelo__model_count' for this model
                'id': 'rakelo',
                'name': 'RakelO',
                'model': RakelO(),
                'parameters': {
                    'rakelo__labelset_size': [3]
                },
                'parameter_grid': [
                    {
                        'rakelo__labelset_size': [3],
                    },
                ],
            },   
        ],
        'prefix': 'base_',
    },
]

In [None]:
# Classifiers
# Base estimators plugged into every approach by `execute`. Each entry provides:
#   'id'                -> short id, used in model ids / file names and in `ex`
#   'name'              -> human-readable label
#   'model'             -> estimator instance handed to the approach
#   'parameters'        -> single-valued settings used by `execute(..., pgs=False)`
#   'parameter_grid'    -> list of grids searched by `execute(..., pgs=True)`
#   'supports_parallel' -> whether the grid search runs with n_jobs=-1
# The parameter keys are written relative to the approach ('classifier__...');
# `execute` prepends the step id and the approach's prefix.
base_classifiers = [
    {
        'id': 'sgd',
        'name': 'Stochastic Gradient Descent Classifier',
        'model': SGDClassifier(),
        'parameters': {},
        'parameter_grid': [
            {
                # NOTE(review): "log" was renamed "log_loss" in scikit-learn 1.1
                # and removed in 1.3 -- adjust to the installed version.
                'classifier__loss': ["hinge", "squared_hinge", "modified_huber", "log"],
                'classifier__penalty': ["l2"],
                'classifier__alpha': [1e-4],
            },
        ],
        'supports_parallel': True,
    },
    {
        'id': 'svm',
        'name': 'Support Vector Machine Classifier',
        'model': SVC(),
        'parameters': {
            'classifier__C': [1],
            'classifier__kernel': ['rbf'],
            'classifier__tol': [1e-3],
            'classifier__gamma': ['scale'],
        },
        'parameter_grid': [
            {
                'classifier__C': [1, 1.5, 2],
                'classifier__kernel': ['rbf', 'sigmoid'],
                'classifier__tol': [1e-4, 1e-3],
                'classifier__gamma': ['scale', 2],
            },
        ],
        'supports_parallel': False,
    },
    {
        'id': 'mnb',
        'name': 'Multinomial Naive Bayes Classifier',
        'model': MultinomialNB(),
        'parameters': {},
        'parameter_grid': [
            {
                'classifier__alpha': [1e-2, 1e-1, 1],
                'classifier__fit_prior': [False, True],
            },
        ],
        'supports_parallel': True,
    },
    {
        'id': 'knn',
        'name': 'K Nearest Neighbors Classifier',
        'model': KNeighborsClassifier(),
        'parameters': {
            'classifier__n_neighbors': [5],
            'classifier__weights': ['uniform'],
            'classifier__algorithm': ['auto'],
            'classifier__p': [2],
        },
        'parameter_grid': [
            {
                'classifier__n_neighbors': [1, 5],
                'classifier__weights': ['uniform', 'distance'],
                'classifier__algorithm': ['auto'],
                'classifier__p': [2],
            },
        ],
        'supports_parallel': True,
    },
    {
        'id': 'rf',
        'name': 'Random Forest Classifier',
        'model': RandomForestClassifier(),
        'parameters': {
            'classifier__n_estimators': [100],
            'classifier__criterion': ['gini'],
            'classifier__max_depth': [None],
            'classifier__min_samples_split': [10],
            # 'sqrt' is what 'auto' meant for classifiers; 'auto' was
            # deprecated in scikit-learn 1.1 and removed in 1.3
            'classifier__max_features': ['sqrt'],
        },
        'parameter_grid': [
            {
                'classifier__n_estimators': [100, 400],
                'classifier__criterion': ['gini', 'entropy'],
                'classifier__max_depth': [None, 10],
                'classifier__min_samples_split': [2, 4],
                'classifier__max_features': ['sqrt'],  # see note above
            },
        ],
        'supports_parallel': True,
    },
]

In [None]:
def execute(ds, ex=None, pgs=True, cv=3, error=np.nan, verbose=0, path=''):
    """
    Train, evaluate and persist every approach / base-classifier combination.

    For each model of each entry in `approaches` and each entry of
    `base_classifiers`, a pipeline (count vectorizer -> tf-idf -> approach)
    is fitted through GridSearchCV on the training split, scored on the
    test split and dumped to `<path><approach>_<model>_<base classifier>.gz`.

    ds [dict]: containing the keys `X_train`, `X_test`, `y_train`, `y_test`
        and their values, datasets
    ex [iterable(str)]: ids of the base classifiers to skip (default: none)
    pgs [bool]: GridSearch with param_grid; when False every component is
        fitted once with its fixed `parameters`
    cv: GridSearch's cv
    error ['raise', num]: GridSearch's error
    verbose [int]: GridSearch's verbose
    path [str]: prefix of the model ids and of the saved file names; its
        directory part is created when it does not exist

    Returns [list(dict)]: one entry per trained model with the keys `id`,
        `name`, `model` (the fitted GridSearchCV), `metrics` and `warnings`

    TODOs:
    - cached transformers in grid-search?, cross-validation problem?
    """
    import os  # local import: only needed to prepare the output directory

    skipped_ids = () if ex is None else ex
    models = []

    # create the output directory up front, so that a missing directory
    # does not make `dump` fail only after a (long) training has finished
    output_dir = os.path.dirname(path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # approaches
    for approach in approaches:

        # approaches' classifiers
        for classifier in approach['models']:
            title = f"{approach['name']} - {classifier['name']}"
            print(title)
            print("=" * len(title))

            # pipeline
            pipeline = Pipeline(steps=[
                (transformers[0]['id'], transformers[0]['model']),
                (transformers[1]['id'], transformers[1]['model']),
                (classifier['id'], classifier['model'])
            ])

            # extra parameters
            extra_parameters = {}
            if classifier['id'] == 'rakelo':
                # twice as many models as there are labels
                extra_parameters['rakelo__model_count'] = [ds['y_train'].shape[1] * 2]

            # prefix of every base-classifier parameter key,
            # e.g. 'br__' or 'rakeld__base_'
            key_prefix = f"{classifier['id']}__{approach['prefix']}"

            # base classifiers
            for base_classifier in base_classifiers:
                if base_classifier['id'] in skipped_ids:
                    continue

                model_id = f"{path}{approach['id']}_{classifier['id']}_{base_classifier['id']}"

                # parameter that plugs the base classifier into the approach
                base_classifier_parameter = {
                    f"{key_prefix}classifier": [base_classifier['model']]
                }

                # hide warnings (they are recorded and reported with the results)
                with warnings.catch_warnings(record=True) as ws:

                    # parameter grid
                    if pgs:
                        # base classifier's parameter grids
                        base_classifier_parameter_grids = [
                            {f"{key_prefix}{key}": value for key, value in grid.items()}
                            for grid in base_classifier['parameter_grid']
                        ]

                        # parameter grid combinations: one grid per element
                        # of the cartesian product of the components' grids
                        parameter_grid_combinations = itertools.product(
                            # transformers
                            transformers[0]['parameter_grid'],
                            transformers[1]['parameter_grid'],

                            # approach's classifier
                            classifier['parameter_grid'],
                            [extra_parameters],

                            # base classifier
                            [base_classifier_parameter],
                            base_classifier_parameter_grids,
                        )

                        parameter_grid = [
                            dict(collections.ChainMap(*combination))
                            for combination in parameter_grid_combinations
                        ]

                    else: # if not parameter grid search
                        parameter_grid = [{
                            # transformers
                            **transformers[0]['parameters'],
                            **transformers[1]['parameters'],

                            # approach's classifier
                            **classifier['parameters'],
                            **extra_parameters,

                            # base classifier
                            **base_classifier_parameter,
                            **{
                                f"{key_prefix}{key}": value
                                for key, value in base_classifier['parameters'].items()
                            },
                        }]

                    print(f"{base_classifier['name']}", end=': ')

                    # grid search
                    gs = GridSearchCV(
                        pipeline,
                        parameter_grid,
                        scoring='jaccard_samples',
                        n_jobs=-1 if base_classifier['supports_parallel'] else 1, # if supported from base classifier
                        pre_dispatch='2*n_jobs',
                        cv=cv,
                        verbose=verbose,
                        error_score=error
                    )

                    # train, measuring the elapsed time in seconds
                    t1 = time()
                    gs.fit(ds['X_train'], ds['y_train'])
                    td = time() - t1

                    # predict
                    predicted = gs.predict(ds['X_test'])

                    # model's results
                    model = {
                        'id': model_id,
                        'name': base_classifier['name'],
                        'model': gs,
                        'metrics': {
                            'accuracy': metrics.jaccard_score(ds['y_test'], predicted, average='samples'),
                            'subset_accuracy': metrics.accuracy_score(ds['y_test'], predicted),
                            'precision': metrics.precision_score(ds['y_test'], predicted, average='samples'),
                            'recall': metrics.recall_score(ds['y_test'], predicted, average='samples'),
                            'f1': metrics.f1_score(ds['y_test'], predicted, average='samples')
                        },
                        'warnings': [str(w) for w in ws],
                    }

                    # seconds below 100s, minutes below 10000s, hours otherwise
                    td_str = f"{td:.2f}s" if td < 100 else f"{td/60:.2f}m" if td < 10000 else f"{td/3600:.2f}h"
                    print(f"{model['metrics']['accuracy']:.2f}", f"[{td_str}]", f"({len(model['warnings'])})")

                    # save model to file
                    dump([model, gs], f"{model_id}.gz", compress=9)

                    # keep the results for the caller
                    models.append(model)

            print()

    return models

### 4.1 Training on 100mentors → Testing on 100mentors

In [None]:
# Label binarizer with one column per 100mentors tag, in alphabetical order;
# sparse_output=True requests a sparse label matrix.
mlb_hm = MultiLabelBinarizer(classes=np.array(sorted(HM_CLASSES)), sparse_output=True)

In [None]:
# Targets: binarized tag sets; features: the raw texts (they are vectorized
# later, inside the pipeline built by `execute`).
yhm = mlb_hm.fit_transform(hmf['tags'].values)
Xhm = hmf['text'].values

# sanity check: both must have the same number of rows
print(Xhm.shape, yhm.shape)

In [None]:
# Show the first sample: its tag names, its binarized label row and its text.
# Only the first row is densified (instead of the whole label matrix, twice)
# and the tag names come from the fitted `classes_` attribute.
first_labels = yhm[0].toarray().ravel()
print('Tags:', ', '.join(mlb_hm.classes_[np.flatnonzero(first_labels == 1)]))
print('y:', first_labels)
print('X:', Xhm[0])

In [None]:
# Hold out one fifth of the 100mentors samples for testing; the fixed
# random_state keeps the shuffled split reproducible.
Xhm_train, Xhm_test, yhm_train, yhm_test = train_test_split(Xhm, yhm, test_size=1/5, random_state=3, shuffle=True)

print('train dataset:', Xhm_train.shape, yhm_train.shape)
print('test dataset:', Xhm_test.shape, yhm_test.shape)

In [None]:
# Bundle the 100mentors split under the keys `execute` reads from `ds`
hm_ds = dict(
    X_train=Xhm_train,
    X_test=Xhm_test,
    y_train=yhm_train,
    y_test=yhm_test,
)

In [None]:
# Full grid search (pgs defaults to True) over every approach and base
# classifier on the 100mentors split. error='raise' is forwarded to
# GridSearchCV's error_score; the model files get the './clfs/100mentors/' prefix.
hm_models = execute(hm_ds, error='raise', path='./clfs/100mentors/')

### 4.2 Training on 100mentors & Wikipedia → Testing on 100mentors

In [None]:
# Label binarizer over the union of the 100mentors and Wikipedia tags, so
# that both datasets are encoded with the same label columns.
mlb = MultiLabelBinarizer(classes=np.array(sorted(CLASSES)), sparse_output=True)

In [None]:
# 100mentors targets encoded against all classes (`CLASSES`), plus the raw texts
yhm_acl = mlb.fit_transform(hmf['tags'].values)
Xhm_acl = hmf['text'].values

print(Xhm_acl.shape, yhm_acl.shape)

In [None]:
# Hold out one third of the 100mentors samples as the test set; the remaining
# 100mentors samples are later merged with the Wikipedia samples for training.
Xhm_acl_train, X_test, yhm_acl_train, y_test = train_test_split(Xhm_acl, yhm_acl, test_size=1/3, random_state=3, shuffle=True)

print('test dataset:', X_test.shape, y_test.shape)

In [None]:
from scipy.sparse import vstack

# Wikipedia samples, encoded with the same binarizer (same label columns)
ywk = mlb.fit_transform(wkf['tags'].values)
Xwk = wkf['text'].values

# Training set = 100mentors training part followed by all Wikipedia samples.
# The sparse label matrices are stacked directly, without the round trip
# through dense arrays.
y_train = vstack([yhm_acl_train, ywk], format='csr')
X_train = np.concatenate([Xhm_acl_train, Xwk], axis=0)

print('train dataset:', X_train.shape, y_train.shape)

In [None]:
# Bundle the combined training set and the 100mentors test set under the
# keys `execute` reads from `ds`
wk_ds = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# ids of the base classifiers to leave out of this run
ex = ['svm', 'mnb']

In [None]:
# No parameter search here (pgs=False): each combination is fitted with its
# fixed `parameters`, skipping the base classifiers listed in `ex`; the model
# files get the './clfs/wiki/' prefix.
wk_models = execute(wk_ds, ex=ex, pgs=False, path='./clfs/wiki/')