# Table of Contents: <a class="anchor" id="toc"></a>
* [Training Pipeline](#pipeline)
  * [Data extraction](#data-extraction)
  * [Data formatting](#data-formatting)
  * [Modeling and Validation](#modelling)
  * [Model exportation](#model-exportation)

# Training Pipeline <a class="anchor" id="pipeline"></a> [↑](#toc)

In [10]:
import os
import pandas as pd
import numpy as np
import json
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

### Data extraction <a class="anchor" id="data-extraction"></a> [↑](#toc)

In [3]:
# The location of the training CSV is read from the DATASET_PATH environment
# variable; indexing os.environ raises KeyError when the variable is not set.
data_path = os.environ['DATASET_PATH']
# Load the full dataset into a DataFrame; later cells select columns from `df`.
df = pd.read_csv(data_path)

### Data formatting <a class="anchor" id="data-formatting"></a> [↑](#toc)

In [4]:
class ConcatenateTextColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that merges several text columns into one.

    Each row of the input frame becomes a single string made of the row's
    column values separated by one space. Missing values are treated as
    empty strings.
    """

    def __init__(self):
        """Create the transformer; it has no hyper-parameters."""

    def fit(self, X, y=None):
        """Do nothing (there is no state to learn) and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return one space-joined string per row of ``X``.

        ``X`` must offer ``fillna`` and a row-wise ``agg`` (as a pandas
        DataFrame does); ``y`` is accepted for API compatibility and ignored.
        """
        # Replace missing cells first so that str.join only sees strings.
        without_missing = X.fillna('')
        return without_missing.agg(' '.join, axis=1)

In [5]:
# Text preprocessing chain: merge the text columns into one document per row,
# then turn each document into token counts restricted to 200 features.
pipe = Pipeline([
    ("ConcatenateTextColumns", ConcatenateTextColumns()),
    ("CountVectorizer", CountVectorizer(max_features=200)),
])

In [6]:
# Columns of `df` that are fed to the text preprocessing pipeline.
text_columns = ['query', 'concatenated_tags']
# Fit the preprocessing pipeline on every row and build the feature matrix.
# NOTE(review): the pipeline is fitted on the full dataset here, i.e. before
# the cross-validation split done in model_validation, so rows that later act
# as validation data take part in fitting the vectorizer - confirm this is
# acceptable.
X = pipe.fit_transform(df[text_columns])
# Target labels are taken from the 'category' column.
y = df['category']

### Modeling and Validation <a class="anchor" id="modelling"></a> [↑](#toc)

In [7]:
def model_validation(X, y, model, n_splits=5, n_repeats=5):
    """Cross-validate ``model`` and summarise macro-averaged metrics.

    The data is split with ``RepeatedStratifiedKFold`` (fixed
    ``random_state=20``). For every split the model is fitted on the training
    part and scored on both the training and the validation part.

    Parameters
    ----------
    X : feature matrix that supports integer-array row selection (``X[idx]``).
    y : array-like of class labels, one per row of ``X``.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    n_splits : number of folds per repetition.
    n_repeats : number of times the k-fold split is repeated.

    Returns
    -------
    report : dict
        Maps ``'train_precision'``, ``'train_recall'``, ``'train_f1_score'``,
        ``'val_precision'``, ``'val_recall'`` and ``'val_f1_score'`` to a
        ``{'mean': ..., 'std': ...}`` dict computed over all splits.
    model :
        The estimator that was passed in. It is left fitted on the training
        part of the LAST split only; it is not refitted on the whole dataset.
    """
    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=20
    )

    # The splitter yields positional indices. Indexing a pandas Series with
    # them would be a label lookup (wrong or failing for a non-default index),
    # so work on a plain array instead.
    y = np.asarray(y)

    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    # One list of per-split scores for each "<subset>_<metric>" report key.
    scores = {
        subset + '_' + name: []
        for subset in ('train', 'val')
        for name, _ in scorers
    }

    for train_idx, val_idx in splitter.split(X, y):
        model.fit(X[train_idx], y[train_idx])

        for subset, idx in (('train', train_idx), ('val', val_idx)):
            # Predicted class labels (not probabilities).
            predictions = model.predict(X[idx])
            for name, scorer in scorers:
                # scikit-learn metrics expect (y_true, y_pred); passing them
                # the other way round swaps precision and recall.
                scores[subset + '_' + name].append(
                    scorer(y[idx], predictions, average='macro')
                )

    report = {
        key: {'mean': np.mean(values), 'std': np.std(values)}
        for key, values in scores.items()
    }

    return report, model

In [8]:
# Validate a default-configured random forest; n_repeats=2 overrides the
# function default of 5 repetitions, n_splits keeps its default.
# NOTE(review): RandomForestClassifier is created without a random_state, so
# the scores are presumably not reproducible between runs - confirm whether
# that is intended.
report, model = model_validation(X, y, RandomForestClassifier(), n_repeats=2)

In [14]:
# Persist the validation report as JSON at the location named by the
# METRICS_PATH environment variable (KeyError if the variable is unset).
metrics_path = os.environ['METRICS_PATH']
with open(metrics_path, 'w') as file:
    json.dump(report, file)

### Model exportation <a class="anchor" id="model-exportation"></a> [↑](#toc)

In [16]:
# The destination of the serialized estimator is read from the MODEL_PATH
# environment variable; a missing variable raises KeyError.
model_path = os.environ['MODEL_PATH']
# Write the estimator returned by model_validation to disk. Only `model` is
# dumped here; the fitted preprocessing `pipe` is not saved by this cell.
joblib.dump(model, model_path)

['model.pkl']