In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm

pd.options.display.max_colwidth = 200

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Preview the available input files: at most five paths per directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    preview = [os.path.join(dirname, name) for name in filenames[:5]]
    for path in preview:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Text classification using Machine Learning

In this notebook we learn how to classify texts using machine learning.

### Table of Contents

* Classification using few ML techniques
    * Logistic Regression
    * Naive Bayes
    * Random Forest
* Cross Validation, Model evaluation
* Model interpretation using ELI5
* Hyperparameter tuning
* Ensemble

In [None]:
# Load the medical-notes dataset; later cells read its `clean_text` and `label` columns.
df = pd.read_csv('/kaggle/input/nlp-specialization-data/Cleaned_POS_Medical_Notes.csv') # for an Excel file, use pd.read_excel instead
df

Let us first check the distribution of the outputs.

In [None]:
df['label'].value_counts(normalize=True)

As discussed in the previous session, we need numeric values to use in the models. We use Tfidf representation of texts.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer that turns each document into a sparse numeric feature vector.
tfidf_vector = TfidfVectorizer(lowercase=True, # convert all tokens to lower case
                         stop_words='english', # remove English stopwords from the vocabulary; use None to keep them
                         analyzer='word', # tokens are words; 'char' would use character tokens instead
                         max_features=50000, # upper bound on the vocabulary size, to avoid too many features
                         min_df = 5, # ignore terms that appear in fewer than 5 documents
                         max_df = .6 # ignore terms that appear in more than 60% of the documents
                        )

# Learn the vocabulary and IDF weights from the cleaned texts and vectorize them in one step.
tfidf_vectorized_corpus = tfidf_vector.fit_transform(df.clean_text)

In [None]:
tfidf_vectorized_corpus

In [None]:
print (tfidf_vectorized_corpus.shape)

We have a total of 818 texts (data points) and 3842 features (words) for the model. We will use simple logistic regression, Naive Bayes and a random forest classifier for our modelling. 

#### Logistic Regression

Logistic regression assumes a linear relationship between the features and the log-odds $\log{\frac{p}{1-p}}$ of $Y=1$, where $p = P(Y=1)$.

#### Naive Bayes

In machine learning, naïve Bayes classifiers are a family of simple "probabilistic classifiers" based on applying Bayes' theorem with strong (naïve) independence assumptions between the features.

<img src=https://uc-r.github.io/public/images/analytics/naive_bayes/naive_bayes_icon.png>

#### Random Forest

Random forests are a collection of simple decision trees. A decision tree is a modelling technique that uses a logical accumulation of decision rules to predict the target from a set of features.

<img src=https://upload.wikimedia.org/wikipedia/commons/f/f3/CART_tree_titanic_survivors.png>

Random forests are an ensemble method: they average the predictions of many such decision trees, each of which learns different decision rules. Individual decision trees are prone to overfitting. To reduce the variance, we average over the decision trees, which leads to a more robust model.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

For evaluation, we use cross validation, i.e. we train on one part of the data and test on the remaining part. We will use 5-fold cross validation. 

<img src=https://miro.medium.com/max/1710/1*rgba1BIOUys7wQcXcL4U5A.png width="500">

In [None]:
# Baseline: logistic regression evaluated with 5-fold cross validation.
# No `scoring` argument is passed, so the estimator's default metric is reported.
lg = LogisticRegression(solver='lbfgs', multi_class='auto')
cv_scores = cross_val_score(lg, tfidf_vectorized_corpus, df.label, cv=5)
print (cv_scores, cv_scores.mean(), cv_scores.std())

In [None]:
# Same logistic regression, now with explicitly shuffled and seeded stratified folds.
cv_scores = cross_val_score(
    lg,
    tfidf_vectorized_corpus,
    df.label,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
)
print (cv_scores, cv_scores.mean(), cv_scores.std())

In [None]:
# Multinomial Naive Bayes evaluated with 5-fold cross validation (default metric).
nb = MultinomialNB()
cv_scores = cross_val_score(nb, tfidf_vectorized_corpus, df.label, cv=5)
print (cv_scores, cv_scores.mean(), cv_scores.std())

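NB and logistic regression both achieve ~75% accuracy under cross validation. As our dataset is not balanced, stratified sampling is better than plain random k-fold. Note that `cross_val_score` with an integer `cv` already uses an unshuffled `StratifiedKFold` for classifiers; passing an explicit `StratifiedKFold(shuffle=True, random_state=42)` additionally shuffles the data with a fixed seed. Similarly, the F1 score is a better evaluation metric than accuracy for imbalanced data.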

In [None]:
# Naive Bayes again, now with explicitly shuffled and seeded stratified folds.
cv_scores = cross_val_score(
    nb,
    tfidf_vectorized_corpus,
    df.label,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
)
print (cv_scores, cv_scores.mean(), cv_scores.std())

Naive Bayes performs the same with the shuffled stratified k-fold, which shows the robustness of the model. Logistic regression performs very similarly to Naive Bayes.

Now let us use a simple RF classifier and see how it performs on 5-fold cross validation.

In [None]:
# Random forest baseline; n_estimators sets the number of decision trees in the forest.
model = RandomForestClassifier(random_state=42, n_estimators=101)
cv_scores = cross_val_score(model, tfidf_vectorized_corpus, df.label, cv=5)
print (cv_scores, cv_scores.mean(), cv_scores.std())

In [None]:
# Random forest again, now with explicitly shuffled and seeded stratified folds.
cv_scores = cross_val_score(
    model,
    tfidf_vectorized_corpus,
    df.label,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
)
print (cv_scores, cv_scores.mean(), cv_scores.std())

RF achieves about 69% on cross validation (this is accuracy, because no `scoring` argument was passed to `cross_val_score`), much worse than Naive Bayes and Logistic Regression. Now let us use Logistic regression on a particular fold and interpret the results.

In [None]:
# Keep only the first (train, validation) index pair of the shuffled stratified 5-fold split.
train_idx, val_idx = next(
    StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(
        tfidf_vectorized_corpus, df.label.values
    )
)

In [None]:
# Slice the TF-IDF matrix and the label array with the fold's index arrays.
all_labels = df.label.values
trainX, trainy = tfidf_vectorized_corpus[train_idx], all_labels[train_idx]
valX, valy = tfidf_vectorized_corpus[val_idx], all_labels[val_idx]

print (trainX.shape, valX.shape)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [None]:
# Fit logistic regression on the training fold only.
lg = LogisticRegression(solver='lbfgs', multi_class='auto').fit(trainX, trainy)

# NOTE: despite its name, `val_train` holds the predictions for the *training* fold;
# `val_pred` holds the predictions for the validation fold.
val_train, val_pred = lg.predict(trainX), lg.predict(valX)

In [None]:
# Metrics on the training fold (`val_train` holds the training-fold predictions).
train_accuracy = accuracy_score(trainy, val_train)
train_macro_f1 = f1_score(trainy, val_train, average='macro')
print ("Accuracy score: {}".format(train_accuracy))
print ("F1 score: {}".format(train_macro_f1))

In [None]:
# Metrics on the held-out validation fold.
val_accuracy = accuracy_score(valy, val_pred)
val_macro_f1 = f1_score(valy, val_pred, average='macro')
print ("Accuracy score: {}".format(val_accuracy))
print ("F1 score: {}".format(val_macro_f1))

In [None]:
def plot_cm(y_true, y_pred, labels, title):
    """Draw an annotated confusion-matrix heatmap.

    Every cell shows its row-wise percentage and raw count; diagonal cells
    additionally show the row total (``count/total``). Cells with a zero count
    off the diagonal are left blank.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    labels : array-like
        Class labels; their sorted unique values define the row and column
        order of the matrix and of the plotted axes.
    title : str
        Title placed on the figure.

    Notes
    -----
    Relies on ``plt`` (matplotlib.pyplot), ``sns`` (seaborn) and
    ``confusion_matrix`` being available in the notebook namespace when called.
    """
    figsize = (14, 10)
    # Use one class list for the matrix AND the axis labels. The original took the
    # axis labels from np.unique(y_true), which fails with a shape mismatch when a
    # class in `labels` does not occur in y_true.
    classes = np.unique(labels)
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    # A class absent from y_true has a row total of 0: report 0% rather than dividing by zero.
    cm_perc = np.divide(cm, cm_sum.astype(float),
                        out=np.zeros(cm.shape, dtype=float),
                        where=cm_sum != 0) * 100
    nrows, ncols = cm.shape
    # Build the annotations as plain Python strings first, so the resulting array
    # is sized to fit them instead of being derived from uninitialised memory.
    annot_rows = []
    for i in range(nrows):
        row = []
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # cm_sum[i, 0] is a scalar; formatting the 1-element row array with %d
                # relies on an implicit conversion that recent NumPy deprecates.
                row.append('%.1f%%\n%d/%d' % (p, c, cm_sum[i, 0]))
            elif c == 0:
                row.append('')
            else:
                row.append('%.1f%%\n%d' % (p, c))
        annot_rows.append(row)
    annot = np.array(annot_rows)
    cm = pd.DataFrame(cm, index=classes, columns=classes)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title(title)
    sns.heatmap(cm, cmap='viridis', annot=annot, fmt='', ax=ax)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Plot the validation-fold confusion matrix, with the macro F1 score in the title.
labels = lg.classes_
val_macro_f1 = f1_score(valy, val_pred, average='macro')
plot_cm(valy, val_pred, labels, 'Confusion matrix: F1 {}'.format(val_macro_f1))

From the above confusion matrix we can clearly see where our model performed well and where it requires improvement.

## Model interpretation

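For interpretability, we need to understand how our model has learned the task. We will use ELI5. For linear models such as our logistic regression, ELI5 explains the model by reading its learned feature weights directly; for black-box models it also provides an implementation of LIME (Local Interpretable Model-agnostic Explanations). Another popular technique to explain ML models is SHAP (SHapley Additive exPlanations).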

In [None]:
import eli5

First let us see the top words corresponding to each speciality and whether they make any sense

In [None]:
eli5.show_weights(lg, vec=tfidf_vector, top=25)

In the above visualization we see that different words have different importance for different specialities. Now let us explain a few validation predictions. ELI5 highlights words that contribute positively to the prediction in green and words that contribute negatively in red.

In [None]:
df.iloc[val_idx[:3]]['label']

In [None]:
df.clean_text.values[val_idx[0]]

In [None]:
eli5.show_prediction(lg, doc=df.clean_text.values[val_idx[0]], vec=tfidf_vector, top=10)

In [None]:
eli5.show_prediction(lg, doc=df.clean_text.values[val_idx[1]], vec=tfidf_vector, top=10)

In [None]:
eli5.show_prediction(lg, doc=df.clean_text.values[val_idx[2]], vec=tfidf_vector, top=10)

This clearly shows the inner workings of our model. As we observed that our model performed poorly, particularly for "neurology" and "radiology", we need to make our model more robust on those classes. There are many different ways to increase a model's performance. Here we briefly discuss hyperparameter tuning and ensemble methods and how they can lead to better results.

### Hyperparameter tuning

Every model has a set of hyper parameters. By tuning different hyperparameters, we can increase model's performance. In this notebook, we tune different hyper parameters of random forest classifier.

* max_depth - maximum depth of each tree
* n_estimators - number of trees

Grid search or random search is commonly used to tune hyperparameters while checking the OOF (out-of-fold) score. In this notebook, we use hyperopt, a library that uses Bayesian optimization to search for better hyperparameters.

In [None]:
from hyperopt import hp
from hyperopt import fmin, tpe, space_eval, Trials

In [None]:
def rf_cv(params, random_state=42, cv=5, X=tfidf_vectorized_corpus, y=df.label.values):
    """Objective for the hyperparameter search: negated mean macro-F1 of a random forest.

    `params` is a mapping with 'n_estimators' and 'max_depth'; both values are
    cast to int before use. The forest is scored with `cross_val_score` using
    `cv` and the "f1_macro" metric, and the mean score is returned with its
    sign flipped so that a minimizer maximizes F1.
    """
    forest = RandomForestClassifier(
        random_state=random_state,
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
    )
    fold_scores = cross_val_score(forest, X, y, cv=cv, scoring="f1_macro", n_jobs=-1)
    return -fold_scores.mean()


# Search space: hp.quniform draws values on a grid with the given step
# (n_estimators in 100..1000 by 50, max_depth in 2..20 by 1); rf_cv casts them to int.
space = {'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
       'max_depth' : hp.quniform('max_depth', 2, 20, 1)
      }

# trials records the parameters and loss of every evaluation
trials = Trials()

# NOTE(review): newer hyperopt releases appear to expect a numpy Generator for `rstate`
# (np.random.default_rng(42)) rather than a RandomState — confirm against the installed version.
best = fmin(fn=rf_cv, # function to minimize (rf_cv returns the negated macro F1)
          space=space, 
          algo=tpe.suggest, # optimization algorithm; hyperopt selects its parameters automatically
          max_evals=10, # maximum number of evaluations
          trials=trials, # logging
          rstate=np.random.RandomState(42) # fixed random state for reproducibility
         )

In [None]:
# Re-evaluate the best parameters: rf_cv casts them to int and re-runs the cross validation;
# the sign is flipped back because rf_cv returns the negated F1.
print("Best F1 {:.3f} params {}".format(-rf_cv(best), best))

### Ensemble

Ensembling is a very useful technique to increase a model's performance and reduce overfitting. By combining multiple models, we can make our predictions more robust, reduce overfitting and increase overall performance. We use max (majority) voting over multiple classifiers.

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Hard (majority) voting over three different model families.
voting_classifier = VotingClassifier(estimators=[('rf',RandomForestClassifier(n_estimators=500,max_depth=19,random_state=42)),
                                     ('nb', MultinomialNB()),
                                     ('lg', LogisticRegression(multi_class='auto',solver='lbfgs'))], voting='hard')

# shuffle=True is required for random_state to have any effect: without it, recent
# scikit-learn versions raise a ValueError (older ones only warned). It also makes
# these folds match the shuffled, seeded splitter used for the other models above.
cv_scores = cross_val_score(X=tfidf_vectorized_corpus,y=df.label,cv=StratifiedKFold(5,random_state=42,shuffle=True),estimator=voting_classifier,scoring='f1_macro')
print (cv_scores, np.mean(cv_scores))

### Practical Tips

* Always use cross validation to check model performance. Understand the data to choose the right splitter: KFold, StratifiedKFold or GroupKFold
* Start with simple model and gradually use more complex model
* For production, use scikit-learn's pipeline for E2E feature learning and modelling.
* Use joblib to persist model after training so that it can be called directly during inference
* Fix random seeds so that results are reproducible from run to run

### References

1. https://towardsdatascience.com/interpreting-your-deep-learning-model-by-shap-e69be2b47893

2. https://towardsdatascience.com/understanding-model-predictions-with-lime-a582fdff3a3b