# Exercise - Text Mining - Classification - SCIKIT-LEARN

We will predict the smoker status of a patient based on that patient's note (recorded by a doctor)

**The unit of analysis is a patient note**

In [1]:
import pandas as pd
import numpy as np

In [2]:
notes = pd.read_csv('smokers.csv')

In [3]:
notes.head(5)

Unnamed: 0,ID,TEXT,STATUS
0,641,977146916\nHLGMC\n2878891\n022690\n01/27/1997 ...,CURRENT SMOKER
1,643,026738007\nCMC\n15319689\n3/25/1998 12:00:00 A...,CURRENT SMOKER
2,681,071962960\nBH\n4236518\n417454\n12/10/2001 12:...,CURRENT SMOKER
3,704,418520250\nNVH\n61562872\n3/11/1995 12:00:00 A...,CURRENT SMOKER
4,757,301443520\nCTMC\n49020928\n448922\n1/11/1990 1...,CURRENT SMOKER


In [4]:
notes.shape

(398, 3)

## Assign the "target" variable



In [5]:
target = notes['STATUS']

## Assign the "text" (input) variable

In [6]:
input_data = notes['TEXT']

## Split the data

In [7]:
from sklearn.model_selection import train_test_split

train_set, test_set, train_y, test_y = train_test_split(input_data, target, test_size=0.3, random_state=42)

In [8]:
train_set.shape, train_y.shape

((278,), (278,))

In [9]:
test_set.shape, test_y.shape

((120,), (120,))

In [10]:
# Baselines
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_set, train_y)

In [11]:
from sklearn.metrics import accuracy_score
dummy_train_pred = dummy_clf.predict(train_set)

baseline_train_acc = accuracy_score(train_y, dummy_train_pred)

print('Baseline Train Accuracy: {}' .format(baseline_train_acc))

Baseline Train Accuracy: 0.6510791366906474


In [12]:
#Baseline Test Accuracy
dummy_test_pred = dummy_clf.predict(test_set)

baseline_test_acc = accuracy_score(test_y, dummy_test_pred)

print('Baseline Test Accuracy: {}' .format(baseline_test_acc))

Baseline Test Accuracy: 0.5916666666666667


## Sklearn: Text preparation


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(stop_words='english',decode_error='replace')

train_x_tr = tfidf_vect.fit_transform(train_set)

In [14]:
test_x_tr = tfidf_vect.transform(test_set)

In [15]:
train_x_tr.shape, test_x_tr.shape

((278, 12240), (120, 12240))

In [16]:
train_x_tr.toarray()

array([[0.0095233 , 0.01267037, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02616653, 0.03481352, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01448694, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.03956983, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.06738015, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02353552, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Latent Semantic Analysis (Singular Value Decomposition)

In [17]:
from sklearn.decomposition import TruncatedSVD

### Don't forget to create SVDs for both train and test

In [18]:
svd = TruncatedSVD(n_components=600, n_iter=20)

In [19]:
train_x_lsa = svd.fit_transform(train_x_tr)

In [20]:
train_x_lsa.shape

(278, 278)

In [21]:
train_x_lsa

array([[ 2.96692177e-01, -6.95753887e-02, -2.29544427e-02, ...,
         2.39689773e-04,  1.88918253e-05, -1.14582509e-03],
       [ 2.52213194e-01,  2.21258004e-02,  2.71268979e-02, ...,
         3.15188008e-03,  4.46321684e-03,  1.03471317e-02],
       [ 4.23545576e-01, -5.25166916e-02, -2.27631660e-02, ...,
        -5.11187395e-03,  7.82821053e-03,  2.65989855e-03],
       ...,
       [ 3.11761211e-01, -3.46701020e-03,  6.48129711e-02, ...,
        -6.86543535e-04, -8.08771274e-04, -2.32709631e-03],
       [ 2.61020664e-01,  1.71355498e-01,  3.40140476e-01, ...,
         1.75019586e-02, -9.42464230e-03,  9.01793438e-03],
       [ 3.51938483e-01, -8.24162551e-02, -4.96559201e-02, ...,
         4.17369461e-05, -2.92334947e-03,  1.19323823e-03]])

In [22]:
test_x_lsa = svd.transform(test_x_tr)

In [23]:
test_x_lsa.shape

(120, 278)

In [24]:
test_x_lsa

array([[ 3.87016279e-01, -6.20397254e-02, -1.76673755e-02, ...,
         1.45196467e-02,  3.06310454e-04,  9.37818066e-03],
       [ 2.55851253e-01,  8.74846418e-02,  1.23435084e-01, ...,
         6.85143146e-03, -1.10123384e-02,  2.41938939e-02],
       [ 3.89625648e-01, -1.15775421e-01, -2.11274010e-02, ...,
        -1.18766661e-03, -4.54791328e-03, -5.21173499e-03],
       ...,
       [ 3.60860950e-01, -3.15050987e-03, -1.35648159e-02, ...,
        -2.68506000e-03, -2.56039940e-02,  2.35559982e-04],
       [ 3.96770522e-01, -3.48628650e-02,  2.02309328e-02, ...,
        -7.64022387e-03,  5.82048091e-03, -5.88265346e-03],
       [ 3.01302438e-01,  5.39115757e-03,  9.50245299e-04, ...,
        -8.23415263e-03, -3.46906903e-03, -7.75975054e-03]])

## Check for the cumulative variance explained

**Increase the number of components if it the cumulative variance is low.**

In [25]:
svd.explained_variance_ratio_.sum()

1.0000000000000002

# Try one of the classifiers we have covered so far

In [26]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=1000,early_stopping=True,penalty='elasticnet',l1_ratio=.25)

In [27]:
sgd_clf.fit(train_x_lsa, train_y)

## Accuracy

In [28]:
#Train accuracy

train_y_pred = sgd_clf.predict(train_x_lsa)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.9640287769784173


In [29]:
#Test accuracy

test_y_pred = sgd_clf.predict(test_x_lsa)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))


Test acc: 0.5583333333333333


## Generate the confusion matrix

In [30]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_y, test_y_pred)

array([[ 0,  5,  0,  0,  5],
       [ 0,  7,  1,  0, 15],
       [ 0,  5,  1,  0,  9],
       [ 0,  0,  0,  0,  1],
       [ 1, 10,  1,  0, 59]], dtype=int64)

In [31]:
# This is not pretty, I can't seem to get a better accuracy and this is the smaller overfitting - Worse than the baseline