In [None]:
# One-time setup: uncomment and run once to fetch the NLTK stop-word corpus
# that `stopwords.words('english')` in the imports cell reads.
# import nltk
# nltk.download('stopwords')

In [1]:
%matplotlib inline

import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,precision_recall_fscore_support
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
import seaborn as sns
from imblearn.over_sampling import SMOTE

In [2]:
# Read the training and development splits; both files are tab-separated.
df_train, df_test = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in ('train.tsv', 'dev.tsv')
)

In [None]:
# NOTE(review): repeats the dev.tsv load done in the previous cell; this looks
# like a leftover and can probably be deleted.
df_test = pd.read_csv('dev.tsv',sep='\t')

In [3]:
# Preview the first rows of the training frame as loaded from train.tsv.
df_train.head()

Unnamed: 0.1,Unnamed: 0,ids,body,Uncertainity of Post_Diagnosis,Results and Side-Effects Observed,Medical Assistance,Diet and Maintenance,Information Source,Concepts
0,0,LIPITOR.449.txt,Extreme tiredness and flatulence. Not sure whe...,0,1,0,0,0,flatulence|exhaustion|tired|Not sure|Extreme|t...
1,1,LIPITOR.188.txt,1/7/05-continued. not all of it posted before....,0,1,0,1,0,package insert|depression|Lipitor|cholesterol|...
2,2,LIPITOR.541.txt,So sad to see so many with problems like mine!...,1,1,1,0,0,muscle pain|joint pain|depression|Lipitor|Lipi...
3,3,LIPITOR.810.txt,Within 1 month time developed severe depressio...,0,1,0,0,0,severe depression|headaches|Lipitor|statins|Li...
4,4,LIPITOR.393.txt,I have been on lipitor for 10 years for heart ...,0,1,0,0,0,leg weakness|changed|experience|Potassium|cram...


In [4]:
# Preview the first rows of the development frame as loaded from dev.tsv.
df_test.head()

Unnamed: 0.1,Unnamed: 0,ids,body,Uncertainity of Post_Diagnosis,Results and Side-Effects Observed,Medical Assistance,Diet and Maintenance,Information Source,Concepts
0,0,LIPITOR.595.txt,"Swelling left arm, very severe itching, intole...",0,1,0,0,0,Swelling|itching|left arm|hand|dreams|bruise|v...
1,1,ARTHROTEC.101.txt,"1st pill taken with food, a few hours after i ...",0,1,0,0,0,side effects|depression|cramping|upset|experie...
2,2,LIPITOR.367.txt,episode of intense dizziness lasting nearly an...,0,1,0,0,0,dizziness|lassitude|chills|shivers|problem|wor...
3,3,LIPITOR.74.txt,After taking Crestor and having muscle pain an...,0,1,0,0,1,liver problems|decreased|itchy|control|crawly|...
4,4,LIPITOR.389.txt,"75 yo mother-in-law has memory loss, hair loss...",0,1,0,1,0,lack of appetite|hair loss|stroke|sciatica|los...


In [5]:
# Discard the columns that are not used as model input or as labels.
df_train = df_train.drop(columns=['Unnamed: 0', 'Concepts', 'ids'])

In [6]:
# Discard the same unused columns from the development frame.
df_test = df_test.drop(columns=['Unnamed: 0', 'Concepts', 'ids'])

In [7]:
# Check the training frame after dropping the unused columns.
df_train.head()

Unnamed: 0,body,Uncertainity of Post_Diagnosis,Results and Side-Effects Observed,Medical Assistance,Diet and Maintenance,Information Source
0,Extreme tiredness and flatulence. Not sure whe...,0,1,0,0,0
1,1/7/05-continued. not all of it posted before....,0,1,0,1,0
2,So sad to see so many with problems like mine!...,1,1,1,0,0
3,Within 1 month time developed severe depressio...,0,1,0,0,0
4,I have been on lipitor for 10 years for heart ...,0,1,0,0,0


In [8]:
# Check the development frame after dropping the unused columns.
df_test.head()

Unnamed: 0,body,Uncertainity of Post_Diagnosis,Results and Side-Effects Observed,Medical Assistance,Diet and Maintenance,Information Source
0,"Swelling left arm, very severe itching, intole...",0,1,0,0,0
1,"1st pill taken with food, a few hours after i ...",0,1,0,0,0
2,episode of intense dizziness lasting nearly an...,0,1,0,0,0
3,After taking Crestor and having muscle pain an...,0,1,0,0,1
4,"75 yo mother-in-law has memory loss, hair loss...",0,1,0,1,0


In [9]:
def clean_text(text):
    """Normalise a free-text post before it is vectorised.

    Steps, in order: lower-case the text, expand a handful of English
    contractions, replace every non-word character with a space, collapse
    whitespace runs into a single space and trim the ends.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN are treated as an empty string;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        Lower-cased text whose tokens are separated by single spaces.
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash on ``.lower()``.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # Order matters: specific patterns must run before the generic ones that
    # would otherwise consume them ("what's" and "'scuse" before "'s",
    # "can't" before "n't").
    contractions = (
        (r"what's", "what is "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )

    text = text.lower()
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Raw strings avoid the invalid-escape warning that '\W' / '\s+' trigger.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")

In [10]:
# Label columns; each one is modelled as its own binary target.
categories = [
    'Uncertainity of Post_Diagnosis',
    'Results and Side-Effects Observed',
    'Medical Assistance',
    'Diet and Maintenance',
    'Information Source',
]

In [11]:
# Normalise the post text of both splits with clean_text.
df_train['body'] = df_train['body'].map(clean_text)
df_test['body'] = df_test['body'].map(clean_text)

In [12]:
# The model input for each split is its `body` text column.
X_train, X_test = df_train['body'], df_test['body']
print(X_train.shape, X_test.shape, sep='\n')

(942,)
(300,)


In [13]:
# SMOTE over-sampler with a fixed seed; used as the 'smt' step of the
# pipelines defined in the following cells.
smt = SMOTE(random_state=42)

In [14]:
# Pipeline: TF-IDF features -> SMOTE over-sampling -> Multinomial Naive Bayes.
# One independent binary model is fitted per label column in `categories`.
NB_pipeline = Pipeline([
                # scikit-learn validates `stop_words` as 'english', a list or
                # None, so the NLTK stop-word collection is passed as a list.
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('smt', smt),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
for category in categories:
    print('... Processing {}'.format(category))
    # Refit the whole pipeline on the training text for this label.
    NB_pipeline.fit(X_train, df_train[category])
    # Evaluate on the held-out split loaded from dev.tsv.
    prediction = NB_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(df_test[category], prediction)))
    # zero_division=0 reports 0.0 for a class that is never predicted (the
    # same value as the default) without emitting an UndefinedMetricWarning.
    print(classification_report(df_test[category], prediction, zero_division=0))

... Processing Uncertainity of Post_Diagnosis
Test accuracy is 0.54
              precision    recall  f1-score   support

           0       0.94      0.49      0.65       257
           1       0.21      0.81      0.34        43

    accuracy                           0.54       300
   macro avg       0.58      0.65      0.49       300
weighted avg       0.84      0.54      0.60       300

... Processing Results and Side-Effects Observed
Test accuracy is 0.99
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.99      1.00      0.99       297

    accuracy                           0.99       300
   macro avg       0.75      0.66      0.70       300
weighted avg       0.99      0.99      0.99       300

... Processing Medical Assistance
Test accuracy is 0.59
              precision    recall  f1-score   support

           0       0.89      0.56      0.69       243
           1       0.27      0.70      0.3

In [15]:
# Pipeline: TF-IDF features -> SMOTE over-sampling -> linear SVM.
# One independent binary model is fitted per label column in `categories`.
SVC_pipeline = Pipeline([
                # scikit-learn validates `stop_words` as 'english', a list or
                # None, so the NLTK stop-word collection is passed as a list.
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('smt', smt),
                # Fixed seed so repeated runs of the notebook give the same model.
                ('clf', OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=1)),
            ])
for category in categories:
    print('... Processing {}'.format(category))
    # Refit the whole pipeline on the training text for this label.
    SVC_pipeline.fit(X_train, df_train[category])
    # Evaluate on the held-out split loaded from dev.tsv.
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(df_test[category], prediction)))
    # zero_division=0 reports 0.0 for a class that is never predicted (the
    # same value as the default) without emitting an UndefinedMetricWarning.
    print(classification_report(df_test[category], prediction, zero_division=0))

... Processing Uncertainity of Post_Diagnosis
Test accuracy is 0.8166666666666667
              precision    recall  f1-score   support

           0       0.88      0.91      0.89       257
           1       0.32      0.26      0.29        43

    accuracy                           0.82       300
   macro avg       0.60      0.58      0.59       300
weighted avg       0.80      0.82      0.81       300

... Processing Results and Side-Effects Observed
Test accuracy is 0.99
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.99      1.00      0.99       297

    accuracy                           0.99       300
   macro avg       0.49      0.50      0.50       300
weighted avg       0.98      0.99      0.99       300

... Processing Medical Assistance
Test accuracy is 0.8333333333333334
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       243
           1  

  _warn_prf(average, modifier, msg_start, len(result))


Test accuracy is 0.9133333333333333
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       264
           1       0.78      0.39      0.52        36

    accuracy                           0.91       300
   macro avg       0.85      0.69      0.74       300
weighted avg       0.90      0.91      0.90       300

... Processing Information Source
Test accuracy is 0.9233333333333333
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       276
           1       0.60      0.12      0.21        24

    accuracy                           0.92       300
   macro avg       0.76      0.56      0.58       300
weighted avg       0.90      0.92      0.90       300



In [17]:
# Pipeline: TF-IDF features -> SMOTE over-sampling -> logistic regression.
# One independent binary model is fitted per label column in `categories`.
LogReg_pipeline = Pipeline([
                # scikit-learn validates `stop_words` as 'english', a list or
                # None, so the NLTK stop-word collection is passed as a list.
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('smt', smt),
                # The 'sag' solver shuffles the data, so it is seeded to keep
                # the reported metrics reproducible between runs.
                ('clf', OneVsRestClassifier(
                    LogisticRegression(solver='sag', random_state=42), n_jobs=1)),
            ])
for category in categories:
    print('... Processing {}'.format(category))
    # Refit the whole pipeline on the training text for this label.
    LogReg_pipeline.fit(X_train, df_train[category])
    # Evaluate on the held-out split loaded from dev.tsv.
    prediction = LogReg_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(df_test[category], prediction)))
    # zero_division=0 reports 0.0 for a class that is never predicted (the
    # same value as the default) without emitting an UndefinedMetricWarning.
    print(classification_report(df_test[category], prediction, zero_division=0))

... Processing Uncertainity of Post_Diagnosis
Test accuracy is 0.8033333333333333
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       257
           1       0.33      0.37      0.35        43

    accuracy                           0.80       300
   macro avg       0.61      0.62      0.62       300
weighted avg       0.81      0.80      0.81       300

... Processing Results and Side-Effects Observed
Test accuracy is 0.99
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.99      1.00      0.99       297

    accuracy                           0.99       300
   macro avg       0.49      0.50      0.50       300
weighted avg       0.98      0.99      0.99       300

... Processing Medical Assistance
Test accuracy is 0.8433333333333334
              precision    recall  f1-score   support

           0       0.88      0.93      0.91       243
           1  

  _warn_prf(average, modifier, msg_start, len(result))


Test accuracy is 0.8966666666666666
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       264
           1       0.62      0.36      0.46        36

    accuracy                           0.90       300
   macro avg       0.77      0.67      0.70       300
weighted avg       0.88      0.90      0.88       300

... Processing Information Source
Test accuracy is 0.92
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       276
           1       0.50      0.12      0.20        24

    accuracy                           0.92       300
   macro avg       0.71      0.56      0.58       300
weighted avg       0.89      0.92      0.90       300



In [18]:
from collections import Counter

# Class balance of every label column in the training split.
for category in categories:
    label_counts = Counter(df_train[category].to_list())
    print(category)
    for label, n_rows in label_counts.items():
        print(label, n_rows)

Uncertainity of Post_Diagnosis
0 775
1 167
Results and Side-Effects Observed
1 933
0 9
Medical Assistance
0 742
1 200
Diet and Maintenance
0 834
1 108
Information Source
0 878
1 64


In [None]:
from collections import Counter

# Class balance of every label column in the development split.
# The frame is named `df_test` in this notebook; the previous `test` was
# never defined and raised NameError.
for category in categories:
    count_d = Counter(df_test[category].to_list())
    print(category)
    for k, v in count_d.items():
        print(k, v)