### Import Libraries


In [None]:
import plotly.express as px
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

## Data Preprocessing

In [7]:
# train data
df_train = pd.read_csv("./data/drugsComTest_raw.csv")

# test data
df_test = pd.read_csv("./data/drugsComTest_raw.csv")

In [8]:
# Removing nan values
df_train = df_train.dropna()
df_test = df_test.dropna()

In [9]:
df_train.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4


#### Show Class Distibution 

In [None]:
def plot_bar_chart(df):
    # analyze the condition labels
    counts_series = df.condition.value_counts()
    counts_df = pd.DataFrame(counts_series)
    counts_df.reset_index(level=0, inplace=True)

    number_of_classes(df)

    fig = px.bar(counts_df, x="index", y="condition", orientation='v',  # "index" Debugging-Aufgabe!
              height=400,
              title='xc')
    fig.show()

def number_of_classes(df):
    print("Number of classes: ", len(df["condition"].unique()))

plot_bar_chart(df_train)

Number of classes:  708


In [11]:
# Keeping classes which have more than 20 values in them
index_counts = df_train["condition"].value_counts()[df_train.condition.value_counts() >= 20].index
df_train = df_train[df_train["condition"].isin(index_counts)]
number_of_classes(df_train)

Number of classes:  214


#### Balancing Classes
Undersampling all classes with samples greater than 200 to 200

In [12]:
# undersampling all classes with samples greater than 200 to 200
condition_over200 = df_train["condition"].value_counts()[df_train.condition.value_counts() >= 200].index

for condition in condition_over200:
    # randomly shuffle the samples
    condition_samples = df_train[df_train["condition"]==condition]
    condition_samples = condition_samples.sample(frac=1).reset_index(drop=True)

    # extract only 200
    condition_samples = condition_samples[:200]

    df_train = df_train[df_train["condition"]!=condition]
    # put it back
    df_train = pd.concat([df_train, condition_samples], ignore_index=True)

### Filtering Labels and removing alpha numeric values.


In [13]:
def filter_labels(labels):
    labels = labels.tolist()
    labels_truth = []
    for label in labels:
        if label[0].isdigit():
            labels_truth.append(False)
        else:
            labels_truth.append(True)
    return labels_truth

In [14]:
df_train = df_train[filter_labels(df_train["condition"])]
print("Train ", number_of_classes(df_train))

Number of classes:  210
Train  None


In [15]:
df_test = df_test[filter_labels(df_test["condition"])]
print("Test ", number_of_classes(df_test))

Number of classes:  664
Test  None


### Only keeping the classes that are in the training set, in to the test set

In [16]:
df_test = df_test[df_test["condition"].isin(df_train["condition"])]
number_of_classes(df_test)

Number of classes:  210


### Revised Class Distribution

In [17]:
plot_bar_chart(df_train)

Number of classes:  210


## Reviews Preprocessing

### Removing Stepwords and stemming

In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\snied\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Lower Casing Reviews

In [19]:
# combining drug name with review
# df_train["combined"] = df_train["drugName"].str.lower() + ": " + df_train["review"].str.lower()
# df_test["combined"] = df_test["drugName"].str.lower() + ": " + df_test["review"].str.lower()

df_train["review"] = df_train["review"].str.lower()
df_test["review"] = df_test["review"].str.lower()

In [20]:
import string

def filter_data(reviews):
  
  """
  Filter the corpus of training and testing df.
  This function removes stop and stem words from the corpus
  :param reviews:
  :return:
  """
  stop = stopwords.words('english')
  stemmer = SnowballStemmer("english")

  # remove punctuations
  series = reviews.str.replace('[{}]'.format(string.punctuation), '')

  # remove stop words
  series = series.apply(
      lambda x: ' '.join([word for word in x.split() if word not in stop]))

  # remove stem words
  series = series.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

  return series
    
df_train["review"] = filter_data(df_train["review"]).str.lower()
df_test["review"] = filter_data(df_test["review"]).str.lower()

### Lowercasing Classes

In [21]:
df_train["Label"] = df_train["condition"].str.lower()
df_test["Label"] = df_test["condition"].str.lower()

In [22]:
df_test["Label"]

0                          depression
1        crohn's disease, maintenance
2             urinary tract infection
3                         weight loss
4                       birth control
                     ...             
53760                   birth control
53762                         anxiety
53763                   birth control
53764                            pain
53765                        sciatica
Name: Label, Length: 51122, dtype: object

## Creating Model for Reviews Only

In [23]:
# shuffle the training dataframe and saving the columns in X and Y
df_train = df_train.sample(frac=1)
X = df_train['review']
Y = df_train['Label']

### Creating Count Vectorizer Object and transforming whole training dataset from it

In [24]:
count_vectorizer = CountVectorizer(ngram_range=(1,2))
X_count_vec = count_vectorizer.fit_transform(X)

### Defining a function for K-Fold Cross Validation for multiple classifiers

In [25]:
def cross_val_multiple_classifiers(X, Y):
  classifiers = [MultinomialNB(), SGDClassifier(loss="modified_huber"), 
                 RandomForestClassifier(n_estimators=100), 
                 KNeighborsClassifier(n_neighbors=5)]
  labels = ['Multinomial Naive Bayes', 'SGD Classifier', 'Random Forest',  'KNN']
  clf_cv_mean = []
  clf_cv_std = []
  for clf, label in zip(classifiers, labels): 
      scores = cross_val_score(clf, X, Y, cv=4, scoring='accuracy')
      print ("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))

#### Calling K Fold validation of multiple classifiers on Count Vectorized Dataset

In [26]:
# calling multiple classifiers on the vectorized features
cross_val_multiple_classifiers(X_count_vec, Y)

Accuracy: 0.40 (+/- 0.00) [Multinomial Naive Bayes]
Accuracy: 0.46 (+/- 0.01) [SGD Classifier]
Accuracy: 0.53 (+/- 0.00) [Random Forest]
Accuracy: 0.05 (+/- 0.01) [KNN]


## Training the best model for getting results on the Test Set

In [27]:
# training the Random Forest Classifier on complete training data
fin_clf = RandomForestClassifier(n_estimators=100)
fin_clf.fit(X_count_vec, Y)

# transforming test_data with count vectorizer
X_test_vec = count_vectorizer.transform(df_test['review'])

# getting preds on the test data
preds = fin_clf.predict(X_test_vec)

In [28]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

print("Accuracy on test data: ", accuracy_score(df_test["Label"].str.lower(), 
                                                preds))
print("Macro F-1 Score on test data: ", f1_score(df_test["Label"].str.lower(), 
                                                preds, average="macro"))
print("Micro F-1 Score on test data: ", f1_score(df_test["Label"].str.lower(), 
                                                preds, average="micro"))

Accuracy on test data:  0.7039630687375298
Macro F-1 Score on test data:  0.8592374155000988
Micro F-1 Score on test data:  0.7039630687375298


## Production example

In [29]:
# example input
input = "I was lying all day long and had to drink several times a day"

# transforming input with count vectorizer
X_input_vec = count_vectorizer.transform([input,])

# getting preds on the test data
preds = fin_clf.predict(X_input_vec)

print("Diagnose: ", preds[0])

Diagnose:  alcohol dependence
