In [14]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
df = pd.read_csv("drive/MyDrive/overview-of-recordings.csv")

In [None]:
df_text = df[['phrase', 'prompt']]
df_text

Unnamed: 0,phrase,prompt
0,When I remember her I feel down,Emotional pain
1,When I carry heavy things I feel like breaking...,Hair falling out
2,there is too much pain when i move my arm,Heart hurts
3,My son had his lip pierced and it is swollen a...,Infected wound
4,My muscles in my lower back are aching,Infected wound
...,...,...
6656,I feel a burning sensation in my guts about 2 ...,Stomach ache
6657,I have a split on my thumb that will not heal.,Open wound
6658,I feel a lot of pain in the joints.,Joint pain
6659,The area around my heart doesn't feel good.,Heart hurts


In [None]:
df_text.rename(columns={'phrase':'text','prompt':'intent'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
from sklearn.model_selection import train_test_split
traindf,testdf = train_test_split(df_text, test_size=0.2)
validdf,testdf = train_test_split(traindf, test_size=0.5)

print(traindf.shape,validdf.shape, testdf.shape)

(5328, 2) (2664, 2) (2664, 2)


In [None]:
traindf.rename(columns={'phrase':'text','prompt':'intent'},inplace=True)

validdf.rename(columns={'phrase':'text','prompt':'intent'},inplace=True)

testdf.rename(columns={'phrase':'text','prompt':'intent'},inplace=True)

In [17]:
def print_plot(index):
    example = traindf[traindf.index == index][['text', 'intent']].values[0]
    if len(example) > 0:
        print(example[0])
        print('intent:', example[1])
print_plot(10)

nltk.download('stopwords')

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

I have a sharp pain in my lower stomach.
intent: Stomach ache


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
traindf['text'] = traindf['text'].apply(clean_text)
testdf['text'] = testdf['text'].apply(clean_text)
print_plot(10)

sharp pain lower stomach
intent: Stomach ache


In [19]:
traindf['text'].apply(lambda x: len(x.split(' '))).sum()

X_train = traindf['text']
Y_train = traindf['intent']
X_test = testdf['text']
Y_test = testdf['intent']

In [20]:
# using naive-bayes classifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, Y_train)


from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, Y_test))
print(classification_report(Y_test, y_pred))


accuracy 0.984984984984985
                    precision    recall  f1-score   support

              Acne       1.00      1.00      1.00       138
         Back pain       0.99      1.00      0.99        97
     Blurry vision       1.00      1.00      1.00       105
   Body feels weak       1.00      1.00      1.00        95
             Cough       1.00      1.00      1.00       112
          Ear ache       1.00      1.00      1.00       105
    Emotional pain       0.99      1.00      1.00       100
      Feeling cold       1.00      1.00      1.00       100
     Feeling dizzy       1.00      1.00      1.00        94
         Foot ache       1.00      0.99      0.99        96
  Hair falling out       1.00      0.99      1.00       105
    Hard to breath       1.00      1.00      1.00       112
         Head ache       0.95      1.00      0.98       101
       Heart hurts       0.93      0.99      0.96       113
    Infected wound       1.00      1.00      1.00       122
Injury from 

In [21]:
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

model1 = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd = CalibratedClassifierCV(model1)


sgd.fit(X_train, Y_train)



y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, Y_test))
print(classification_report(Y_test, y_pred))


accuracy 0.9887387387387387
                    precision    recall  f1-score   support

              Acne       1.00      1.00      1.00       138
         Back pain       0.99      1.00      0.99        97
     Blurry vision       1.00      1.00      1.00       105
   Body feels weak       1.00      1.00      1.00        95
             Cough       1.00      1.00      1.00       112
          Ear ache       1.00      1.00      1.00       105
    Emotional pain       0.99      0.96      0.97       100
      Feeling cold       1.00      1.00      1.00       100
     Feeling dizzy       1.00      1.00      1.00        94
         Foot ache       1.00      0.99      0.99        96
  Hair falling out       1.00      0.99      1.00       105
    Hard to breath       1.00      1.00      1.00       112
         Head ache       0.99      1.00      1.00       101
       Heart hurts       0.97      0.99      0.98       113
    Infected wound       1.00      1.00      1.00       122
Injury from

In [22]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, Y_train)

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, Y_test))
print(classification_report(Y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy 0.996996996996997
                    precision    recall  f1-score   support

              Acne       1.00      1.00      1.00       138
         Back pain       0.99      1.00      0.99        97
     Blurry vision       1.00      1.00      1.00       105
   Body feels weak       1.00      1.00      1.00        95
             Cough       1.00      1.00      1.00       112
          Ear ache       1.00      1.00      1.00       105
    Emotional pain       1.00      1.00      1.00       100
      Feeling cold       1.00      1.00      1.00       100
     Feeling dizzy       1.00      1.00      1.00        94
         Foot ache       1.00      0.99      0.99        96
  Hair falling out       1.00      0.99      1.00       105
    Hard to breath       1.00      1.00      1.00       112
         Head ache       1.00      1.00      1.00       101
       Heart hurts       1.00      0.99      1.00       113
    Infected wound       1.00      1.00      1.00       122
Injury from 

In [23]:
from sklearn.ensemble import VotingClassifier

combined_model = VotingClassifier(estimators=[('nb', nb), ('sgd', sgd), ('logreg', logreg)],voting='soft', weights=[2, 2, 1])

combined_model.fit(X_train,Y_train)

y_pred = combined_model.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, Y_test))
print(classification_report(Y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy 0.9928678678678678
                    precision    recall  f1-score   support

              Acne       1.00      1.00      1.00       138
         Back pain       0.99      1.00      0.99        97
     Blurry vision       1.00      1.00      1.00       105
   Body feels weak       1.00      1.00      1.00        95
             Cough       1.00      1.00      1.00       112
          Ear ache       1.00      1.00      1.00       105
    Emotional pain       0.99      0.96      0.97       100
      Feeling cold       1.00      1.00      1.00       100
     Feeling dizzy       1.00      1.00      1.00        94
         Foot ache       1.00      0.99      0.99        96
  Hair falling out       1.00      0.99      1.00       105
    Hard to breath       1.00      1.00      1.00       112
         Head ache       0.99      1.00      1.00       101
       Heart hurts       0.97      0.99      0.98       113
    Infected wound       1.00      1.00      1.00       122
Injury from