### Therapy chatbot - Intent classification

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vlekkala\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vlekkala\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
#!wget -O Sheet_1.csv https://www.dropbox.com/s/2h4bggnpsze0rkf/Sheet_1.csv --no-check-certificate

In [5]:
df = pd.read_csv('Sheet_1.csv')
df.head()

Unnamed: 0,response_id,class,response_text,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,response_1,not_flagged,I try and avoid this sort of conflict,,,,,
1,response_2,flagged,Had a friend open up to me about his mental ad...,,,,,
2,response_3,flagged,I saved a girl from suicide once. She was goin...,,,,,
3,response_4,not_flagged,i cant think of one really...i think i may hav...,,,,,
4,response_5,not_flagged,Only really one friend who doesn't fit into th...,,,,,


In [6]:
df = df.drop(columns=['response_id', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7'])
df.head()

Unnamed: 0,class,response_text
0,not_flagged,I try and avoid this sort of conflict
1,flagged,Had a friend open up to me about his mental ad...
2,flagged,I saved a girl from suicide once. She was goin...
3,not_flagged,i cant think of one really...i think i may hav...
4,not_flagged,Only really one friend who doesn't fit into th...


In [7]:
# Load and analyze the dataset
df.shape

(80, 2)

In [8]:
df.isna().sum()

class            0
response_text    0
dtype: int64

In [9]:
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])
df.head()

Unnamed: 0,class,response_text
0,1,I try and avoid this sort of conflict
1,0,Had a friend open up to me about his mental ad...
2,0,I saved a girl from suicide once. She was goin...
3,1,i cant think of one really...i think i may hav...
4,1,Only really one friend who doesn't fit into th...


In [17]:
from nltk.corpus import stopwords
import string
from nltk import WordNetLemmatizer

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def clean_text(sentence):
    sentence = sentence.lower()
    words = word_tokenize(sentence)
    tokens = [lemmatizer.lemmatize(word) for word in words 
             if word not in stop_words and word not in string.punctuation]
    return " ".join(tokens)

In [19]:
df['cleaned_text'] = df['response_text'].apply(clean_text)
df.head()

Unnamed: 0,class,response_text,cleaned_text
0,1,I try and avoid this sort of conflict,try avoid sort conflict
1,0,Had a friend open up to me about his mental ad...,friend open mental addiction weed taking life ...
2,0,I saved a girl from suicide once. She was goin...,saved girl suicide going swallow bunch pill ta...
3,1,i cant think of one really...i think i may hav...,cant think one really ... think may indirectly
4,1,Only really one friend who doesn't fit into th...,really one friend n't fit category therapist c...


In [36]:
tfidf = TfidfVectorizer(lowercase=True)
model = MultinomialNB()

X = df['cleaned_text']
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

clf = Pipeline([('tfidf', tfidf), ('model', model)])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.71      1.00      0.83        17

    accuracy                           0.71        24
   macro avg       0.35      0.50      0.41        24
weighted avg       0.50      0.71      0.59        24



  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
df['class'].value_counts()

1    55
0    25
Name: class, dtype: int64

In [None]:
k_fold = KFold(c)

In [30]:
help(KFold)

Help on class KFold in module sklearn.model_selection._split:

class KFold(_BaseKFold)
 |  KFold(n_splits=5, *, shuffle=False, random_state=None)
 |  
 |  K-Folds cross-validator
 |  
 |  Provides train/test indices to split data in train/test sets. Split
 |  dataset into k consecutive folds (without shuffling by default).
 |  
 |  Each fold is then used once as a validation while the k - 1 remaining
 |  folds form the training set.
 |  
 |  Read more in the :ref:`User Guide <cross_validation>`.
 |  
 |  Parameters
 |  ----------
 |  n_splits : int, default=5
 |      Number of folds. Must be at least 2.
 |  
 |      .. versionchanged:: 0.22
 |          ``n_splits`` default value changed from 3 to 5.
 |  
 |  shuffle : bool, default=False
 |      Whether to shuffle the data before splitting into batches.
 |      Note that the samples within each split will not be shuffled.
 |  
 |  random_state : int or RandomState instance, default=None
 |      When `shuffle` is True, `random_state` af

In [31]:
from sklearn.model_selection import cross_validate
help(cross_validate)

Help on function cross_validate in module sklearn.model_selection._validation:

cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score=False, return_estimator=False, error_score=nan)
    Evaluate metric(s) by cross-validation and also record fit/score times.
    
    Read more in the :ref:`User Guide <multimetric_cross_validation>`.
    
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    
    X : array-like of shape (n_samples, n_features)
        The data to fit. Can be for example a list, or an array.
    
    y : array-like of shape (n_samples,) or (n_samples, n_outputs),             default=None
        The target variable to try to predict in the case of
        supervised learning.
    
    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitt

In [35]:
cross_validate(clf, X, y, cv=5, scoring='accuracy')

{'fit_time': array([0.00608587, 0.01112747, 0.00500536, 0.00500083, 0.00602865]),
 'score_time': array([0.        , 0.00199962, 0.00100112, 0.00199771, 0.00199795]),
 'test_score': array([0.6875, 0.6875, 0.6875, 0.6875, 0.6875])}