In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
import pandas as pd
import re
import requests
import numpy as np
from bs4 import BeautifulSoup

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

import spacy
#I'll use spacy as it seemed like a good option to lemmatize with the appropriate pos tag, detects pronouns and superlative 
#forms of words.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

stop_words = list(stopwords.words('english'))
stop_words.append('-PRON-') #remove it as it won't be necessary in our model
stop_words.append('p')
stop_words.append('nbsp') #this is the '<p>' and &nbsp from the html conversion

# 1. Import and Clean TherapyBotSession

User's input is tagged as 'escalate' (1) or 'do not escalate' (0).

In [3]:
#Read the file:
therapy_bot_session = pd.read_csv('../data/therapybotsession.txt')

#Remove unnecessary columns, get label as numerical:
therapy_bot_session.drop(['id','Unnamed: 3','Unnamed: 4','Unnamed: 5','Unnamed: 6'], axis=1, inplace=True)
therapy_bot_session.label.replace({'escalate':1, 'do_not_escalate':0}, inplace=True)

#Check missing values:
therapy_bot_session.info() #No missing values here

#Clean the text:
def clean_chat(s):
    s = s.lower()                   #remove caps to avoid double words
    s = re.sub('[\W\d]', ' ', s)    #remove special signs, punctuation, numbers
    s = re.sub(' +', ' ', s)        #remove excessive spaces
    s = s.strip()                   #remove first and last spaces
    return s

therapy_bot_session.chat = therapy_bot_session.chat.apply(clean_chat)

#Lemmatize and Tokenize the text:
"""This dataset is pretty short, I don't wanna loose too many words so i'll just lemmatize to make sure that the words I keep
are still real words."""

#therapy_bot_session.chat = therapy_bot_session.chat.apply(lambda x: [lemmatizer.lemmatize(word) for word in x]) I need POS!!
def spacy_lem(l):
    doc = nlp(l)
    return [token.lemma_ for token in doc]

therapy_bot_session.chat = therapy_bot_session.chat.apply(spacy_lem)

#Remove all stop words:

therapy_bot_session.chat = therapy_bot_session.chat.apply(lambda x: [word for word in x if word not in stop_words])

therapy_bot_session.to_csv('../data/therapy_bot_session_clean.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   100 non-null    int64 
 1   chat    100 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


# 2. Get Data from CounselChat
Online conversations between a client and a therapist, categorised by topic.

In [4]:
chat1 = pd.read_csv('../data/counsel-chat.txt')
chat2 = pd.read_csv('../data/scrap-counsel-chat.txt') 
# 2 datasets from the same website, let's try to merge them.

In [5]:
chat1.head()

Unnamed: 0,questionID,questionTitle,questionText,questionUrl,topics,therapistName,therapistUrl,answerText,upvotes
0,5566fab2a64752d71ec3ca69,Escalating disagreements between mother and wife,My wife and mother are having tense disagreeme...,https://counselchat.com/questions/escalating-d...,Family Conflict,"Kristi King-Morgan, LMSW",https://counselchat.com/therapists/kristi-king...,<p>What you are describing is something psycho...,0
1,5566f94fa64752d71ec3ca64,I'm addicted to smoking. How can I stop?,"I'm planning to have baby, so I have to quit s...",https://counselchat.com/questions/i-m-addicted...,"Substance Abuse,Addiction",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi. Good for you in planning ahead to do wh...,0
2,5567d26887a1cc0c3f3d8f46,Keeping secrets from my family,"I have secrets in my mind, and I don't know wh...",https://counselchat.com/questions/keeping-secr...,Family Conflict,Jeevna Bajaj,https://counselchat.com/therapists/jeevna-bajaj,<p>It sounds like keeping the secrets has beco...,0
3,556bed15c969ba5861709df5,The Underlying Causes of Being Possessive,I am extremely possessive in my relationships ...,https://counselchat.com/questions/the-underlyi...,"Behavioral Change,Social Relationships",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi there. It's great you are able to realiz...,0
4,556ba115c969ba5861709de6,Can I control anxiety without medication?,I had a head injury a few years ago and my min...,https://counselchat.com/questions/can-i-contro...,Anxiety,Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>You didn't say what or how many medications...,0


In [6]:
chat2.head()

Unnamed: 0.1,Unnamed: 0,questionID,questionTitle,questionText,questionLink,topic,therapistInfo,therapistURL,answerText,upvotes,views,split
0,0,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Sherry Katz, LCSWCouples and Family Therapist,...",https://counselchat.com/therapists/sherry-katz...,"If everyone thinks you're worthless, then mayb...",1,2899,train
1,1,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Robin Landwehr, DBH, LPCC, NCCMental Health in...",https://counselchat.com/therapists/robin-landw...,"Hello, and thank you for your question and see...",1,3514,train
2,2,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,Lee KingI use an integrative approach to treat...,https://counselchat.com/therapists/lee-king,First thing I'd suggest is getting the sleep y...,0,5,train
3,3,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Shauntai Davis-YearginPersonalized, private on...",https://counselchat.com/therapists/shauntai-da...,Therapy is essential for those that are feelin...,0,31,train
4,4,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,Jordan WhiteLicensed Social Worker at Oak Root...,https://counselchat.com/therapists/jordan-white,I first want to let you know that you are not ...,0,620,train


In [7]:
#The datasets are similar, let's drop the unnecessary columns then merge them

chat1.drop(['questionID','questionUrl','topics','therapistName','therapistUrl','upvotes'], axis=1, inplace=True)
chat2.drop(['Unnamed: 0','questionID','questionLink','topic','therapistInfo','therapistURL','upvotes','views','split'], axis=1, inplace=True)
#Ok topics are important, but we'll use them later, in a different way, i'd rather make different datasets.

#chat1.columns == chat2.columns

counsel = pd.concat([chat1, chat2])
counsel

Unnamed: 0,questionTitle,questionText,answerText
0,Escalating disagreements between mother and wife,My wife and mother are having tense disagreeme...,<p>What you are describing is something psycho...
1,I'm addicted to smoking. How can I stop?,"I'm planning to have baby, so I have to quit s...",<p>Hi. Good for you in planning ahead to do wh...
2,Keeping secrets from my family,"I have secrets in my mind, and I don't know wh...",<p>It sounds like keeping the secrets has beco...
3,The Underlying Causes of Being Possessive,I am extremely possessive in my relationships ...,<p>Hi there. It's great you are able to realiz...
4,Can I control anxiety without medication?,I had a head injury a few years ago and my min...,<p>You didn't say what or how many medications...
...,...,...,...
2124,What happens in a counseling session?,"After first meeting the client, what is the pr...",There are probably no two therapists alike bec...
2125,What happens in a counseling session?,"After first meeting the client, what is the pr...","Each counselor may have a different process, s..."
2126,What happens in a counseling session?,"After first meeting the client, what is the pr...","After meeting a client, many Counselors will a..."
2127,What happens in a counseling session?,"After first meeting the client, what is the pr...",A good therapist will discuss what brought you...


Ok so now that we have our merge data counsel, let's see how we can correct it before tokenize the text and train our chatbot with it.

1. First some questions only have a title, let's merge question title and text into a single question column.

2. We see that some questions had different answers, let's group by questions and gather the answers. They come from real therapists and all answer the same subject. This might reduce a lot the quantity of data though..

2. Then some answer still have an html format, let's remove all < text >
    


In [8]:
counsel['question'] = counsel.questionTitle.fillna('') +' '+ counsel.questionText.fillna('')
counsel.drop(['questionTitle', 'questionText'], axis=1, inplace=True)

In [9]:
counsel = counsel.groupby('question', as_index=False, sort=False).agg(lambda row: ' '.join(row))

In [10]:
counsel.answerText = counsel.answerText.str.replace('<\w+>', '', regex=True)
counsel

Unnamed: 0,question,answerText
0,Escalating disagreements between mother and wi...,What you are describing is something psycholog...
1,I'm addicted to smoking. How can I stop? I'm p...,Hi. Good for you in planning ahead to do what'...
2,Keeping secrets from my family I have secrets ...,It sounds like keeping the secrets has become ...
3,The Underlying Causes of Being Possessive I am...,Hi there. It's great you are able to realize t...
4,Can I control anxiety without medication? I ha...,You didn't say what or how many medications yo...
...,...,...
1165,How do therapists characterize personality typ...,"This is a great question, and I am happy to ha..."
1166,Why can't I stop these thoughts? I keep having...,I think we all go through a period of time whe...
1167,I think I empathize too much I empathize so mu...,Sometimes it's helps to have a name for a prob...
1168,How would I know if I have the right therapist...,Finding the right therapist for you is very im...


# 3. Clean, Tokenize Counsel

In [11]:
counsel.question = counsel.question.apply(clean_chat)
counsel.answerText = counsel.answerText.apply(clean_chat)

counsel.question = counsel.question.apply(spacy_lem)
counsel.answerText = counsel.answerText.apply(spacy_lem)

counsel

Unnamed: 0,question,answerText
0,"[escalate, disagreement, between, mother, and,...","[what, -PRON-, be, describe, be, something, ps..."
1,"[i, m, addict, to, smoke, how, can, i, stop, i...","[hi, good, for, -PRON-, in, plan, ahead, to, d..."
2,"[keep, secret, from, -PRON-, family, i, have, ...","[-PRON-, sound, like, keep, the, secret, have,..."
3,"[the, underlie, cause, of, be, possessive, i, ...","[hi, there, -PRON-, s, great, -PRON-, be, able..."
4,"[can, i, control, anxiety, without, medication...","[-PRON-, didn, t, say, what, or, how, many, me..."
...,...,...
1165,"[how, do, therapist, characterize, personality...","[this, be, a, great, question, and, i, be, hap..."
1166,"[why, can, t, i, stop, these, thought, i, keep...","[i, think, -PRON-, all, go, through, a, period..."
1167,"[i, think, i, empathize, too, much, i, empathi...","[sometimes, -PRON-, s, help, to, have, a, name..."
1168,"[how, would, i, know, if, i, have, the, right,...","[find, the, right, therapist, for, -PRON-, be,..."


In [12]:
counsel.question = counsel.question.apply(lambda x: [word for word in x if word not in stop_words])
counsel.answerText = counsel.answerText.apply(lambda x: [word for word in x if word not in stop_words])
#we already see some dupplicates in the pairs, let's keep only the unique words.

counsel.question = counsel.question.apply(np.unique)
counsel.answerText = counsel.answerText.apply(np.unique)
counsel.to_csv('../data/counsel_clean.txt', index=False)

In [13]:
'''#Vectorize:
"""all_words = nltk.FreqDist(bow)
#Get 5000 most common words
word_tuples = all_words.most_common(5000)
word_features = [x[0] for x in word_tuples]
#word_tuples
#word_features"""

word_question = list(counsel.question)
bow_question = [word for lst in word_question for word in lst]
#len(bow_question)
word_answer = list(counsel.answerText)
bow_answer = [word for lst in word_answer for word in lst]
len(bow_answer)
# Answer are way longer than questions, but that's because I group the data by question.'''




'#Vectorize:\n"""all_words = nltk.FreqDist(bow)\n#Get 5000 most common words\nword_tuples = all_words.most_common(5000)\nword_features = [x[0] for x in word_tuples]\n#word_tuples\n#word_features"""\n\nword_question = list(counsel.question)\nbow_question = [word for lst in word_question for word in lst]\n#len(bow_question)\nword_answer = list(counsel.answerText)\nbow_answer = [word for lst in word_answer for word in lst]\nlen(bow_answer)\n# Answer are way longer than questions, but that\'s because I group the data by question.'

# Data for Classifier

In [32]:
chatc1 = pd.read_csv('../data/counsel-chat.txt')
chatc2 = pd.read_csv('../data/scrap-counsel-chat.txt') 

chatc1.drop(['questionID','questionUrl','therapistName','therapistUrl','upvotes'], axis=1, inplace=True)
chatc2.drop(['Unnamed: 0','questionID','questionLink','therapistInfo','therapistURL','upvotes','views','split'], axis=1, inplace=True)

chatc1.rename(columns={'topics':'topic'}, inplace=True)

counsel_cat = pd.concat([chatc1, chatc2])

counsel_cat['sentence'] = counsel_cat.questionTitle.fillna('') +' '+ counsel_cat.questionText.fillna('') +' '+ counsel_cat.answerText.fillna('')
counsel_cat.drop(['questionTitle', 'questionText', 'answerText'], axis=1, inplace=True)

counsel_cat.topic = counsel_cat.topic.apply(lambda x: str(x).lower())
counsel_cat.topic = counsel_cat.topic.apply(lambda x: str(x).strip())

counsel_cat.replace('nan', np.nan, inplace=True)
counsel_cat.dropna(inplace=True)

counsel_cat.replace('-', ' ', regex=True, inplace=True)

topic = counsel_cat.topic.unique()
topic = [str(word).split(',') for word in topic]
topic = [word for lst in topic for word in lst]
#Ok that's a lot of topics, let's try to reduce them not by deleting but make some topics more general
topic = np.unique(topic)

"""
- Substance abuse and Addiction are related topics -> ADDICTION,
- Anxiety and stress are realted topics -> STRESS,
- Relationships, social relationships, relationship dissolution, Marriage are related topics -> RELATIONSHIPS,
- Children and Adolescents, Family Conflict, Parenting, Alzheimer's are related topics -> FAMILY,
- Career Counseling, Professional Ethics, Workplace Relationship are related topics -> WORKPLACE,
- Human Sexuality and Intimacy -> SEXUALITY
- Counseling fundamentals, Legal & Regulatory, Military Issues and Diagnosis -> COUNSELING
- Behaviorall change, anger management -> BEHAVIOR

'SPIRITUALITY' = 1
'COUNSELING' = 2

'WORKPLACE' = 3
'FAMILY' = 4
'RELATIONSHIPS' = 5
'SLEEP' = 6
'BEHAVIOR' = 7
'SEXUALITY' = 8
'SELF_ESTEEM' = 9
'GRIEF' = 10
'TRAUMA' = 11

'STRESS' = 12
'EATING_DISORDERS' = 13
'ADDICTION' = 14
'DEPRESSION' = 15
'LGBTQ' = 16

'DOMESTIC_VIOLENCE' = 17
'SELF_HARM' = 18
"""

topic_dict = {'addiction':'14',
              "alzheimer's":'4',
              'anger management':'7',
              'anxiety':'12',
              'behavioral change':'7',
              'career counseling':'3',
              'children & adolescents':'4',
              'children adolescents':'4',
              'counseling fundamentals':'2',
              'depression':'15',
              'diagnosis':'2',
              'domestic violence':'17',
              'eating disorders':'13',
              'family conflict':'4',
              'grief and loss':'10',
              'human sexuality':'8',
              'intimacy':'8',
              'lgbtq':'16',
              'legal & regulatory':'2',
              'legal regulatory':'2',
              'marriage':'5',
              'military issues':'2',
              'parenting':'4',
              'professional ethics':'3',
              'dissolution':'5',
              'relationships':'5',
              'relationship':'5',
              'self esteem':'9',
              'self harm':'18',
              'sleep improvement':'6',
              'social':'5',
              'spirituality':'1',
              'stress':'12',
              'substance abuse':'14',
              'trauma':'11',
              'workplace':'3'}

for key in topic_dict:
    counsel_cat.topic = counsel_cat.topic.str.replace(key, topic_dict[key], regex=False)

#topic_red = counsel_cat.topic.unique()

def sort_topics(s):
    s = str(s)
    s = re.findall('\d+', s)
    s = [int(number) for number in s]
    return max(s)  

#sort_topics(counsel_cat.topic[1])

counsel_cat.topic = counsel_cat.topic.apply(sort_topics)

def explode_text(s):
    s = re.split(r'[.?]\s*', s)
    return s

counsel_cat.sentence = counsel_cat.sentence.apply(explode_text)

counsel_topic = counsel_cat.explode('sentence')

counsel_topic

Unnamed: 0,topic,sentence
0,4,Escalating disagreements between mother and wi...
0,4,"In the past, they’ve had minor differences"
0,4,"For example, my wife would complain to me my m..."
0,4,"However, it’s intensified lately"
0,4,I think the cause is my wife talked back to he...
...,...,...
2128,2,The therapist is trained to listen for your em...
2128,2,And to open these up to you in a kind and safe...
2128,2,The therapist and you will refine your thinkin...
2128,2,The therapist will ask questions to help you p...


In [33]:
counsel_topic.sentence = counsel_topic.sentence.apply(clean_chat)
counsel_topic.sentence = counsel_topic.sentence.apply(spacy_lem)

counsel_topic

Unnamed: 0,topic,sentence
0,4,"[escalate, disagreement, between, mother, and,..."
0,4,"[in, the, past, -PRON-, ve, have, minor, diffe..."
0,4,"[for, example, -PRON-, wife, would, complain, ..."
0,4,"[however, -PRON-, s, intensify, lately]"
0,4,"[i, think, the, cause, be, -PRON-, wife, talk,..."
...,...,...
2128,2,"[the, therapist, be, train, to, listen, for, -..."
2128,2,"[and, to, open, these, up, to, -PRON-, in, a, ..."
2128,2,"[the, therapist, and, -PRON-, will, refine, -P..."
2128,2,"[the, therapist, will, ask, question, to, help..."


In [34]:
counsel_topic.sentence = counsel_topic.sentence.apply(lambda x: np.unique([word for word in x if word not in stop_words]))
counsel_topic.head()

Unnamed: 0,topic,sentence
0,4,"[disagreement, escalate, mother, tense, wife]"
0,4,"[difference, minor, past]"
0,4,"[complain, example, lazy, mother, overbearing,..."
0,4,"[however, intensify, lately]"
0,4,"[back, cause, talk, think, wife]"


In [35]:
all_sentence = list(counsel_topic.sentence)
bow_sentence = [word for lst in all_sentence for word in lst]
all_words = nltk.FreqDist(bow_sentence)

#Get 3000 most common words
word_tuples = all_words.most_common(3000)
word_features = [x[0] for x in word_tuples]

#word_tuples
word_features




['feel',
 'may',
 'like',
 'get',
 'help',
 'know',
 'want',
 'relationship',
 'time',
 'go',
 'would',
 'good',
 'make',
 'way',
 'thing',
 'people',
 'need',
 'one',
 'work',
 'therapist',
 'life',
 'say',
 'find',
 'think',
 'also',
 'talk',
 'take',
 'well',
 'try',
 'see',
 'tell',
 'family',
 'feeling',
 'someone',
 'love',
 'anxiety',
 'something',
 'issue',
 'person',
 'many',
 'therapy',
 'ask',
 'start',
 'really',
 'even',
 'style',
 'give',
 'change',
 'span',
 'friend',
 'question',
 'year',
 'right',
 'could',
 'first',
 'child',
 'experience',
 'thought',
 'come',
 'counseling',
 'lot',
 'boyfriend',
 'always',
 'self',
 'counselor',
 'able',
 'depression',
 'much',
 'parent',
 'never',
 'problem',
 'normal',
 'look',
 'happen',
 'situation',
 'font',
 'husband',
 'understand',
 'support',
 'partner',
 'learn',
 'care',
 'important',
 'trust',
 'sometimes',
 'new',
 'pt',
 'use',
 'br',
 'keep',
 'back',
 'long',
 'let',
 'still',
 'sex',
 'day',
 'stop',
 'line',
 'past

In [37]:
def vectorize_topic(l):
    new_list = []
    for word in word_features:
        if word in l:
            new_list.append(1)
        else:
            new_list.append(0)
    return new_list

counsel_topic.sentence = counsel_topic.sentence.apply(vectorize_topic)

counsel_topic.head()

  after removing the cwd from sys.path.


Unnamed: 0,topic,sentence
0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [39]:
counsel_topic.to_csv('../data/counsel_vect_topic.csv', index=False, sep='\t')

In [49]:
counsel_topic.reset_index(inplace=True)
counsel_topic.to_json('../data/counsel_vect_topic.json')

In [40]:
counsel_test = pd.read_csv('../data/counsel_vect_topic.csv', sep='\t')
counsel_test

Unnamed: 0,topic,sentence
0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
60673,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
60674,2,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
60675,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
60676,2,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [43]:
type(counsel_test.sentence[0])

str

In [51]:
counsel_test2 = pd.read_json('../data/counsel_vect_topic.json')
type(counsel_test2.sentence[0])

list