In [16]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
import nltk
# Fetch only the stop-word corpus (the one NLTK corpus the visible cells read).
# Calling nltk.download() with no arguments opens the interactive downloader,
# which stalls an unattended "Restart & Run All".
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [22]:
import spacy
# spaCy is used for lemmatization: it lemmatizes with the appropriate POS tag
# and handles pronouns and superlative word forms. The parser and NER
# components are disabled when the pipeline is loaded.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# NLTK's English stop words plus a few tokens that add nothing to the model.
stop_words = list(stopwords.words('english'))
stop_words.extend([
    '-PRON-',  # presumably the lemma spaCy emits for pronouns - confirm for the installed version
    'p',       # residue of '<p>' left over from the HTML conversion
    'nbsp',    # residue of '&nbsp' left over from the HTML conversion
])

#Clean the text:
def clean_chat(s):
    """Normalize a piece of chat text into lowercase, space-separated words.

    The value is converted with ``str()``, lowercased, every non-word
    character and every digit is replaced by a space, runs of spaces are
    collapsed to one, and leading/trailing whitespace is removed.
    Underscores survive because ``\\W`` does not match them.

    Parameters
    ----------
    s : any
        Text to clean; non-string values are stringified first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    s = str(s).lower()                # remove caps to avoid double words
    # Raw strings keep \W and \d as regex escapes instead of (invalid)
    # string escapes, which newer Python versions warn about.
    s = re.sub(r'[\W\d]', ' ', s)     # remove special signs, punctuation, numbers
    s = re.sub(r' +', ' ', s)         # remove excessive spaces
    return s.strip()                  # remove first and last spaces

def spacy_lem(l):
    """Run the text ``l`` through the ``nlp`` pipeline and return each token's lemma.

    Returns a list with one ``lemma_`` string per token, in document order.
    """
    lemmas = []
    for token in nlp(l):
        lemmas.append(token.lemma_)
    return lemmas

In [18]:
# Load the two counsel-chat exports.
chatc1 = pd.read_csv('../data/counsel-chat.txt')
chatc2 = pd.read_csv('../data/scrap-counsel-chat.txt')

# Keep only the columns used below. Frames are re-assigned rather than mutated
# with inplace=True so each step has an explicit input and output.
chatc1 = chatc1.drop(['questionID','questionUrl','therapistName','therapistUrl','upvotes'], axis=1)
chatc2 = chatc2.drop(['Unnamed: 0','questionID','questionLink','therapistInfo','therapistURL','upvotes','views','split'], axis=1)

# Align the topic column name of the first export with the second one.
chatc1 = chatc1.rename(columns={'topics':'topic'})

counsel_cat = pd.concat([chatc1, chatc2])

# One text field per row: title + question + answer (missing parts become '').
counsel_cat['sentence'] = (
    counsel_cat.questionTitle.fillna('')
    + ' ' + counsel_cat.questionText.fillna('')
    + ' ' + counsel_cat.answerText.fillna('')
)
counsel_cat = counsel_cat.drop(['questionTitle', 'questionText', 'answerText'], axis=1)

# Normalise topic labels: stringify, lowercase, trim.
counsel_cat['topic'] = counsel_cat.topic.apply(lambda x: str(x).lower().strip())

# Missing topics were stringified to 'nan' above; turn them back into NaN and
# drop every row that has a missing value.
counsel_cat = counsel_cat.replace('nan', np.nan).dropna()

# Replace hyphens with spaces in every column.
counsel_cat = counsel_cat.replace('-', ' ', regex=True)

# Vocabulary of individual topic labels (rows may carry several, comma-separated).
# Tokens are stripped so that 'a, b' and 'a,b' do not yield distinct labels.
topic = [
    word.strip()
    for entry in counsel_cat.topic.unique()
    for word in str(entry).split(',')
    if word.strip()
]
#Ok that's a lot of topics, let's try to reduce them not by deleting but make some topics more general
topic = np.unique(topic)

"""
- Substance abuse and Addiction are related topics -> ADDICTION,
- Anxiety and stress are related topics -> STRESS,
- Relationships, social relationships, relationship dissolution, Marriage are related topics -> RELATIONSHIPS,
- Children and Adolescents, Family Conflict, Parenting, Alzheimer's are related topics -> FAMILY,
- Career Counseling, Professional Ethics, Workplace Relationship are related topics -> WORKPLACE,
- Human Sexuality and Intimacy -> SEXUALITY
- Counseling fundamentals, Legal & Regulatory, Military Issues and Diagnosis -> COUNSELING
- Behavioral change, anger management -> BEHAVIOR

'SPIRITUALITY' = 1
'COUNSELING' = 2

'WORKPLACE' = 3
'FAMILY' = 4
'RELATIONSHIPS' = 5
'SLEEP' = 6
'BEHAVIOR' = 7
'SEXUALITY' = 8
'SELF_ESTEEM' = 9
'GRIEF' = 10
'TRAUMA' = 11

'STRESS' = 12
'EATING_DISORDERS' = 13
'ADDICTION' = 14
'DEPRESSION' = 15
'LGBTQ' = 16

'DOMESTIC_VIOLENCE' = 17
'SELF_HARM' = 18
"""

# Maps each raw topic phrase to the string form of its merged category code.
# Insertion order is significant: the replacement loop that follows applies the
# keys in this order (e.g. 'relationships' must be handled before 'relationship').
topic_dict = {
    'addiction': '14',               # ADDICTION
    "alzheimer's": '4',              # FAMILY
    'anger management': '7',         # BEHAVIOR
    'anxiety': '12',                 # STRESS
    'behavioral change': '7',        # BEHAVIOR
    'career counseling': '3',        # WORKPLACE
    'children & adolescents': '4',   # FAMILY
    'children adolescents': '4',     # FAMILY
    'counseling fundamentals': '2',  # COUNSELING
    'depression': '15',              # DEPRESSION
    'diagnosis': '2',                # COUNSELING
    'domestic violence': '17',       # DOMESTIC_VIOLENCE
    'eating disorders': '13',        # EATING_DISORDERS
    'family conflict': '4',          # FAMILY
    'grief and loss': '10',          # GRIEF
    'human sexuality': '8',          # SEXUALITY
    'intimacy': '8',                 # SEXUALITY
    'lgbtq': '16',                   # LGBTQ
    'legal & regulatory': '2',       # COUNSELING
    'legal regulatory': '2',         # COUNSELING
    'marriage': '5',                 # RELATIONSHIPS
    'military issues': '2',          # COUNSELING
    'parenting': '4',                # FAMILY
    'professional ethics': '3',      # WORKPLACE
    'dissolution': '5',              # RELATIONSHIPS
    'relationships': '5',            # RELATIONSHIPS
    'relationship': '5',             # RELATIONSHIPS
    'self esteem': '9',              # SELF_ESTEEM
    'self harm': '18',               # SELF_HARM
    'sleep improvement': '6',        # SLEEP
    'social': '5',                   # RELATIONSHIPS
    'spirituality': '1',             # SPIRITUALITY
    'stress': '12',                  # STRESS
    'substance abuse': '14',         # ADDICTION
    'trauma': '11',                  # TRAUMA
    'workplace': '3',                # WORKPLACE
}

# Swap every known topic phrase for its numeric code, one literal (non-regex)
# substitution per dictionary entry, in dictionary order.
for phrase, code in topic_dict.items():
    counsel_cat['topic'] = counsel_cat['topic'].str.replace(phrase, code, regex=False)

#topic_red = counsel_cat.topic.unique()

def sort_topics(s):
    """Return the largest integer code found in a topic string.

    Parameters
    ----------
    s : any
        Value holding one or more numeric topic codes (e.g. ``'4, 12'``);
        it is stringified before scanning.

    Returns
    -------
    int
        The maximum of all digit runs found in ``s``.

    Raises
    ------
    ValueError
        If ``s`` contains no digits at all, i.e. no topic phrase was mapped
        to a code.
    """
    # Raw string so \d is a regex escape rather than an invalid string escape.
    codes = [int(number) for number in re.findall(r'\d+', str(s))]
    if not codes:
        # max([]) would also raise ValueError, but without saying which value
        # failed; name the offending input instead.
        raise ValueError('no numeric topic code found in {!r}'.format(s))
    return max(codes)

#sort_topics(counsel_cat.topic[1])

# Collapse each row's topic string to the single integer that sort_topics returns.
counsel_cat['topic'] = counsel_cat['topic'].apply(sort_topics)

counsel_cat

Unnamed: 0,topic,sentence
0,4,Escalating disagreements between mother and wi...
1,14,I'm addicted to smoking. How can I stop? I'm p...
2,4,Keeping secrets from my family I have secrets ...
3,7,The Underlying Causes of Being Possessive I am...
4,12,Can I control anxiety without medication? I ha...
...,...,...
2124,2,What happens in a counseling session? After fi...
2125,2,What happens in a counseling session? After fi...
2126,2,What happens in a counseling session? After fi...
2127,2,What happens in a counseling session? After fi...


In [23]:
# Size of the bag-of-words vocabulary kept as model features.
N_WORD_FEATURES = 3000

# Normalise the raw text, then lemmatise it into a list of tokens.
counsel_cat.sentence = counsel_cat.sentence.apply(clean_chat)
counsel_cat.sentence = counsel_cat.sentence.apply(spacy_lem)

# Drop stop words and keep each remaining token once per row. A set makes the
# per-token membership test constant-time instead of scanning the whole list.
stop_word_set = set(stop_words)
counsel_cat.sentence = counsel_cat.sentence.apply(
    lambda x: np.unique([word for word in x if word not in stop_word_set]))

# Because every row contributes each token at most once, the frequencies below
# count the number of rows a word appears in.
all_sentence = list(counsel_cat.sentence)
bow_sentence = [word for lst in all_sentence for word in lst]
all_words = nltk.FreqDist(bow_sentence)

# Keep the N_WORD_FEATURES most common words as the feature vocabulary.
word_tuples = all_words.most_common(N_WORD_FEATURES)
word_features = [x[0] for x in word_tuples]

def vectorize_topic(l, features=None):
    """Encode a collection of tokens as a binary bag-of-words vector.

    Parameters
    ----------
    l : iterable of str
        Tokens present in one document (a list or array of words).
    features : sequence of str, optional
        Vocabulary defining the vector positions. Defaults to the
        module-level ``word_features``.

    Returns
    -------
    list of int
        One entry per feature word, in feature order: 1 if the word occurs
        in ``l``, otherwise 0.
    """
    if features is None:
        features = word_features
    # Build the lookup once; testing ``word in l`` against a list or array
    # would rescan the whole document for every feature word.
    present = set(l)
    return [1 if word in present else 0 for word in features]

# Replace each row's token collection with its 0/1 vector over word_features.
counsel_cat['sentence'] = counsel_cat['sentence'].apply(vectorize_topic)

In [68]:
# Inspect the first row by position. A label lookup (`[0]`) depends on the
# index: before the index is reset it still carries the labels of both
# concatenated frames, so label 0 may be duplicated or missing.
type(counsel_cat.sentence.iloc[0])

list

In [29]:
# Move the current index into a regular column (leaving a fresh 0..n-1 index),
# then persist the vectorised data set as JSON.
# NOTE(review): re-running this cell resets the index again - confirm intended.
counsel_cat.reset_index(inplace=True)
output_path = '../data/counsel_text_3000.json'
counsel_cat.to_json(output_path)

In [30]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import json

In [69]:
SEED = 42          # fixed seed so the shuffle (and therefore the split) is reproducible
TRAIN_SIZE = 2600  # number of shuffled rows used for training; the rest is the test set

# Shuffle all rows deterministically.
counsel = counsel_cat.sample(frac=1, random_state=SEED)

X = counsel.sentence
y = counsel.topic

# .iloc makes the positional intent explicit; the index holds shuffled integer
# labels, so plain [] slicing is easy to misread as label-based.
X_train = np.vstack(X.iloc[:TRAIN_SIZE].tolist())
X_test = np.vstack(X.iloc[TRAIN_SIZE:].tolist())

y_train = y.iloc[:TRAIN_SIZE]
y_test = y.iloc[TRAIN_SIZE:]

y_train

2510     7
1056    12
2865     4
2380     5
1520    15
        ..
2310     5
2507     7
1652    15
2351     5
79      12
Name: topic, Length: 2600, dtype: int64

In [70]:
# Fully connected classifier: three 1000-unit ReLU layers followed by a 19-way
# softmax. Presumably 19 outputs so the topic codes 1-18 can be used directly
# as sparse class indices (index 0 unused) - confirm.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(1000, activation=tf.nn.relu),
    tf.keras.layers.Dense(1000, activation=tf.nn.relu),
    tf.keras.layers.Dense(1000, activation=tf.nn.relu),
    tf.keras.layers.Dense(19, activation=tf.nn.softmax),
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2211cea5908>

In [71]:
# Score the model on the rows held out from training; the result unpacks into
# the loss followed by the accuracy metric requested at compile time.
test_scores = model.evaluate(X_test, y_test)
val_loss, val_acc = test_scores
print(val_loss, val_acc)

1.1861701011657715 0.7372627258300781


In [72]:
# Pass the feature matrix itself. Wrapping it in a list hands the Sequential
# model a nested input structure and triggers the "Consider rewriting this
# model with the Functional API" warning.
predic = model.predict(X_test)
print(predic)   # per-row class probabilities
print(y_test)   # true topic codes for the same rows

Consider rewriting this model with the Functional API.
[[4.9297766e-10 4.7433739e-05 1.1202094e-06 ... 5.4058578e-04
  3.3883800e-06 4.9867435e-07]
 [3.4647307e-10 3.1541713e-06 5.4678430e-06 ... 7.7109507e-05
  5.7253342e-06 7.2889868e-08]
 [6.3856127e-14 4.1078501e-08 3.4451190e-08 ... 1.9922632e-07
  5.4727035e-08 2.1641214e-10]
 ...
 [1.1167561e-11 5.8293222e-07 9.9702150e-01 ... 2.2194665e-06
  2.9729316e-07 6.5831905e-08]
 [1.3016181e-12 1.0136028e-08 3.3463814e-06 ... 8.6897370e-07
  1.5025297e-07 4.8088973e-09]
 [3.2794394e-12 9.4900074e-08 6.5463697e-05 ... 5.1656412e-09
  7.7361577e-08 1.3182629e-04]]
2822     4
1390    14
3002     5
2307     5
2074     4
        ..
445     15
2715     8
3515     2
1978    12
1562    15
Name: topic, Length: 1001, dtype: int64


In [67]:
# Index of the highest-probability class for the first test row.
print(predic[0].argmax())

15
