<h1 style = "text-align: center; color: #6E3A91;"> ASTROBOT </h1>
<h2 style = "text-align: center; font-weight: lighter;"> Boost your Mental Health </h2>

In [9]:
### Dependencies ###

In [20]:
import json, random, re, string, pickle
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
## Phase I - Data Cleaning ##

In [21]:
# Load the intents definition (patterns and tags) from disk.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding (which differs between operating systems).
with open('../data/root.json', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
## Phase II - Preparing data for training ##

In [23]:
# Natural Language Processing (NLP) #

In [24]:
# Shared NLP helpers, created once and reused for every pattern:
# a Treebank tokenizer to split sentences and a WordNet lemmatizer to normalise words
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

In [25]:
# Accumulators filled while walking over every pattern of every intent:
#   tokenized_words - all cleaned tokens (duplicates included at this stage)
#   categories      - intent tags in order of first appearance
#   directories     - (cleaned tokens, tag) pairs, one per pattern
tokenized_words = []
categories = []
directories = []

# Loop-invariant helpers, built once instead of once per pattern
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
stop_words = set(stopwords.words('english'))

for intent in data['intents']:
    for pattern in intent['patterns']:
        # Tokenization: split the sentence, strip punctuation characters,
        # then keep only purely alphabetic tokens (this also drops empty strings)
        tokens = tokenizer.tokenize(pattern)
        tokens = [re_punc.sub('', w) for w in tokens]
        tokens = [word for word in tokens if word.isalpha()]
        # Lemmatization (on lower-cased tokens)
        tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
        # Remove stop words
        # NOTE(review): filtering runs after lemmatization, so a stop word whose
        # lemma differs from its stop-list spelling may slip through - confirm intended.
        tokens = [w for w in tokens if w not in stop_words]

        tokenized_words.extend(tokens)
        # A pattern is recorded even when no tokens survive the cleaning steps
        directories.append((tokens, intent['tag']))

        # Tags are registered only for intents that have at least one pattern
        if intent['tag'] not in categories:
            categories.append(intent['tag'])

In [26]:
# De-duplicate and order alphabetically so that every word and every tag
# gets a stable position (these positions are used as vector indices later)
tokenized_words = sorted(set(tokenized_words))
categories = sorted(set(categories))

In [27]:
# Corpus summary: number of (tokens, tag) pairs, distinct tags, and vocabulary size
n_directories = len(directories)
n_categories = len(categories)
n_words = len(tokenized_words)

print(n_directories, "directories")
print(n_categories, "categories", categories)
print(n_words, "tokenized words", tokenized_words)

143 directories
53 categories ['bot_profile_age', 'bot_profile_info', 'bot_profile_interest', 'bot_profile_location', 'bot_profile_name', 'goodbye', 'greetings', 'group_1', 'group_10', 'group_11', 'group_12', 'group_13', 'group_14', 'group_15', 'group_16', 'group_17', 'group_18', 'group_19', 'group_2', 'group_20', 'group_21', 'group_22', 'group_23', 'group_24', 'group_25', 'group_26', 'group_27', 'group_28', 'group_29', 'group_3', 'group_30', 'group_31', 'group_32', 'group_33', 'group_34', 'group_35', 'group_36', 'group_37', 'group_38', 'group_39', 'group_4', 'group_40', 'group_41', 'group_42', 'group_43', 'group_44', 'group_45', 'group_5', 'group_6', 'group_7', 'group_8', 'group_9', 'thanks']
162 tokenized words ['abandonment', 'affect', 'affected', 'afraid', 'age', 'ago', 'along', 'always', 'anxiety', 'anxious', 'anymore', 'anyone', 'anything', 'ask', 'away', 'awesome', 'back', 'bad', 'better', 'biological', 'break', 'brother', 'bullying', 'bye', 'ca', 'call', 'care', 'case', 'cause'

In [28]:
# To create the training dataset #

In [29]:
# Encoding (bag of words)

In [31]:
# Fixed seed for the shuffle below, so the sample order (and therefore the
# train/test split taken from it) is identical on every run of the notebook
SHUFFLE_SEED = 42

dataset = []
# Template for the one-hot label vector: one slot per category, all zeros
output_empty = [0] * len(categories)
# Position of every tag, computed once instead of calling list.index() per sample
category_index = {tag: idx for idx, tag in enumerate(categories)}

# Build one [bag-of-words, one-hot label] pair per (tokens, tag) entry
for directory in directories:
    # Tokens of the current pattern; a set keeps each membership test constant-time
    pattern_words = set(directory[0])
    # Bag of words: 1 where the vocabulary word occurs in this pattern, 0 otherwise
    bag = [1 if word in pattern_words else 0 for word in tokenized_words]

    # Output is a '0' for each tag and '1' for the current tag.
    # Copy the template so every sample owns its own label list.
    output_row = list(output_empty)
    output_row[category_index[directory[1]]] = 1

    dataset.append([bag, output_row])

# Randomize the sample order with a dedicated seeded generator
# (leaves the global `random` state untouched)
random.Random(SHUFFLE_SEED).shuffle(dataset)
# dtype='object' because the two columns hold Python lists (bag and label vector)
dataset = np.array(dataset, dtype = 'object')

In [32]:
# Splitting

In [37]:
# Separate the feature vectors (column 0) from the one-hot labels (column 1)
x, y = (list(dataset[:, column]) for column in (0, 1))
# Show the first sample as a quick visual check
print("X 'pattern: 0': ", x[0])
print("Y 'intent: 0': ", y[0])

X 'pattern: 0':  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Y 'intent: 0':  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [38]:
# Number of (already shuffled) samples held out for testing; the rest is used for training
TEST_SIZE = 30
# Fail with a clear message instead of silently producing an empty training set
assert len(x) > TEST_SIZE, "dataset too small for the requested test split"

x_train, y_train = x[TEST_SIZE:], y[TEST_SIZE:]
x_test, y_test = x[:TEST_SIZE], y[:TEST_SIZE]
# To make sure the dataset is split correctly:
# first the sample counts per split, then the feature / label vector lengths
print(len(x_train), len(y_train), len(x_test), len(y_test))
print(len(x_train[0]), len(y_train[0]), len(x_test[0]), len(y_test[0]))

113 113 30 30
162 53 162 53


In [None]:
## Phase III - Training ##

In [None]:
## Phase IV - Testing ##

In [None]:
## Phase V - Saving the Final Model ##