In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# The first input file holds interactions with a chatbot for mental health help.
SHEET_1_PATH = '../input/deepnlp/Sheet_1.csv'

df = pd.read_csv(SHEET_1_PATH)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.iloc[:5]

In [None]:
# Fraction of missing values in each column.
df.isnull().mean()

In [None]:
# Rows where the 'Unnamed: 3' column actually holds a value.
df[df['Unnamed: 3'].notna()]

In [None]:
# Some response text appears to have spilled over into the unnamed columns.
# Print the 'Unnamed: 5' value of every row whose 'Unnamed: 7' cell is populated.
# NaN is truthy in Python, so a bare `if un7:` would match every row; the
# missing-value check therefore has to be explicit.
for un5, un7 in zip(df['Unnamed: 5'].values, df['Unnamed: 7'].values):
    if pd.notna(un7) and un7:
        print(un5)

In [None]:
# Manual inspection showed a single affected row (position 55);
# show the response text stored in it.
df['response_text'].iloc[55]

In [None]:
# The fragment stored in 'Unnamed: 3' for the row at position 55.
df['Unnamed: 3'].iloc[55]

In [None]:
# Append the fragment held in 'Unnamed: 3' to the response text of the row at
# position 55. The write goes through a single positional .iloc call: chained
# indexing (df.iloc[55]['response_text'] = ...) assigns into a temporary copy of
# the row, triggers SettingWithCopyWarning and can leave df itself unchanged.
# NOTE: not idempotent - running this cell again appends the fragment again.
row_pos = 55
text_col = df.columns.get_loc('response_text')
extra_col = df.columns.get_loc('Unnamed: 3')
df.iloc[row_pos, text_col] = df.iloc[row_pos, text_col] + df.iloc[row_pos, extra_col]

In [None]:
# Collect every column whose name contains 'Unn' (the unnamed columns)
# and remove them from the frame in place.
drop_cols = list(filter(lambda name: 'Unn' in name, df.columns))
df.drop(drop_cols, axis='columns', inplace=True)

In [None]:
# Preview the first five rows after dropping the unnamed columns.
df.iloc[:5]

In [None]:
# Count the rows per class label. The classes are imbalanced, so the
# modeling steps have to take that into account.
df.loc[:, 'class'].value_counts()

In [None]:
# Encode the string class labels as integers.
# Series.map turns every value that is not a key of the mapping into NaN, so
# mapping labels that are already encoded (e.g. when the cell is run twice)
# would wipe the column. The mapping is therefore applied only when the column
# still holds something other than the encoded ids.
label_to_id = {
    'not_flagged':0,
    'flagged':1
}
already_encoded = df['class'].dropna().isin(list(label_to_id.values())).all()
if not already_encoded:
    df['class'] = df['class'].map(label_to_id)

In [None]:
# Display the frame to check the 'class' column after the label mapping.
df

In [None]:
# Next step: prepare the text for modeling. Start by looking at the raw responses.
df.loc[:, 'response_text'].values

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenizer settings: no vocabulary cap (num_words=None), the characters in
# `filters` are stripped, text is lower-cased and split on spaces, tokens are
# words rather than characters, and no out-of-vocabulary token is used.
tokenizer_options = dict(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
    document_count=0,
)
tokenizer = Tokenizer(**tokenizer_options)



In [None]:
# Build the tokenizer vocabulary from every response text.
response_texts = df['response_text'].values
tokenizer.fit_on_texts(response_texts)

In [None]:
# Turn each response into its sequence of integer word indices, then pad the
# sequences to a common length so they stack into one 2-D array.
# texts_to_sequences is required here: texts_to_matrix returns a bag-of-words
# matrix (one column per vocabulary word) instead of word-index sequences, and
# an Embedding layer needs the indices.
tokenized_data = tokenizer.texts_to_sequences(df['response_text'].values)

padded_data = pad_sequences(tokenized_data)

In [None]:
# Shape of the padded input array.
np.shape(padded_data)

In [None]:
# Number of rows needed in the embedding table. The tokenizer's word indices
# start at 1 (index 0 is reserved and never assigned to a word), so the table
# needs one row more than the number of known words.
vocab_length = len(tokenizer.word_index) + 1

In [None]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} dict.
# Each line is split on whitespace: the first field is the word, the remaining
# fields are its coefficients.
# The file is opened inside a `with` block so the handle is closed even if
# parsing fails, and with an explicit UTF-8 encoding so decoding does not
# depend on the platform's default locale.
embeddings_index = {}
glove_path = os.path.join('/kaggle/input/glove100d/', 'glove.6B.100d.txt')
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Prepare the initialization weights for the embedding layer: row i receives the
# GloVe vector of the word whose tokenizer index is i. Rows for words without a
# GloVe vector stay all-zero.

embedded_weights = np.zeros((vocab_length, 100)) #100 is the embedding dimension because we picked the 100d Glove file

for word, i in tokenizer.word_index.items():
    # Word indices are 1-based, so the largest index can equal the number of
    # words; skip any index that has no row instead of raising IndexError.
    if i >= embedded_weights.shape[0]:
        continue

    word_vector = embeddings_index.get(word)

    if word_vector is not None:
        embedded_weights[i] = word_vector

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Start from an empty sequential (layer-by-layer) model.
model = keras.models.Sequential()

In [None]:
# Network: embedding lookup -> average over the sequence -> small dense head.
# The model is created from scratch in this cell so that running the cell again
# does not stack duplicate layers onto an existing model.
model = keras.Sequential([
    # Declaring the input builds the layers right away, so model.summary() and
    # set_weights() on the embedding layer work before the first fit call.
    keras.Input(shape=(None,), dtype='int32'),
    layers.Embedding(vocab_length, 100),
    # Collapse the sequence axis into one vector per sample; without this the
    # Dense layers would emit one prediction per token instead of per response.
    layers.GlobalAveragePooling1D(),
    layers.Dense(32,
                 activation = 'relu'
                ),
    layers.Dense(1,
                 activation = 'sigmoid'
                ),
])

In [None]:
# Print the layer-by-layer summary of the model as defined so far.
model.summary()

In [None]:
# Load the prepared weight matrix into the first layer of the model (the
# embedding layer) and mark that layer as non-trainable.

embedding_layer = model.layers[0]
embedding_layer.set_weights([embedded_weights])
embedding_layer.trainable = False

In [None]:
# Print the summary again after the embedding layer was set to non-trainable.
model.summary()

In [None]:
# Compile with the RMSprop optimizer and binary cross-entropy loss; accuracy is
# requested under the name 'acc'.
compile_options = {
    'optimizer': 'rmsprop',
    'loss': 'binary_crossentropy',
    'metrics': ['acc'],
}
model.compile(**compile_options)

In [None]:
# Class labels as a plain array, used as the prediction target.
target = df.loc[:, 'class'].values

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Three stratified folds, shuffled with a fixed seed.
skfold = StratifiedKFold(3, shuffle=True, random_state=31415)

In [None]:
# Materialize all fold splits and keep the last one.
# NOTE(review): only the final fold's indices are used downstream, so this is
# effectively a single train/test split rather than a cross-validation.
fold_indices = list(skfold.split(padded_data, target))
train_ix, test_ix = fold_indices[-1]

train_data = padded_data[train_ix]
test_data = padded_data[test_ix]
train_target = target[train_ix]
test_target = target[test_ix]

In [None]:
# Shape of the training inputs.
np.shape(train_data)

In [None]:
# Shape of the training labels.
np.shape(train_target)

In [None]:
# Train for 10 epochs in batches of 32, passing the held-out split as
# validation data.
fit_options = dict(
    epochs=10,
    batch_size=32,
    validation_data=(test_data, test_target),
)
history = model.fit(train_data, train_target, **fit_options)

In [None]:
import matplotlib.pyplot as plt

# Pull the per-epoch metrics recorded by model.fit.
recorded = history.history
acc, val_acc = recorded['acc'], recorded['val_acc']
loss, val_loss = recorded['loss'], recorded['val_loss']
epochs = range(1, len(acc) + 1)

# One figure per metric: training values as dots ('bo'), validation values as a
# line ('b').
curves = (
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
)
for position, (train_values, val_values, train_label, val_label, title) in enumerate(curves):
    if position > 0:
        plt.figure()  # every metric after the first gets its own new figure
    plt.plot(epochs, train_values, 'bo', label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title(title)
    plt.legend()
plt.show()