In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt

import keras.backend as K

def f1_metric(y_true, y_pred):
    """Return an F1 score computed from the given label and prediction tensors.

    Each tensor is clipped to [0, 1] and rounded before being summed, so the
    sums act as counts of positives. Precision and recall are derived from
    those counts and combined as 2*P*R / (P + R). ``K.epsilon()`` is added to
    every denominator to avoid division by zero.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        A scalar backend tensor holding the F1 value.
    """
    def _positive_count(tensor):
        # Clip to [0, 1], round to the nearest integer, then total the result.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()

    hits = _positive_count(y_true * y_pred)
    actual = _positive_count(y_true)
    predicted = _positive_count(y_pred)

    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2*(precision*recall)/(precision+recall+eps)

In [None]:
# Load the training tweets; the first CSV column is used as the DataFrame index.
trainDataSet = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv', index_col=0)

# Shuffle all rows. A fixed seed keeps the row order identical across
# "Restart & Run All", so any positional train/validation split made from this
# frame selects the same rows every time.
trainDataSet = trainDataSet.sample(frac=1, random_state=42)

# Plain Python lists of the tweet texts and their 0/1 labels, in shuffled order.
allTweetText = trainDataSet['text'].tolist()
allTweetLabels = trainDataSet['target'].tolist()

In [None]:
# Tokenizer / padding configuration.
vocabularySize = 22701  # passed as Tokenizer(num_words=...) and as the embedding input_dim
sentenceSize = 50       # fixed length every encoded tweet is padded/truncated to

# Build the word index from the training tweets; unknown words map to '<OOV>'.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocabularySize)
tokenizer.fit_on_texts(allTweetText)

# Encode each tweet as a list of word ids, then pad/truncate at the front
# ('pre') so that every row has exactly `sentenceSize` entries.
trainSequence = tokenizer.texts_to_sequences(allTweetText)
trainSequence_padded = np.array(
    pad_sequences(
        trainSequence,
        maxlen=sentenceSize,
        padding='pre',
        truncating='pre',
    )
)

# Labels as a NumPy array, aligned row-for-row with the padded sequences.
trainTweetLabels = np.array(allTweetLabels)

# Show the first encoded tweet as a quick sanity check.
print(trainSequence_padded[0])

In [None]:
# Length of the longest encoded tweet, then the size of the tokenizer's word index.
longest_sequence = max(map(len, trainSequence))
print(longest_sequence)
print(len(tokenizer.word_index))


In [None]:
# Small classifier: embed each token, average the embeddings over the sequence,
# pass the result through one hidden ReLU layer and output a single sigmoid score.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocabularySize, input_length=sentenceSize, output_dim=16),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

# The network used to be bound only to `model_seq`, while compile/summary below
# (and the later fit/predict calls) refer to `model`, which raised NameError on a
# fresh kernel. Both names now point at the same object.
model_seq = model

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    metrics=['accuracy', f1_metric],
)
model.summary()

In [None]:
# Train for a fixed number of epochs, holding out 30% of the data for
# validation via `validation_split`; verbose=2 prints one line per epoch.
epochCount = 100
fit_options = dict(
    batch_size=32,
    epochs=epochCount,
    validation_split=0.3,
    shuffle=True,
    verbose=2,
)
history = model.fit(trainSequence_padded, trainTweetLabels, **fit_options)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: object whose ``history`` attribute is a mapping from metric
            name to a sequence of per-epoch values (as returned by ``fit``).
        string: name of the metric to plot; the validation series is looked
            up under ``'val_' + string``.
    """
    validation_key = 'val_' + string
    # Training curve first, validation curve second, matching the legend order.
    for series_key in (string, validation_key):
        plt.plot(history.history[series_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    plt.show()
    
# One figure per tracked metric, in the same order as before.
for metric_name in ("accuracy", "loss", "f1_metric"):
    plot_graphs(history, metric_name)

In [None]:
# Load the competition test set; the first CSV column is used as the DataFrame index.
testDataSet = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv', index_col=0)

print(testDataSet.head())
print(testDataSet.columns)

testDataSetText = list(testDataSet['text'])
print(testDataSetText[0])

# Encode the test tweets with the tokenizer exactly as it was fitted on the
# training text. Do NOT call fit_on_texts again here: re-fitting rebuilds
# word_index from the combined word counts, so word ids would no longer match
# the ids the embedding layer was trained on. Unseen words map to '<OOV>'.
testSeq = tokenizer.texts_to_sequences(testDataSetText)
testSeq_padding = pad_sequences(testSeq, padding='pre', truncating='pre', maxlen=sentenceSize)

# Round the sigmoid scores to the nearest class label (0.0 or 1.0).
x = np.round(model.predict(testSeq_padding))

print(x)
    



In [None]:
import os
from IPython.display import FileLink

# `x` holds the rounded model outputs. Flatten to one value per row and store
# them as integers so the column contains 0/1 labels rather than 0.0/1.0.
testDataSet['target'] = np.asarray(x).ravel().astype(int)
print(testDataSet.head())

# Write only the index (the first CSV column of test.csv) and the predicted
# target; the remaining test columns are not part of the submission.
testDataSet[['target']].to_csv(r'df_name.csv')

# Render a download link for the written file as the cell output.
FileLink(r'df_name.csv')