In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re
import string
from sklearn.model_selection import train_test_split
import nltk
import numpy as np 
import pandas as pd 
import os
# Walk /kaggle/input and print the full path of every file found there.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the training CSV into a DataFrame; later cells read its 'text' and 'target' columns.
train=pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

In [None]:
from nltk.corpus import stopwords

# English stop words, stored as a set: `stem` tests every token for membership,
# which is a constant-time lookup on a set but a linear scan on the list that
# `stopwords.words` returns.
stop_words = set(stopwords.words('english'))
def stem(content, stopword_set=None):
    """Clean one text: drop URLs and non-letters, lowercase, remove stop words.

    Despite its name this function performs no stemming; it only normalises
    the text before tokenisation.

    Args:
        content: The raw text (a ``str``).
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words``.

    Returns:
        The remaining lowercase, letters-only words joined by single spaces
        (an empty string when nothing is left).
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Remove each URL as a whole token. Stripping only the "http" prefix would
    # leave the host and path behind as junk words (e.g. "co" and link ids).
    without_urls = re.sub(r"\b(?:http\S*|www\.\S+)", " ", content, flags=re.IGNORECASE)
    # Everything that is not an ASCII letter becomes a separator; this already
    # covers digits, punctuation and other non-word characters.
    letters_only = re.sub(r"[^a-zA-Z]", " ", without_urls)
    tokens = letters_only.lower().split()
    return " ".join(token for token in tokens if token not in stopword_set)

In [None]:
# Replace every text with its cleaned form from `stem`. This overwrites the
# 'text' column, so the raw text is only available again after reloading the CSV.
train['text']= train['text'].apply(stem)

In [None]:
# Number of whitespace-separated words left in each cleaned text.
train['words'] = train['text'].str.split().str.len()

In [None]:
# Summary statistics of the per-text word counts.
train['words'].describe()

In [None]:
# Upper word-count threshold, then the per-column non-null counts of the rows
# whose word count is at or below it.
max_length_value = 15
within_max_length = train['words'].le(max_length_value)
train.loc[within_max_length].count()

In [None]:
# Per-column non-null counts of the rows whose word count exceeds max_length_value.
train[train['words']>max_length_value].count()

In [None]:
# Lower word-count threshold, then the per-column non-null counts of the rows
# whose word count is at or below it.
min_length_value = 1
at_or_below_min = train['words'].le(min_length_value)
train.loc[at_or_below_min].count()

In [None]:
# Keep only the texts that still have more than min_length_value words.
train = train.loc[train['words'].gt(min_length_value)]

In [None]:
# Per-column non-null counts of the rows with exactly min_length_value words.
train[train['words']==min_length_value].count()

In [None]:
from collections import Counter


def count_word(text):
    """Count how often each whitespace-separated word occurs.

    Args:
        text: A pandas Series of strings (any object whose ``.values``
            yields ``str`` items works).

    Returns:
        A new ``collections.Counter`` mapping each word to its number of
        occurrences across all texts.
    """
    # Use a fresh Counter per call so repeated calls never accumulate into
    # shared module-level state and double-count words.
    word_counts = Counter()
    for sentence in text.values:
        word_counts.update(sentence.split())
    return word_counts


counter = count_word(train.text)
# Keep the module-level name `count` bound to the same Counter object for any
# code that still reads it.
count = counter

In [None]:
# Number of distinct words counted in the cleaned training texts.
len(counter)

In [None]:
# The five most frequent words together with their counts.
counter.most_common(5)

In [None]:
# Stratified 80/20 split of the cleaned texts and their labels, with a fixed seed.
features = train['text']
targets = train['target']
split_options = dict(
    train_size=0.8,
    random_state=42,
    shuffle=True,
    stratify=targets,
)
(all_train_features, val_features,
 all_train_targets, val_targets) = train_test_split(features, targets, **split_options)

In [None]:
# Wrap each split as a tf.data.Dataset of (text, label) pairs.
train_pairs = (all_train_features.to_numpy(), all_train_targets.to_numpy())
val_pairs = (val_features.to_numpy(), val_targets.to_numpy())
train_data = tf.data.Dataset.from_tensor_slices(train_pairs)
val_data = tf.data.Dataset.from_tensor_slices(val_pairs)

In [None]:
# Show the type specification of one (text, label) element of the training dataset.
train_data.element_spec

In [None]:
# Show the type specification of one (text, label) element of the validation dataset.
val_data.element_spec

In [None]:
# Peek at the first three (text, label) pairs of the not-yet-batched training dataset.
for text, label in train_data.take(3):
    text_value = text.numpy()
    print('Text: ', text_value)
    label_value = label.numpy()
    print('Label: ', label_value)

In [None]:
# Size of the shuffle buffer passed to Dataset.shuffle for the training data.
BUFFER_SIZE = 10000
# Number of examples per batch for the training, validation and test datasets.
BATCH_SIZE = 64

In [None]:
# Training examples are shuffled before batching; validation keeps its order.
# Both pipelines prefetch so input preparation overlaps with model execution.
train_data = train_data.shuffle(BUFFER_SIZE)
train_data = train_data.batch(BATCH_SIZE)
train_data = train_data.prefetch(tf.data.AUTOTUNE)

val_data = val_data.batch(BATCH_SIZE)
val_data = val_data.prefetch(tf.data.AUTOTUNE)

In [None]:
# Pull one training batch and show its first three texts and labels.
# `example` stays bound to this batch; later cells reuse it.
for example, label in train_data.take(1):
    first_texts = example.numpy()[:3]
    first_labels = label.numpy()[:3]
    print('texts: ', first_texts)
    print()
    print('labels: ', first_labels)

In [None]:
# Vocabulary budget and fixed output sequence length for the text vectoriser.
vocab_size = 20000
max_len = 15

# Options for the layer that maps raw strings to fixed-length integer id sequences.
# NOTE(review): the "+ 2" presumably reserves slots for padding/OOV — confirm.
vectorizer_options = {
    "max_tokens": vocab_size + 2,
    "split": "whitespace",
    "output_mode": "int",  # tf-idf / int / binary / count
    "output_sequence_length": max_len,
}
vectorize_layer = TextVectorization(**vectorizer_options)

In [None]:
# Learn the vocabulary from the training texts only.
# adapt() resets the layer's state every time it is called, so adapting again
# on the validation texts would discard the training vocabulary and leave one
# built from validation data alone. Validation text must not shape the
# vocabulary in any case.
vectorize_layer.adapt(train_data.map(lambda text, label: text))

In [None]:
# Learned vocabulary as a NumPy array, so arrays of token ids can index into it.
vocab = np.array(vectorize_layer.get_vocabulary())
# First 20 vocabulary entries.
vocab[:20]

In [None]:
# Token ids for the first three texts of the sample batch.
all_encoded = vectorize_layer(example)
encoded_example = all_encoded[:3].numpy()
encoded_example

In [None]:
# Show the first three texts next to the words their token ids map back to.
for n in range(3):
    original_text = example[n].numpy()
    decoded_words = vocab[encoded_example[n]]
    print("Original: ", original_text)
    print("Round-trip: ", " ".join(decoded_words))
    print()

In [None]:
# Same values the text vectoriser was configured with.
vocab_size = 20000
max_len = 15

# Binary text classifier: raw strings -> token ids -> embeddings -> stacked
# recurrent/dense layers -> a single sigmoid output.
model = tf.keras.Sequential([
    vectorize_layer,
    # The vectoriser is created with max_tokens=vocab_size + 2, so it can emit
    # ids 0 .. vocab_size + 1. input_dim must cover every one of them; with
    # vocab_size + 1 the highest id falls outside the embedding table once the
    # vocabulary is full.
    # NOTE(review): output_dim reuses max_len (15) as the embedding width even
    # though the two quantities are unrelated — confirm this is intended.
    tf.keras.layers.Embedding(input_dim=vocab_size + 2, output_dim=max_len, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, activation='tanh', return_sequences=True)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # No return_sequences here: this layer collapses the sequence to one vector.
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# summary() prints the layer table itself and returns None, so it is called
# directly; printing its return value would only add a stray "None" line.
model.summary()

In [None]:
# Print, for each layer of the model, its supports_masking flag.
print([layer.supports_masking for layer in model.layers])

In [None]:
# Render a diagram of the model with tensor shapes shown.
tf.keras.utils.plot_model(model,show_shapes=True)

In [None]:
# Train on the batched training dataset and evaluate on the validation dataset
# after every epoch.
# - batch_size is not passed: train_data and val_data are already batched
#   tf.data.Datasets, and Keras documents that batch_size must not be given
#   for dataset inputs.
# - validation_steps is not passed: a fixed step count either evaluates only
#   part of the validation set or, when the set has fewer batches than the
#   count, runs out of data. Without it Keras evaluates the whole dataset.
histor = model.fit(
    train_data,
    epochs=47,
    validation_data=val_data,
)

In [None]:
# Per-epoch metric values recorded during training, keyed by metric name.
history_dict = histor.history
# Show which metrics were recorded.
history_dict.keys()

In [None]:
# Per-epoch training metrics; `acc` and `epochs` are reused by the accuracy plot.
acc = history_dict['accuracy']
loss = history_dict['loss']
epochs = range(1, len(acc) + 1)

# Training loss per epoch, drawn as one '*' marker per epoch.
fig, ax = plt.subplots()
ax.plot(epochs, loss, '*', label='Training loss')
ax.set_title('Training  loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()

plt.show()

In [None]:
# Training accuracy per epoch as a solid blue line.
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'b')
ax.set_title('Training  acc')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy');

In [None]:
# Load the test CSV and clean its 'text' column with the same `stem` function
# used for the training texts.
test = (
    pd.read_csv("../input/nlp-getting-started/test.csv")
    .assign(text=lambda frame: frame['text'].apply(stem))
)

In [None]:
# Cleaned test texts as a Series, then as a NumPy array.
test_data = test['text']
test_dat_array = test_data.to_numpy()

In [None]:
# Rebind the name to a tf.data.Dataset that yields one test text per element.
test_dat_array = tf.data.Dataset.from_tensor_slices(test_dat_array)

In [None]:
# Show the first two test texts next to their token ids.
for text in test_dat_array.take(2):
    raw_text = text.numpy()
    print('Text: ', raw_text)
    vector = vectorize_layer(text).numpy()
    print("vectorized text= ", vector)

In [None]:
# Batch the test texts with BATCH_SIZE and prefetch, as done for the training data.
test_dat_array = test_dat_array.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# Run the model on the batched test texts. The model's first layer is the text
# vectoriser, so raw strings are passed in directly.
prediction = model.predict(test_dat_array)

In [None]:
# Turn each predicted score into a hard label: 1 when the score is at least 0.5, else 0.
prediction_target = [1 if pred >= 0.5 else 0 for pred in prediction]

In [None]:
# Load the sample submission file as the template for the output, then display it.
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
submission

In [None]:
# Overwrite the template's 'target' column with the predicted labels, then display it.
# assumes the sample submission rows are in the same order as test.csv — TODO confirm
submission['target'] = prediction_target
submission

In [None]:
# How many test rows were assigned to each predicted label.
submission['target'].value_counts()

In [None]:
# Distinct predicted labels, as a list.
target_value= list(submission['target'].unique())

In [None]:
# Sort the distinct labels in place, ascending.
target_value.sort()

In [None]:
# Count predictions per label and order the rows by label, so every bar's
# height belongs to the label printed under it. (value_counts() alone orders
# by frequency, which need not match a label-sorted x axis.)
target_counts = (
    submission['target']
    .value_counts()
    .sort_index()
    .rename_axis('target')
    .reset_index(name='count')
)
# String labels make plotly treat the label as a discrete category, so the
# colour sequence below is applied one colour per label.
target_counts['target'] = target_counts['target'].astype(str)

# NOTE(review): the title looks like a leftover from an example — confirm the intended wording.
fig = px.bar(
    target_counts,
    x='target',
    y='count',
    color='target',
    color_discrete_sequence=['blue', 'red'],
    title="Long-Form Input",
)
fig.show()

In [None]:
# Write the submission to submission.csv without the DataFrame index column.
submission.to_csv("submission.csv",index=False)