## Text Classification

- Taking the text into a list
- Tokenize, Sequence & Padded
- Build model 
- Feed the padded sentence and label
- Download The Vectorized Embedded File and See Difference
- Predict A Sentence

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

###### Taking The text into a list

In [None]:
# TODO: bind the two dataset splits here. The original assignment had an empty
# right-hand side (a syntax error), so the data source is unknown from this
# notebook. Each split must be an iterable of (sentence, label) pairs --
# presumably the train/test splits of a supervised TensorFlow dataset; confirm.
train_data, test_data = [], []


def unpack_examples(dataset):
    """Split an iterable of (sentence, label) pairs into two parallel lists.

    Values exposing ``.numpy()`` (eager TensorFlow tensors) are converted with
    it; byte-string sentences are decoded as UTF-8. Plain Python values are
    passed through unchanged.

    Returns:
        (sentences, labels): two lists of equal length.
    """
    sentences, labels = [], []
    for s, l in dataset:
        # Eager tensors expose .numpy(); there is no .tonumpy() method.
        text = s.numpy() if hasattr(s, 'numpy') else s
        if isinstance(text, bytes):
            text = text.decode('utf8')
        sentences.append(text)
        labels.append(l.numpy() if hasattr(l, 'numpy') else l)
    return sentences, labels


training_sentences, training_labels = unpack_examples(train_data)
testing_sentences, testing_labels = unpack_examples(test_data)

if not training_sentences or not testing_sentences:
    # Make the placeholder state visible instead of failing obscurely later on.
    print("WARNING: train_data/test_data are empty - assign the real dataset splits above.")

# Keras expects array-like labels rather than Python lists.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [None]:
# Preview only the first few sentences; echoing the whole list floods the cell output.
training_sentences[:5]

In [None]:
# Number of training sentences collected above.
len(training_sentences)

##### Tokenize, Sequence & Padded

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Preprocessing hyperparameters (also used when building the model below).
vocab_size = 10000      # passed to Tokenizer as num_words
embedding_dim = 16      # width of each learned word vector in the Embedding layer
max_length = 120        # every sequence is padded/truncated to this many tokens
trunc_type = 'post'     # over-long sequences lose tokens from the end
oov_tok = "<OOV>"       # token used for words the tokenizer does not know

# Fit the vocabulary on the training sentences only; the test sentences are
# encoded with that same vocabulary further down.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
# Truncate the test set exactly like the training set. Without `truncating`,
# pad_sequences uses its default ('pre'), so long test texts would keep their
# end while long training texts keep their beginning.
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

##### Reversing the padded sentence

In [None]:
# Invert the tokenizer's word -> index mapping so indices can be mapped back to words.
reverse_word_index = {index: word for word, index in word_index.items()}


def decode_review(text):
    """Return the space-joined words for a sequence of token indices.

    Indices that are missing from ``reverse_word_index`` are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)


# Compare the decoded form of one padded sequence with its original sentence.
print(decode_review(padded[3]))
print(training_sentences[3])

##### Build model

In [None]:
# Binary classifier: embedding -> flatten -> small dense layer -> sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
num_epochs = 10

# Train on the padded training sequences; the padded test split is passed as
# validation data so its loss/accuracy is reported alongside the training metrics.
model.fit(
    padded,
    training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

In [None]:
# The first layer of the model is the Embedding layer; its first weight array
# is the embedding matrix that gets exported below.
e = model.layers[0]
layer_weights = e.get_weights()
weights = layer_weights[0]
print(weights.shape)  # shape: (vocab_size, embedding_dim)

##### Embedding The Results of  Binary Classification

In [None]:
import io

# Export the learned embeddings as two parallel TSV files:
#   vecs.tsv - one row of tab-separated vector components per word
#   meta.tsv - the matching word on the same row number
#
# Index 0 is skipped (it has no entry in reverse_word_index). The upper bound
# is capped by the number of known words so that a corpus with fewer than
# vocab_size - 1 distinct words does not raise KeyError.
# NOTE(review): assumes word indices are contiguous starting at 1 - confirm.
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

# `with` guarantees both files are closed even if a write fails part-way.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_word_num):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

##### Downloading The Embedded File

In [None]:
# Offer the exported files for download when running on Google Colab;
# anywhere else the import fails and this cell does nothing.
try:
    from google.colab import files
except ImportError:
    pass
else:
    for tsv_name in ('vecs.tsv', 'meta.tsv'):
        files.download(tsv_name)

##### Predicting The Text Classification

In [None]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences([sentence])
# Pad the encoded sentence itself. The original padded `sequences` (the whole
# training set) and stored the result in `padded`, so it never scored this
# sentence. A separate name also leaves the training array `padded` intact,
# so the fit cell can be re-run.
sentence_padded = pad_sequences(sequence, maxlen=max_length, truncating=trunc_type)
print(model.predict(sentence_padded))

In [None]:
# Score several new sentences at once. Dedicated names are used so this cell
# does not overwrite the training arrays `sequences` / `padded`, which the
# fit cell still needs if it is re-run.
sample_sentences = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
sample_sequences = tokenizer.texts_to_sequences(sample_sentences)
sample_padded = pad_sequences(sample_sequences, maxlen=max_length, truncating=trunc_type)
print(model.predict(sample_padded))

## Sequence Text

- Split the text into lines  - corpus
- Tokenize, get the sequence in a list
- Make the sequence an array
- Split the last value into labels ys and others in xs
- Build the model and feed the data

#####  Split The Text Into Lines - Corpus

In [None]:
data = "Look the new me is really still the real me\nI swear you gotta feel me before they try and kill me\nThey gotta make some choices they running out of options\n'Cause I've been going off and they don't know when its stopping\nAnd we came to the top and I see that you've been learning\nAnd when I take you shopping you spend it like you earned it\nAnd when you popped off on your ex he deserved it\nI thought you were the one from the jump, that confirmed it\nTrapMoneyBenny\nI buy you champagne but you love some Henny\nFrom the block like you Jenny\nI know you special girl 'cause I know too many\n'Resha, do you love me\nAre you riding, say you'll never ever leave\nFrom beside me, 'cause I want ya, and I need ya\nAnd I'm down for you always\nJT, do you love me\nAre you riding say you'll never ever leave\nFrom beside me, 'cause I want ya, and I need ya\nAnd I'm down for you alwaysTwo bad bitches and we kissing in the wraith\nKiss-kissing in the wraith, kiss-kissing in the wraith\nI need that black card and a code to the safe\nCode to the safe, code-code to the safe\nI show 'em how to network, fuck that Netflix and chill\nWhat's your net-net-net worthCause I want ya, and I need ya\nI got a new boy, and that nigga trap!Kiki, do you love me\nAre you riding, say you'll never ever leave\nFrom beside me, 'cause I want you, and I need you\nAnd I'm down for you always\nKB, do you love me\nAre you riding say you'll never ever leave\nFrom beside me, 'cause I want you, and I-Bring that ass, bring that ass, bring that ass back\nB-bring that ass, bring that ass, bring that ass back\nShawty say the nigga that she with can't hit\nBut, shawty, I'ma hit it, hit it like I can't miss\nClap that ass, you're the only one I love, clap that ass\nBring that ass back\nYou're the only one I love, let's go, let's go, lets goTrap, TrapMoneyBenny\nThis shit got me in my feelings\nI just gotta be real with it, yupI don't even care,\nI'm just being real, my shit look..."

In [None]:
# Lower-case the whole text, then cut it at newline characters:
# each resulting line is one entry of the corpus.
lowercased = data.lower()
corpus = lowercased.split('\n')

In [None]:
# Display the list of lower-cased lines produced by the split above.
corpus

##### Tokenize, get the sequence in a list

In [None]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One more than the number of distinct words; used below as the number of
# output classes and as the Embedding input size.
total_words = len(tokenizer.word_index) + 1

# For every line, emit each prefix of its token list that has at least two
# tokens: [t0, t1], [t0, t1, t2], ... up to the full line.
input_sequences = [
    encoded_line[:prefix_len]
    for encoded_line in tokenizer.texts_to_sequences(corpus)
    for prefix_len in range(2, len(encoded_line) + 1)
]

##### Make the Sequence into an array

In [None]:
# Length of the longest n-gram prefix; shorter ones are left-padded up to it
# so that the last column always holds the final token of the prefix.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

##### Splits The Last Value as label ys and others as features xs

In [None]:
# Features: every column except the last. Label: the last column.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the label indices over total_words classes.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

##### Build the model and feed the data

In [None]:
# Next-word model: embedding -> two stacked bidirectional LSTMs -> softmax
# over total_words classes.
model = Sequential([
    Embedding(total_words, 64, input_length=max_sequence_len-1),
    # return_sequences=True so the second LSTM receives the full sequence.
    Bidirectional(LSTM(20, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(total_words, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(xs, ys, epochs=500, verbose=1)

##### Validate The Accuracy

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  """Plot one recorded training metric against the epoch number.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch values (here, the value returned by ``model.fit``).
    string: name of the metric to plot; also used as the y-axis label.
  """
  per_epoch_values = history.history[string]
  plt.plot(per_epoch_values)
  plt.ylabel(string)
  plt.xlabel("Epochs")
  plt.show()


plot_graphs(history, 'accuracy')

##### Predict a Sequence of Words

In [None]:
seed_text = "Should we start"
next_words = 50

# Grow the seed one word at a time: encode the current text, pad it to the
# model's input length, and append the most probable next word.
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Sequential.predict_classes is no longer available in current tf.keras;
    # taking the argmax of the softmax output gives the same class index.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # Direct index -> word lookup instead of scanning word_index every step.
    # An index with no word yields "", matching the original fallback.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)

######                                                                                                  End