## Tweet Prediction using Pre-Trained GloVe Embeddings

### Import the necessary libraries

In [None]:
import pandas as pd
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

### Read/View the train dataset

In [None]:
# Load the training CSV into a DataFrame; the trailing expression previews
# its first rows in the notebook output.
train_csv_path = '/kaggle/input/nlp-getting-started/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

### Split the given dataset into training and held-out validation ("testing") sets (80:20 ratio)

In [None]:
# Positional split in file order (no shuffling): the leading 80% of rows is
# used for fitting, the trailing 20% is held out.
split_ratio = 0.8
sentences = train.text.values
labels = train.target.values
training_size = int(len(sentences) * split_ratio)
training_sentences, testing_sentences = (
    sentences[:training_size],
    sentences[training_size:],
)
training_labels, testing_labels = (
    labels[:training_size],
    labels[training_size:],
)


### Define the tokenizer

In [None]:
# every encoded sentence is padded (at the end) to this many tokens
max_length = 125

# The vocabulary is learned from the training sentences only.
t = Tokenizer()
t.fit_on_texts(training_sentences)
# one extra slot on top of the learned word indices
# (presumably index 0, which Keras reserves for padding -- see Tokenizer docs)
vocab_size = 1 + len(t.word_index)

# integer encode both splits with the same fitted tokenizer
train_encoded_sentences, test_encoded_sentences = (
    t.texts_to_sequences(batch)
    for batch in (training_sentences, testing_sentences)
)

# pad both splits to max_length
training_padded, testing_padded = (
    pad_sequences(encoded, maxlen=max_length, padding='post')
    for encoded in (train_encoded_sentences, test_encoded_sentences)
)


### Load the whole GloVe embedding into memory and build the embedding matrix

In [None]:
# number of coefficients per word; used as the embedding matrix width
glove_size = 300

# Parse the GloVe text file into a {word: vector} lookup. Each line is split on
# whitespace: the first field is the word, the rest are its coefficients.
embeddings_index = {}
# `with` closes the handle even if a line fails to parse, and the explicit
# encoding keeps decoding independent of the platform's default locale.
with open('/kaggle/input/glove6b/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training sentences:
# row i holds the GloVe vector of the word the tokenizer mapped to index i.
# Words without a GloVe entry keep their all-zero row.
embedding_matrix = zeros((vocab_size, glove_size))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Define the Model

In [None]:
# define model: frozen GloVe embedding -> flatten -> single sigmoid unit
model = Sequential()
# We do not want to update the pre-trained word weights, so the Embedding
# layer (not the whole model) is created with trainable=False; the Dense
# layer on top is added with its default settings.
embed = Embedding(vocab_size, glove_size, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(embed)
# Flatten the per-token embedding output into one vector per sentence.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
model.summary()

### Train the model

In [None]:
# Train for a fixed number of epochs, passing the held-out split as
# validation data; the returned History object is kept in `history`.
num_epochs = 5
history = model.fit(
    training_padded,
    training_labels,
    validation_data=(testing_padded, testing_labels),
    epochs=num_epochs,
    verbose=2,
)


### Predict on the test data and write the submission file

In [None]:
# Encode the competition test set with the tokenizer fitted on the training
# sentences, predict, and write the submission file.
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test_sentences = test.text.values
test_sequences = t.texts_to_sequences(test_sentences)
# Same padding arguments as the training/validation pad_sequences calls, so
# inference-time preprocessing matches what the model was fitted on
# (truncation side is left at the library default, as it is there).
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Flatten the model output to one value per row, then threshold it:
# anything below 0.5 becomes class 0, everything else class 1.
lst = [0 if score < 0.5 else 1 for score in model.predict(test_padded).ravel()]

# The sample submission supplies the output layout; predictions are assigned
# positionally. NOTE(review): assumes its rows are in the same order as
# test.csv -- confirm.
sample_submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
sample_submission["target"] = lst
sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission DataFrame as the cell's output for a visual check.
sample_submission