In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test splits into DataFrames (train first).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

# Number of training rows per target value. The printed labels report
# target == 0 as "positive" and target == 1 as "negative".
rows_per_target = {label: len(train[train["target"] == label]) for label in (0, 1)}
print("Number of positive tweets ", rows_per_target[0])
print("Number of negative tweets ", rows_per_target[1])

In [None]:
import nltk
import string
import re
# Tokenizer, stemmer and stopword set are created on first use and then
# reused, so the NLTK stopword corpus is not re-read for every single tweet.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the shared (tokenizer, stemmer, stopwords) triple, building it once."""
    if not _PREPROCESS_RESOURCES:
        # Build everything in locals first so a failure (e.g. missing stopword
        # corpus) never leaves the cache half-populated.
        tokenizer = nltk.tokenize.TweetTokenizer(
            preserve_case=False, reduce_len=True, strip_handles=True
        )
        stemmer = nltk.stem.PorterStemmer()
        # A frozenset gives constant-time membership tests instead of a list scan.
        stopwords = frozenset(nltk.corpus.stopwords.words('english'))
        _PREPROCESS_RESOURCES.update(
            tokenizer=tokenizer, stemmer=stemmer, stopwords=stopwords
        )
    return (
        _PREPROCESS_RESOURCES["tokenizer"],
        _PREPROCESS_RESOURCES["stemmer"],
        _PREPROCESS_RESOURCES["stopwords"],
    )


def preprocess(tweet):
    """Clean, tokenize and stem a single tweet.

    Steps: strip a leading "RT" retweet marker, remove hyperlinks, delete '#'
    characters (the hashtag word itself stays), tokenize with NLTK's
    TweetTokenizer (configured with preserve_case=False, reduce_len=True,
    strip_handles=True), then drop English stopwords and punctuation tokens
    and Porter-stem whatever remains.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        list[str]: Stemmed tokens, in their original order.
    """
    tokenizer, stemmer, stopwords = _preprocess_resources()

    tweet = re.sub(r'^RT[\s]+', '', tweet)  # remove Retweet text
    # Remove only the URL itself. The earlier pattern (`https?://.*`) was
    # greedy and also deleted every word that followed the link on that line.
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'#', '', tweet)  # remove the hash sign, keep the tag text

    # NOTE: string.punctuation is a str, so `in` is a substring test; it drops
    # single punctuation marks and any token that is a contiguous slice of it.
    return [
        stemmer.stem(token)
        for token in tokenizer.tokenize(tweet)
        if token not in stopwords and token not in string.punctuation
    ]

In [None]:
# Sanity check: show one raw tweet next to its preprocessed tokens.
sample_tweet = train["text"][10]
print("Original Tweet: ", sample_tweet)
print("Processed Tweet: ", preprocess(sample_tweet))

In [None]:
# Vocabulary construction for the model
def vocab_builder(tweets):
    """Map every distinct preprocessed token in `tweets` to an integer ID.

    IDs 0, 1 and 2 are reserved for '__PAD__', '__EOL__' and '__UNK__'. Each
    previously unseen token receives the next free ID, in order of first
    appearance.

    Args:
        tweets: Iterable of raw tweet strings; each is run through `preprocess`.

    Returns:
        dict: token -> integer ID.
    """
    token_to_id = {'__PAD__': 0, '__EOL__': 1, '__UNK__': 2}
    for text in tweets:
        for token in preprocess(text):
            # setdefault leaves tokens that already have an ID untouched.
            token_to_id.setdefault(token, len(token_to_id))
    return token_to_id

In [None]:
# Build the vocabulary from the training tweets.
Vocabulary = vocab_builder(train["text"])
print("Total number of words in the vocabulary: ",len(Vocabulary))
# Show only the first few (token, ID) pairs; printing the whole dictionary
# floods the cell output with every entry in the vocabulary.
print(list(Vocabulary.items())[:20])

In [None]:
# Turn a tweet into the list of vocabulary IDs that is fed to the network
def tensor_builder(tweet,vocab,unk_token):
    """Encode one tweet as a list of integer token IDs.

    Args:
        tweet: Raw tweet text; it is passed through `preprocess` first.
        vocab: Mapping from token to integer ID.
        unk_token: Key in `vocab` whose ID stands in for tokens missing from it.

    Returns:
        list: One ID per preprocessed token, in token order.
    """
    return [vocab.get(token, vocab[unk_token]) for token in preprocess(tweet)]

In [None]:
# Sanity check: one tweet shown raw, preprocessed, and encoded as IDs.
example_tweet = train["text"][4]
print("Original tweet in the dataset: ", example_tweet)
print("Processed tweet: ", preprocess(example_tweet))
print("Tweet converted into tensor: ", tensor_builder(example_tweet, Vocabulary, '__UNK__'))

In [None]:
#Padding the tensor input vector to the same length
def tensor_pad(tensors, pad_value=0):
    """Right-pad every sequence to the length of the longest one.

    Args:
        tensors: Iterable of sequences (e.g. lists of token IDs).
        pad_value: Value appended to shorter sequences. Defaults to 0.

    Returns:
        list[list]: New lists, all of equal length; the inputs are not
        modified. An empty input yields an empty list instead of raising.
    """
    sequences = [list(seq) for seq in tensors]
    if not sequences:
        # max() of an empty collection would raise ValueError.
        return []
    max_length = max(len(seq) for seq in sequences)
    return [seq + [pad_value] * (max_length - len(seq)) for seq in sequences]

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
def classifier(X_train,Y_train,X_test,vocab_size,embedding_dim,output_dim,
               epochs=20,validation_split=0.2):
    """Train a mean-of-embeddings classifier and predict on `X_test`.

    The network embeds each token ID, averages the embeddings over the
    sequence axis and feeds the result to a sigmoid Dense layer. It is
    compiled with binary cross-entropy, the Adam optimizer and an accuracy
    metric. No masking is configured, so padding positions take part in
    the average.

    Args:
        X_train: Integer token-ID array for training; the input layer is
            declared with shape (None,) and dtype int64 per sample.
        Y_train: Training targets passed to `model.fit`.
        X_test: Integer token-ID array to predict on.
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        output_dim: Number of units in the final sigmoid layer.
        epochs: Number of training epochs. Defaults to 20.
        validation_split: Fraction of the training data held out for
            validation by `model.fit`. Defaults to 0.2.

    Returns:
        The output of `model.predict(X_test)`.
    """
    model = models.Sequential()
    model.add(tf.keras.Input(shape=(None,), dtype="int64"))
    model.add(layers.Embedding(vocab_size, embedding_dim))
    # Built-in equivalent of averaging over axis 1; unlike a Lambda layer it
    # does not embed an anonymous Python function in the model.
    model.add(layers.GlobalAveragePooling1D())
    model.add(layers.Dense(units=output_dim,activation='sigmoid'))
    model.summary()
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(x=X_train,y=Y_train,epochs=epochs,validation_split=validation_split)
    return model.predict(X_test)

In [None]:
# Encode every tweet as its list of vocabulary IDs.
X_train = [tensor_builder(tweet,Vocabulary,'__UNK__') for tweet in train["text"]]
X_test = [tensor_builder(tweet,Vocabulary,'__UNK__') for tweet in test["text"]]

# Pad train and test together so both share one sequence length. The model
# averages embeddings over every position, padding included, so padding the
# two sets to different lengths would scale their inputs inconsistently.
all_padded = tensor_pad(X_train + X_test)
X_train_padded = np.asarray(all_padded[:len(X_train)]).astype('int64')
X_test_padded = np.asarray(all_padded[len(X_train):]).astype('int64')

Y_train = np.asarray(train["target"])
Y_predict = classifier(
    X_train_padded,
    Y_train,
    X_test_padded,
    len(Vocabulary),
    embedding_dim=256,
    output_dim=1,
)

In [None]:
# Threshold the model outputs at 0.5 to obtain hard 0/1 labels. The
# predictions are flattened first so each row yields a single scalar.
predicted_label = (np.asarray(Y_predict).reshape(-1) > 0.5).astype(int).tolist()
submission_df = pd.DataFrame({'id' : test["id"],'target' : predicted_label})
# to_csv returns None when it is given a path, so its result is not kept.
submission_df.to_csv('submission.csv',index=False)