In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense, Dropout, Conv1D, GlobalMaxPool1D

In [None]:
# Read the training CSV: column 3 of each row is stored as the tweet text and
# column 4 is parsed as the integer label.
tweets = []
labels = []

# newline='' lets the csv module handle line breaks embedded in quoted fields
# (tweets can contain them); an explicit encoding avoids depending on the
# platform default.
with open('../input/nlp-getting-started/train.csv', 'r', newline='', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    next(reader)  # skip the header row
    for row in reader:
        tweets.append(str(row[3]))
        labels.append(int(row[4]))

dataset_size = len(tweets)
print('Dataset size = ',dataset_size)

### use regex to clean tweets

### Special Characters
#### In the regex flavors discussed in this tutorial, there are 12 characters with special meanings:
##### the backslash \
##### the caret ^
##### the dollar sign $
##### the period or dot .
##### the vertical bar or pipe symbol |
##### the question mark ?
##### the asterisk or star *
##### the plus sign +
##### the opening parenthesis (
##### the closing parenthesis )
##### the opening square bracket [
##### the opening curly brace {

##### \w  represents any alphanumeric character (including underscore)
##### \d  represents any digit
##### .   represents ANY character except a newline (to match a literal period, escape it as \.)
##### abc literally matches characters abc in a string
##### [abc] matches either a or b or c (characters within the brackets)
##### ?   after a character indicates that the character is optional
##### *   after a character indicates it can be repeated 0 or more times
##### +   after a character indicates it can be repeated 1 or more times
##### \   is used to escape special characters

In [None]:
# Show the punctuation characters listed in the standard library's
# string.punctuation constant.
import string
print(string.punctuation)

In [None]:
# Load the English stop words from the NLTK stopwords corpus into
# stopwords_list (a plain list of words).
import nltk
stopwords_list=nltk.corpus.stopwords.words('english')

In [None]:
# Words that must NOT be treated as stop words: negations, contrast words and
# intensifiers/quantifiers, which can change the meaning of a tweet.
exclude_list = ['but', 'against', 'on', 'off', 'over', 'all', 'any', 'most', 'no', 'nor', 'not', 'so', 'too', 'very', "don't", "aren't",
                     "couldn't", "didn't", "doesn't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "needn't", "shan't",
                     "shouldn't", "wasn't", "weren't", "won't", "wouldn't"]

# Rebuild the list instead of calling list.remove() per word: remove() raises
# ValueError when a word is absent (for example when this cell is run a second
# time), whereas filtering is idempotent and keeps the original order.
excluded_words = set(exclude_list)
stopwords_list = [word for word in stopwords_list if word not in excluded_words]

print(stopwords_list)

In [None]:
import re

# Ordered (compiled pattern, replacement) pairs applied to each lower-cased
# tweet. Order matters: dates and times are removed before bare numbers, and
# the two "strange word" passes run last on the already-simplified text.
_CLEANING_STEPS = (
    # URLs: everything from the scheme up to the next whitespace.
    (re.compile(r'https?://\S+'), ''),
    # Dates such as xx/xx/xxxx, xx/xx/xx, xx/xx, xx-xx-xxxx, xx.xx.xxxx.
    # The separator is an explicit class; a bare '.' would match any character
    # and swallow plain numbers and times.
    (re.compile(r'\(?\d\d?[/.\-]\d\d?(?:[/.\-]\d\d(?:\d\d)?)?\)?'), ''),
    # Times such as 10:30, @10:30 or 10:30 pm. The text is already lower-cased,
    # so the am/pm suffix must be matched in lower case.
    (re.compile(r'\(?@?\d\d?:\d\d(?:\s?[ap]m\b)?'), ''),
    # @mentions followed by whitespace or the end of the tweet. '$' inside a
    # character class is a literal dollar sign, hence the explicit alternation.
    (re.compile(r'@\w*(?:\s|$)'), ' '),
    # Remaining numbers.
    (re.compile(r'\d+'), ' '),
    # Isolated single word characters.
    (re.compile(r'\s\w\s'), ' '),
    # Words containing punctuation or digits from the listed set ("strange"
    # words such as HTML entities). '^' is the real start-of-string anchor and
    # class negation here (not the look-alike U+02C6 character).
    (re.compile(r'(?:^|\s)\S+[\.\?\"\\\$\^\*\+\-_@&=\u00f7;,%/\d]+[^\s\.]*(?:\s|$)'), ' '),
    # Words that start with a non-word character. [^\w\s] is used rather than
    # \W because \W also matches whitespace, which would delete an ordinary
    # word that merely follows two consecutive spaces.
    (re.compile(r'\s[^\w\s]\S*(?:\s|$)'), ' '),
)


def clean_tweets(tweets, stop_words=None):
    """Normalize raw tweets for tokenization.

    Each tweet is lower-cased, passed through the regex substitutions in
    ``_CLEANING_STEPS`` (URLs, dates, times, @mentions, numbers, single
    characters and "strange" words), split on whitespace, stripped of stop
    words and re-joined with single spaces. No stemming is applied.

    Args:
        tweets: iterable of tweet strings.
        stop_words: optional iterable of words to drop. When omitted, the
            module-level ``stopwords_list`` is used.

    Returns:
        A list with one cleaned string per input tweet, in the same order.
    """
    if stop_words is None:
        stop_words = stopwords_list
    # A set makes the per-word membership test constant time.
    stop_words = set(stop_words)

    cleaned_tweets = []
    for tweet in tweets:
        clean_tweet = tweet.lower()
        for pattern, replacement in _CLEANING_STEPS:
            clean_tweet = pattern.sub(replacement, clean_tweet)
        # split()/join also collapses the runs of spaces left by the substitutions.
        clean_tweet = ' '.join(word for word in clean_tweet.split() if word not in stop_words)
        cleaned_tweets.append(clean_tweet)

    return cleaned_tweets

In [None]:
# clean training dataset: apply clean_tweets to every raw training tweet
cleaned_tweets = clean_tweets(tweets)

In [None]:
# Show a sample of training tweets before and after cleaning.
start = 250
end = 270

# Iterate over paired slices instead of `for id in range(...)`: that loop
# rebinds the builtin `id` for the rest of the kernel session, and slices
# simply yield fewer rows when the dataset has fewer than `end` tweets.
for original_tweet, cleaned_tweet in zip(tweets[start:end], cleaned_tweets[start:end]):
    print('original : ', original_tweet)
    print('_cleaned : ', cleaned_tweet, '\n')

In [None]:
# Percentage of cleaned tweets whose length is below 90, 100 and 110.
# NOTE(review): len() of a cleaned tweet counts characters, whereas max_length
# in the padding step applies to token sequences — confirm this is the
# intended measure.
length_090 = sum(1 for cleaned in cleaned_tweets if len(cleaned) < 90)
length_100 = sum(1 for cleaned in cleaned_tweets if len(cleaned) < 100)
length_110 = sum(1 for cleaned in cleaned_tweets if len(cleaned) < 110)

print('length_090 =', length_090*100/dataset_size)
print('length_100 =', length_100*100/dataset_size)
print('length_110 =', length_110*100/dataset_size)


In [None]:
# nlp preprocessing params
train_size = 7000        # number of leading samples used for training; the rest is validation
vocab_size = 10000       # passed as Tokenizer num_words and as the Embedding input size
oov_tok = '<OOV>'        # token the Tokenizer uses for out-of-vocabulary words
embedding_dim=32         # size of each embedding vector
trunc_type = 'pre'       # pad_sequences `truncating` mode
padding_type = 'pre'     # pad_sequences `padding` mode
max_length = 100         # pad_sequences `maxlen` and Embedding input_length

In [None]:
# Split dataset into train and validation sets by position: the first
# train_size samples are used for training and the remainder for validation
# (no shuffling is done here).
train_tweets = np.array(cleaned_tweets[:train_size])
train_labels = np.array(labels[:train_size])
valid_tweets = np.array(cleaned_tweets[train_size:])
valid_labels = np.array(labels[train_size:])

In [None]:
# Tokenization: fit a Keras Tokenizer (num_words=vocab_size, OOV token oov_tok)
# on the training tweets only, then print the number of entries in its
# word_index mapping.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_tweets)
word_index = tokenizer.word_index
print(len(word_index))

In [None]:
def _encode_texts(texts):
    """Return (integer sequences, padded array) for `texts` using the fitted tokenizer."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


# Encode both splits with the same tokenizer and padding settings.
train_sequences, train_padded = _encode_texts(train_tweets)
valid_sequences, valid_padded = _encode_texts(valid_tweets)

In [None]:
# Show only the first 20 (word, index) pairs; printing the whole mapping dumps
# every vocabulary entry into the notebook output.
print(dict(list(word_index.items())[:20]))

In [None]:
# 1D-convolutional binary classifier over the padded token sequences:
# embedding -> dropout -> Conv1D -> global max pooling -> dense -> dropout ->
# single sigmoid output.
model_conv = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Dropout(0.2),
    Conv1D(128, 7, activation='relu'),
    GlobalMaxPool1D(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# `learning_rate` is the supported keyword for the optimizer; the older `lr`
# alias is deprecated and rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

model_conv.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model_conv.summary()

# Train on the training split and report loss/accuracy on the validation split
# after every epoch.
history1 = model_conv.fit(
    train_padded,
    train_labels,
    epochs=30,
    validation_data=(valid_padded, valid_labels),
    batch_size=32,
)

In [None]:
# Prepare the test data: read the CSV, then clean, tokenize and pad the tweets
# with the same function, tokenizer and padding settings used for training.
test_tweets = []
test_id = []

# newline='' lets the csv module handle line breaks embedded in quoted fields;
# an explicit encoding avoids depending on the platform default.
with open('../input/nlp-getting-started/test.csv', 'r', newline='', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    next(reader)  # skip the header row
    for row in reader:
        test_tweets.append(str(row[3]))
        test_id.append(row[0])

# clean test tweets
cleaned_test_tweets = clean_tweets(test_tweets)

test_sequences = tokenizer.texts_to_sequences(cleaned_test_tweets)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Show a sample of test tweets before and after cleaning.
start = 100
end = 110

# Iterate over paired slices instead of `for id in range(...)`: that loop
# rebinds the builtin `id` for the rest of the kernel session, and slices
# simply yield fewer rows when fewer than `end` tweets are available.
for original_tweet, cleaned_tweet in zip(test_tweets[start:end], cleaned_test_tweets[start:end]):
    print('original : ', original_tweet)
    print('_cleaned : ', cleaned_tweet, '\n')

In [None]:
# Prediction: run the trained model on the padded test sequences, print the
# shape of the result and display the raw model outputs as the cell's value.
test_pred = model_conv.predict(test_padded)
print(test_pred.shape)
test_pred

In [None]:
# Convert the model outputs into 0/1 labels: 1 when the output is strictly
# above the threshold, 0 otherwise. A single vectorized comparison replaces
# the per-row Python loop and keeps the shape of test_pred.
PREDICTION_THRESHOLD = 0.6
test_pred_bool = (test_pred > PREDICTION_THRESHOLD).astype(int)

In [None]:
# Build the submission table in one step: one row per test tweet with its id
# and predicted label (the label array is flattened to one dimension).
prediction_file = pd.DataFrame({
    'id': test_id,
    'target': test_pred_bool.ravel(),
})

In [None]:
# Show a sample of test predictions.
# A bare `prediction_file.head()` that is not the cell's last expression is
# evaluated and discarded, so print it explicitly.
print(prediction_file.head())

start = 0
end = 10
# Pair each raw test tweet with its predicted label.
for test_tweet, predicted_label in zip(test_tweets[start:end], test_pred_bool[start:end]):
    print(test_tweet)
    print(predicted_label,'\n')

In [None]:
# Report how many test tweets were assigned to each predicted class.
print('Positive tweets prediction = ', int((prediction_file['target'] == 1).sum()))
print('Negative tweets prediction = ', int((prediction_file['target'] == 0).sum()))

In [None]:
# Write the predictions to submission.csv without the DataFrame index column.
prediction_file.to_csv('submission.csv', index=False)