In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from keras import Sequential
from keras.layers import GRU, Embedding, Dense
from keras.callbacks import EarlyStopping
from keras.metrics import AUC

In [None]:
# Load the training set. Every missing value (NaN) is turned into an empty
# string so later cells can find missing entries with `== ''`.
train = pd.read_csv('train.csv').fillna('')
# Load the test set
test = pd.read_csv('test.csv')
# Load the sample submission
samp = pd.read_csv('sample_submission.csv')

In [None]:
#EDA
# DataFrame.info() prints its report itself and returns None, so it is called
# bare: wrapping it in print() adds a stray "None" line to the cell output.
train.info()
# Last expression of the cell, so the notebook renders the first rows as a table.
train.head()

In [None]:
#Instances By Class
# Bar chart of how many training rows carry each target value.
class_counts = train['target'].value_counts()
ax = class_counts.plot.bar(color = ['blue', 'orange'])
ax.set_title('Instances by Class')
ax.set_xlabel('Classes')
ax.set_ylabel('Instances')
plt.show()

In [None]:
#Charts for missing (empty-string) values in keyword and location by class
# The classes are put in a fixed, sorted order before plotting so that a given
# colour always denotes the same class in both panels, independent of which
# class happens to have more missing values.
classes = sorted(train['target'].unique())
fig, axes = plt.subplots(1, 2)
for ax, column in zip(axes, ['keyword', 'location']):
    label = column.capitalize()
    missing_by_class = (
        train.loc[train[column] == '', 'target']
        .value_counts()
        .reindex(classes, fill_value=0)  # keep a bar for a class with no missing rows
    )
    missing_by_class.plot.bar(ax=ax, color=['blue', 'orange'])
    ax.set_title(f'{label} Null Values by Class')
    ax.set_xlabel('Classes')
    ax.set_ylabel(f'{label} Null Values')
fig.tight_layout()
plt.show()

In [None]:
##Data Preprocessing
# Order matters: HTML markup and URLs can only be recognised while their
# delimiters ('<', '>', '://', '.', '/') are still in the text, so both are
# removed BEFORE punctuation is stripped.

#removing html tags with Beautiful Soup
def html_remove(x):
    """Return the text content of `x` with HTML markup stripped (lxml parser)."""
    return BeautifulSoup(x, 'lxml').get_text()
train['text'] = train['text'].apply(html_remove)

#removing URLs (anything starting with http://, https:// or www. up to the next whitespace)
train['text'] = train['text'].str.replace(r'https?://\S+|www\.\S+', '', regex = True)
#removing what is not word sentence or number in text
train['text'] = train['text'].str.replace(r'[^\w\s]+', '', regex = True)
#removing words with numbers
train['text'] = train['text'].str.replace(r'\w*\d\w*', '', regex= True)

#making all words lowercase
train['text'] = train['text'].str.lower()

#Removing stop words
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Tokenize `text` with NLTK and re-join the tokens that are not English stop words.

    The tokens are joined with single spaces, so the original whitespace is
    normalised as a side effect.
    """
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token not in stop_words)
train['text'] = train['text'].apply(remove_stopwords)

In [None]:
#Tokenizing Text train data
#source: https://medium.datadriveninvestor.com/padding-used-in-nlp-are-they-improvers-2f4613bd3648
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train['text'])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
sequences = tokenizer.texts_to_sequences(train['text'])
# Pad to the longest training sequence. `maxlen` is a number of tokens per
# text, not a vocabulary size: using vocab_size here made every row as long as
# the whole vocabulary, almost all of it padding.
max_len = max(max((len(seq) for seq in sequences), default=1), 1)
padded_sequences = pad_sequences(sequences, padding='post', truncating='post', maxlen=max_len)


#Tokenizing Text test
# NOTE(review): test['text'] is still the raw CSV text at this point; the cell
# that cleans the test text sits further down the notebook.
sequences_test = tokenizer.texts_to_sequences(test['text'])
padded_sequences_test = pad_sequences(sequences_test, padding='post', truncating='post', maxlen=max_len)

In [None]:
# Show the shapes of the padded train and test matrices, one per line.
for padded in (padded_sequences, padded_sequences_test):
    print(padded.shape)

In [None]:
# Number of distinct words the tokenizer saw while fitting.
len(word_index)

In [None]:
#getting X and y
X = train['text']
y = train['target']
#splitting into training and validation set
# random_state pins the shuffle so a re-run yields the same split; stratify
# keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)

In [None]:
# #TFIDF for Train data
# vectorizer = TfidfVectorizer(stop_words='english')
# vector_tfidf = vectorizer.fit_transform(X_train)

# #TFIDF for Valid data
# vector_tfidf_test = vectorizer.transform(X_test)


# #Vocabulary
# vocabulary = np.array(vectorizer.get_feature_names_out())

In [None]:
# print(vocab_size)

In [None]:
vocab_size = len(word_index) + 1
# A Tokenizer built with num_words=N only emits indices below N, so the
# embedding table needs at most N rows; sizing it to the full vocabulary
# allocates rows that can never be looked up.
embedding_rows = vocab_size if tokenizer.num_words is None else min(tokenizer.num_words, vocab_size)
#model
model = Sequential([
    # mask_zero=True: index 0 is the padding value written by pad_sequences
    # (padding='post'), so the GRU skips those steps instead of running its
    # state through the trailing padding.
    Embedding(embedding_rows, 128, input_length=padded_sequences.shape[1], mask_zero=True),
    GRU(64),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  # single probability for the binary target
])
model.summary()

model.compile(
    optimizer = 'adam',
    loss='binary_crossentropy',
    metrics = ['accuracy']
)

In [None]:
# Stop once the monitored validation loss has not improved for 3 epochs and
# roll the model back to the best epoch; without restore_best_weights the
# model keeps the weights of the last (already worse) epoch.
callback = EarlyStopping(patience = 3, restore_best_weights = True)
# Keep the History object so the training curves can be inspected afterwards.
history = model.fit(
    padded_sequences,
    y,
    validation_split=0.2,
    epochs = 30,
    batch_size=256,
    callbacks=[callback]
)

In [None]:
# model.predict() needs input data; called with no argument it raises a
# TypeError. NOTE(review): the intended input is not visible here, so this
# assumes a sanity check on the training sequences; confirm.
predictions = model.predict(padded_sequences)

In [None]:
###Test Data

##Data Preprocessing
# Keep these steps in sync with the training-text preprocessing cell.
# html_remove() and remove_stopwords() come from that cell and are reused here
# instead of being defined a second time.
# Order matters: HTML markup and URLs can only be recognised while their
# delimiters ('<', '>', '://', '.', '/') are still in the text, so both are
# removed BEFORE punctuation is stripped.

#removing html tags with Beautiful Soup
test['text'] = test['text'].apply(html_remove)
#removing URLs (anything starting with http://, https:// or www. up to the next whitespace)
test['text'] = test['text'].str.replace(r'https?://\S+|www\.\S+', '', regex = True)
#removing what is not word sentence or number in text
test['text'] = test['text'].str.replace(r'[^\w\s]+', '', regex = True)
#removing words with numbers
test['text'] = test['text'].str.replace(r'\w*\d\w*', '', regex= True)

#making all words lowercase
test['text'] = test['text'].str.lower()

#Removing stop words
test['text'] = test['text'].apply(remove_stopwords)

#Tokenizing the cleaned test text
# The tokenizing cell earlier in the notebook encoded test['text'] before it
# was cleaned, so the sequences are rebuilt here from the cleaned text. They
# are padded to the width of the training matrix the model was built for.
sequences_test = tokenizer.texts_to_sequences(test['text'])
padded_sequences_test = pad_sequences(
    sequences_test,
    padding='post',
    truncating='post',
    maxlen=padded_sequences.shape[1]
)





In [None]:
# Shape of the padded test matrix as a (rows, sequence length) tuple.
np.shape(padded_sequences_test)

In [None]:
# Run the trained model over the padded test sequences.
predictions = model.predict(x=padded_sequences_test)