In [None]:
# modeling
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers as L
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# result
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
# graph settings
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'; newer
# releases no longer register the legacy name, so fall back to the renamed
# style whenever the old one is not available.
graph_style = 'seaborn-whitegrid'
if graph_style not in plt.style.available:
    graph_style = 'seaborn-v0_8-whitegrid'
plt.style.use(graph_style)
plt.rcParams['figure.figsize'] = (12,5)    # default figure size (width, height)

# data preprocessing
from sklearn.model_selection import train_test_split

# data wrangling
import numpy as np 
import pandas as pd

# corpus
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# string manipulation
import re
import spacy
import collections

In [None]:
import nltk 
# Download the NLTK "stopwords" corpus; the cleaning step later reads the
# English stop-word list from it via nltk.corpus.stopwords.
nltk.download("stopwords") 

## Import Data

In [None]:
# Read the tweet dataset and discard the 'Unnamed: 0' column in one chained
# expression, then preview the first rows.
df = (
    pd.read_csv("../input/toxic-tweets-dataset/FinalBalancedDataset.csv")
    .drop(columns=['Unnamed: 0'])
)
df.head()

## Data Visualization (ALL)

In [None]:
import wordcloud
from wordcloud import WordCloud

# Concatenate every tweet into one string and render it as a word cloud.
allWords = ' '.join(df['tweet'])
wordCloud = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(allWords)

plt.figure(figsize=(10, 8))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')   # the cloud is an image, so axes carry no information
plt.show()

## Visualize Toxic Tweets

In [None]:
# Word cloud restricted to the tweets whose Toxicity label equals 1.
allWords = ' '.join(df.loc[df['Toxicity'] == 1, 'tweet'])
wordCloud = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(allWords)

plt.figure(figsize=(10, 8))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

## Visualize Non-Toxic Tweets

In [None]:
# Word cloud restricted to the tweets whose Toxicity label equals 0.
allWords = ' '.join(df.loc[df['Toxicity'] == 0, 'tweet'])
wordCloud = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(allWords)

plt.figure(figsize=(10, 8))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

## Cleaning Data

In [None]:
# Independent copies of the text column (features) and the label column
# (targets), so later processing does not touch df itself.
X, y = df['tweet'].copy(), df['Toxicity'].copy()

In [None]:
def data_cleaner(tweet):
    """Normalize one raw tweet into a cleaned, stemmed, space-separated string.

    Steps, in order: lowercase; replace a handful of emoticons with words;
    strip URLs, HTML tags, hashtags, mentions, digits and punctuation; drop
    every character that is not an ASCII letter, digit or space; remove stop
    words; stem each remaining word individually.

    Relies on two module-level names that must exist before the first call:
    ``stop_words`` (a container of words to drop) and ``stemmer`` (an object
    exposing ``stem(word)``).

    Args:
        tweet: Raw tweet text as a ``str``.

    Returns:
        The cleaned tweet; may be the empty string if nothing survives.
    """
    tweet = tweet.lower()

    # Emoticons -> words. Must run before digit/punctuation removal, which
    # would otherwise destroy them.
    # NOTE(review): the replacement words look Indonesian ("sedih", "senang",
    # "lucu") -- confirm this is intended for this corpus.
    tweet = tweet.replace(":("," sedih")
    tweet = tweet.replace(":)"," senang")
    tweet = tweet.replace(":3"," lucu")
    tweet = tweet.replace(":d"," senang")
    tweet = tweet.replace(":-)"," senang")
    tweet = tweet.replace("=)"," senang")

    tweet = re.sub(r'http\S+', ' ', tweet)   # remove urls
    tweet = re.sub(r'<.*?>',' ', tweet)      # remove html tags
    # Hashtags and mentions are removed BEFORE digits: stripping digits first
    # turns "#2020vision" into "# vision", which the hashtag pattern no longer
    # matches, so the tail of the tag would leak into the text.
    tweet = re.sub(r'#\w+',' ', tweet)       # remove hashtags
    tweet = re.sub(r'@\w+',' ', tweet)       # remove mentions
    tweet = re.sub(r'\d+',' ', tweet)        # remove digits
    tweet = re.sub(r'[^\w\s]',' ', tweet)    # remove punctuation
    # Collapse newlines/tabs to plain spaces first; the filter below deletes
    # anything that is not a literal space and would glue "a\nb" into "ab".
    tweet = re.sub(r'\s+', ' ', tweet)
    tweet = re.sub('[^A-Za-z0-9 ]+', '', tweet) # remove characters that are not letters or numbers

    words = [word for word in tweet.split() if word not in stop_words]   # remove stop words
    # The stemmer works on single words; stemming the whole sentence as one
    # string would only touch its tail.
    return " ".join(stemmer.stem(word) for word in words)

In [None]:
# Stop words are kept in a set so the per-word membership test performed by
# data_cleaner is a hash lookup instead of a scan over the whole list.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

# Clean every tweet; the result is a Series of cleaned strings.
X_cleaned = X.apply(data_cleaner)
X_cleaned

## Tokenizing

In [None]:
# Fit the tokenizer on the cleaned tweets and encode each tweet as a list of
# integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_cleaned)
X = tokenizer.texts_to_sequences(X_cleaned)
vocab_size = 1 + len(tokenizer.word_index)   # one more than the number of indexed words

print(f"Vocabulary size: {vocab_size}")
print("\nExample:\n")
print(f"Sentence:\n{X_cleaned[0]}")
print(f"\nAfter tokenizing :\n{X[0]}")

# Pad at the end ('post') so every sequence has the same length.
X = pad_sequences(X, padding='post')
print(f"\nAfter padding :\n{X[0]}")

## Check Distribution of Class

In [None]:
# Bar chart with one bar per Toxicity value, showing how many rows carry it.
sns.countplot(data=df, x="Toxicity")

## Train Test Split

In [None]:
# Hold out 25% of the samples for testing; the fixed random_state makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.25,
)

## Model Building and Training

In [None]:
# Drop any graph/layer state left over from a previous run of this cell.
tf.keras.backend.clear_session()

# hyperparameters
EPOCHS = 2
BATCH_SIZE = 64
embedding_dim = 64
units = 256

# Embedding -> bidirectional GRU (full sequence output) -> max-pool over the
# time axis -> dense head with dropout -> 2-way softmax.
model = tf.keras.Sequential([
    L.Embedding(vocab_size, embedding_dim, input_length=X.shape[1]),
    L.Bidirectional(L.GRU(units, return_sequences=True)),
    L.GlobalMaxPool1D(),
    L.Dropout(0.4),
    L.Dense(512, activation="sigmoid"),
    L.Dropout(0.4),
    L.Dense(2, activation = "softmax")
])

# The output layer already applies softmax, so the loss receives
# probabilities, not logits. from_logits=True would apply softmax a second
# time inside the loss and distort the loss values and gradients.
model.compile(
    loss=SparseCategoricalCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

In [None]:
# Train with the EPOCHS / BATCH_SIZE hyperparameters defined alongside the
# model rather than a hard-coded epoch count, so tuning them in one place
# takes effect here. The test split doubles as the validation set.
history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
    batch_size=BATCH_SIZE,
)

After 2 epochs, the model begins to overfit.

## Predict Data Test

In [None]:
# Sequential.predict_classes is not available in newer TensorFlow releases;
# the equivalent for a softmax output is the arg-max over the class axis.
predicted = np.argmax(model.predict(X_test), axis=-1)
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss: {}'.format(loss))
print('Test Accuracy: {}'.format(acc))

### Confusion Matrix

In [None]:
labels = ['Non-Toxic','Toxic']

# Confusion counts between the true test labels and the predicted classes,
# wrapped in a DataFrame so the heatmap shows class names on both axes.
conf = confusion_matrix(y_true=y_test, y_pred=predicted)
cm = pd.DataFrame(conf, index=labels, columns=labels)

sns.heatmap(cm, annot=True, fmt="d")   # fmt="d": print the counts as integers
plt.show()

### Classification Report

In [None]:
# Text report of the per-class metrics for the test predictions, labelled
# with the class names.
print(
    classification_report(
        y_true=y_test,
        y_pred=predicted,
        target_names=labels,
    )
)