In [None]:
# Install TensorFlow Text, the TF Model Garden package and TensorFlow Hub into the runtime.
# NOTE(review): no versions are pinned, so a re-run may resolve different releases.
!pip install -q -U tensorflow-text
!pip install -q tf-models-official
!pip install tensorflow_hub

# **Note:** 
It is not recommended to remove features such as stopwords, numbers, repeating characters, or punctuation, or to apply stemming and lemmatization, when using a BERT model. BERT is a pre-trained model, and removing these features can have a negative impact on accuracy, because a single punctuation mark or a run of repeated characters or words can also express emotion. You can always run experiments to check what impact each feature has; it will vary with the dataset.

In [None]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import wordcloud
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from keras import backend as K
from transformers import AutoTokenizer,TFBertModel
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy, BinaryAccuracy
from tensorflow.keras.losses import CategoricalCrossentropy,BinaryCrossentropy


sns.set_style("whitegrid")

In [None]:
# Read the train and test CSV files from the working directory and preview the training rows.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

In [None]:
# Discard the 'keyword' and 'location' columns from both frames; they are not used below.
train = train.drop(columns=['keyword','location'])
test = test.drop(columns=['keyword','location'])
train.head()

In [None]:
# Checking Shape of Train and Test sets:
for set_label, frame in (("Shape of Train set:", train), ("Shape of Test set:", test)):
    print(set_label, frame.shape)

# Labels are as follows:
label '1' ---> racist/sexist tweet           
label '0' ---> not racist/sexist tweet

In [None]:
# Work on a copy so `train` stays untouched, then show how many rows each target class has.
df = train.copy()
df['target'].value_counts()

# 1. Model without removing any feature:

### Splitting data into Train and Test sets

In [None]:
# One-hot encode the two-class target, then hold out 20% of the rows as a test set.
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=3
)

# BERT 
### Base Model with Neural Networks:

In [None]:
# Load the cased English BERT preprocessing layer and the matching cased
# L-24 / H-1024 / A-16 encoder from TF Hub as Keras layers.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/4")

In [None]:
# Checking array created using BERT:
def get_sentence_embedding(sentences):
    """Return the encoder's 'pooled_output' for a batch of raw sentences.

    The sentences are first passed through `bert_preprocess`, and the result
    is fed to `bert_encoder`.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

get_sentence_embedding(["You are noob.","What are you looking at?"])

In [None]:
# Bert layers:
# A scalar string input is passed through the preprocessing layer and then the encoder.
num_classes = 2
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers:
# Dropout on the encoder's pooled output, followed by one Dense unit per class.
# NOTE(review): the labels are one-hot (to_categorical) but the head uses independent
# sigmoids with binary cross-entropy, so the two outputs are not constrained to sum
# to 1. Softmax + categorical cross-entropy is the usual pairing; confirm this is intended.
l = tf.keras.layers.Dropout(0.2, name='dropout')(outputs['pooled_output'])
l = tf.keras.layers.Dense(num_classes, activation='sigmoid', name='output')(l)

# Construct final model:
model = tf.keras.Model(inputs=[text_input], outputs=[l])

model.summary()

# You can use these METRICS as well. If you are using this then change 'metrics=METRICS' in 'model.compile' section.
# METRICS = [
#            tf.keras.metrics.BinaryAccuracy(name='accuracy'),
#            tf.keras.metrics.Precision(name='precision'),
#            tf.keras.metrics.Recall(name='recall')
# ]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Plotting the model architecture:
tf.keras.utils.plot_model(model)

### Training model:

In [None]:
# Fit for three epochs, holding out 10% of the training rows for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_split=0.1,
)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

### Results:

In [None]:
# Turn the one-hot test labels and the predicted class scores into class indices,
# then report the confusion matrix and the per-class precision/recall/F1.
y_test_arg = np.argmax(y_test, axis=1)
y_pred = np.argmax(model.predict(X_test), axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(classification_report(y_test_arg, y_pred))

### Accuracy vs Loss:

In [None]:
# This builds a graph with all the available metrics of the history.
# Each row of history.history is one epoch, so the x axis is the epoch index.
ax = pd.DataFrame(history.history).plot(figsize=(10,6))
ax.set(title='Training history: no features removed', xlabel='Epoch', ylabel='Metric value')
plt.show()

# 2. Model after removing stopwords:

In [None]:
# Start this experiment from a fresh copy of the training frame.
df = train.copy()

### Removing Stopwords:

In [None]:
# A set gives constant-time membership tests. Comparing the lower-cased word also
# removes capitalised stopwords such as "The", which a case-sensitive comparison
# against the lower-case NLTK list would keep.
sw = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word.lower() not in sw))

### Splitting data into Train and Test sets

In [None]:
# One-hot encode the two-class target, then hold out 20% of the rows as a test set.
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=3
)

### Training model:

In [None]:
# `model` was already fitted in the previous experiment, so fitting it again would
# continue from those weights and blur the comparison between setups.
# bert_encoder is a hub.KerasLayer created without trainable=True (the hub default
# is frozen), which leaves the 'output' Dense head as the only trained layer:
# re-draw the head and recompile (fresh Adam state) so this run starts from scratch.
head = model.get_layer('output')
head.set_weights([
    head.kernel_initializer(head.kernel.shape).numpy(),
    head.bias_initializer(head.bias.shape).numpy(),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=3, validation_split=0.1)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

### Results:

In [None]:
# Turn the one-hot test labels and the predicted class scores into class indices,
# then report the confusion matrix and the per-class precision/recall/F1.
y_test_arg = np.argmax(y_test, axis=1)
y_pred = np.argmax(model.predict(X_test), axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(classification_report(y_test_arg, y_pred))

### Accuracy vs Loss:

In [None]:
# This builds a graph with all the available metrics of the history.
# Each row of history.history is one epoch, so the x axis is the epoch index.
ax = pd.DataFrame(history.history).plot(figsize=(10,6))
ax.set(title='Training history: stopwords removed', xlabel='Epoch', ylabel='Metric value')
plt.show()

# 3. Model after removing repeating characters:

In [None]:
# Start this experiment from a fresh copy of the training frame.
df = train.copy()

### Removing repeating characters:

In [None]:
# Split every tweet into a list of word tokens.
df['text'] = df['text'].apply(nltk.word_tokenize)

# Matches one character followed by any number of repetitions of that same character.
pattern = re.compile(r'(.)\1*')

def reduce_sequence_word(word):
    """Return `word` with every run of one repeated character capped at two characters."""
    pieces = []
    for run in pattern.finditer(word):
        # Slicing leaves runs of length one or two unchanged.
        pieces.append(run.group()[:2])
    return ''.join(pieces)

def reduce_sequence_tweet(tweet):
    """Apply reduce_sequence_word to each token in `tweet` and return the results as a list."""
    return list(map(reduce_sequence_word, tweet))

# Cap repeated-character runs in every token of every tweet.
df['text'] = df['text'].apply(reduce_sequence_tweet)

# Detokenizing tweets:

def listToString(s):
    """Join the strings in `s` into one string, separated by single spaces."""
    separator = " "
    return separator.join(s)

# Turn each token list back into a single space-separated string.
df['text'] = df['text'].apply(listToString)

### Splitting data into Train and Test sets

In [None]:
# One-hot encode the two-class target, then hold out 20% of the rows as a test set.
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=3
)

### Training model:

In [None]:
# `model` was already fitted in the previous experiment, so fitting it again would
# continue from those weights and blur the comparison between setups.
# bert_encoder is a hub.KerasLayer created without trainable=True (the hub default
# is frozen), which leaves the 'output' Dense head as the only trained layer:
# re-draw the head and recompile (fresh Adam state) so this run starts from scratch.
head = model.get_layer('output')
head.set_weights([
    head.kernel_initializer(head.kernel.shape).numpy(),
    head.bias_initializer(head.bias.shape).numpy(),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=3, validation_split=0.1)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

### Results:

In [None]:
# Turn the one-hot test labels and the predicted class scores into class indices,
# then report the confusion matrix and the per-class precision/recall/F1.
y_test_arg = np.argmax(y_test, axis=1)
y_pred = np.argmax(model.predict(X_test), axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(classification_report(y_test_arg, y_pred))

### Accuracy vs Loss:

In [None]:
# This builds a graph with all the available metrics of the history.
# Each row of history.history is one epoch, so the x axis is the epoch index.
ax = pd.DataFrame(history.history).plot(figsize=(10,6))
ax.set(title='Training history: repeating characters reduced', xlabel='Epoch', ylabel='Metric value')
plt.show()

# 4. Model after removing Punctuations:

In [None]:
# Start this experiment from a fresh copy of the training frame.
df = train.copy()

### Removing Punctuations:

In [None]:
import string
string.punctuation

In [None]:
punctuations_list = string.punctuation
# Build the deletion table once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)

def cleaning_punctuations(text):
    """Return `text` with every character of string.punctuation removed."""
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every tweet.
df['text'] = df['text'].apply(cleaning_punctuations)

### Splitting data into Train and Test sets

In [None]:
# One-hot encode the two-class target, then hold out 20% of the rows as a test set.
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=3
)

### Training model:

In [None]:
# `model` was already fitted in the previous experiment, so fitting it again would
# continue from those weights and blur the comparison between setups.
# bert_encoder is a hub.KerasLayer created without trainable=True (the hub default
# is frozen), which leaves the 'output' Dense head as the only trained layer:
# re-draw the head and recompile (fresh Adam state) so this run starts from scratch.
head = model.get_layer('output')
head.set_weights([
    head.kernel_initializer(head.kernel.shape).numpy(),
    head.bias_initializer(head.bias.shape).numpy(),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=3, validation_split=0.1)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

### Results:

In [None]:
# Turn the one-hot test labels and the predicted class scores into class indices,
# then report the confusion matrix and the per-class precision/recall/F1.
y_test_arg = np.argmax(y_test, axis=1)
y_pred = np.argmax(model.predict(X_test), axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(classification_report(y_test_arg, y_pred))

### Accuracy vs Loss:

In [None]:
# This builds a graph with all the available metrics of the history.
# Each row of history.history is one epoch, so the x axis is the epoch index.
ax = pd.DataFrame(history.history).plot(figsize=(10,6))
ax.set(title='Training history: punctuation removed', xlabel='Epoch', ylabel='Metric value')
plt.show()

# 5. Model after removing numbers:

In [None]:
# Start this experiment from a fresh copy of the training frame.
df = train.copy()

### Removing numbers:

In [None]:
def cleaning_numbers(text):
    """Return `text` with every run of ASCII digits deleted."""
    digit_runs = re.compile('[0-9]+')
    return digit_runs.sub('', text)

# Strip digits from every tweet.
df['text'] = df['text'].apply(cleaning_numbers)

### Splitting data into Train and Test sets

In [None]:
# One-hot encode the two-class target, then hold out 20% of the rows as a test set.
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=3
)

### Training model:

In [None]:
# `model` was already fitted in the previous experiment, so fitting it again would
# continue from those weights and blur the comparison between setups.
# bert_encoder is a hub.KerasLayer created without trainable=True (the hub default
# is frozen), which leaves the 'output' Dense head as the only trained layer:
# re-draw the head and recompile (fresh Adam state) so this run starts from scratch.
head = model.get_layer('output')
head.set_weights([
    head.kernel_initializer(head.kernel.shape).numpy(),
    head.bias_initializer(head.bias.shape).numpy(),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=3, validation_split=0.1)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

### Results:

In [None]:
# Turn the one-hot test labels and the predicted class scores into class indices,
# then report the confusion matrix and the per-class precision/recall/F1.
y_test_arg = np.argmax(y_test, axis=1)
y_pred = np.argmax(model.predict(X_test), axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(classification_report(y_test_arg, y_pred))

### Accuracy vs Loss:

In [None]:
# This builds a graph with all the available metrics of the history.
# Each row of history.history is one epoch, so the x axis is the epoch index.
ax = pd.DataFrame(history.history).plot(figsize=(10,6))
ax.set(title='Training history: numbers removed', xlabel='Epoch', ylabel='Metric value')
plt.show()

# 6. Model after applying Stemming:

In [None]:
# Start this experiment from a fresh copy of the training frame.
df = train.copy()

### Applying Stemming: 

In [None]:
# Tokenize each tweet, stem every token, then join the stems back into one
# space-separated string. If the column were left as token lists, the later
# `.astype(str)` would hand BERT Python list reprs such as "['foo', 'bar']"
# (brackets, quotes and commas included) instead of text.
stemm = SnowballStemmer('english')
df['text'] = df['text'].apply(
    lambda tweet: ' '.join(stemm.stem(token) for token in nltk.word_tokenize(tweet))
)

### Splitting data into Train and Test :

In [None]:
# Cast both columns to str, one-hot encode the target, then hold out 20% of the rows for testing.
y = tf.keras.utils.to_categorical(df['target'].astype(str), num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].astype(str), y, test_size=0.2, random_state=3
)

### Training model:

In [None]:
# `model` was already fitted in the previous experiment, so fitting it again would
# continue from those weights and blur the comparison between setups.
# bert_encoder is a hub.KerasLayer created without trainable=True (the hub default
# is frozen), which leaves the 'output' Dense head as the only trained layer:
# re-draw the head and recompile (fresh Adam state) so this run starts from scratch.
head = model.get_layer('output')
head.set_weights([
    head.kernel_initializer(head.kernel.shape).numpy(),
    head.bias_initializer(head.bias.shape).numpy(),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=3, validation_split=0.1)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

# Results:

In [None]:
# Turn the one-hot test labels and the predicted class scores into class indices,
# then report the confusion matrix and the per-class precision/recall/F1.
y_test_arg = np.argmax(y_test, axis=1)
y_pred = np.argmax(model.predict(X_test), axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(classification_report(y_test_arg, y_pred))

### Accuracy vs Loss:

In [None]:
# This builds a graph with all the available metrics of the history.
# Each row of history.history is one epoch, so the x axis is the epoch index.
ax = pd.DataFrame(history.history).plot(figsize=(10,6))
ax.set(title='Training history: stemming applied', xlabel='Epoch', ylabel='Metric value')
plt.show()

# 7. Models after removing all the features:

In [None]:
# Start this experiment from a fresh copy of the training frame.
df = train.copy()

### Removing Punctuations:

In [None]:
import string
string.punctuation

In [None]:
# Strip punctuation from every tweet. cleaning_punctuations is the helper already
# defined in experiment 4; it is reused here rather than redefined, because a
# second identical definition would only shadow the first one.
df['text'] = df['text'].apply(cleaning_punctuations)

### Removing Stopwords:

In [None]:
# A set gives constant-time membership tests. Comparing the lower-cased word also
# removes capitalised stopwords such as "The", which a case-sensitive comparison
# against the lower-case NLTK list would keep.
sw = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word.lower() not in sw))

### Removing Numbers:

In [None]:
# Strip digits from every tweet. cleaning_numbers is the helper already defined in
# experiment 5; it is reused here rather than redefined as an identical copy.
df['text'] = df['text'].apply(cleaning_numbers)

### Removing repeating characters:

In [None]:
# Tokenize each tweet, cap every run of a repeated character at two characters,
# then join the tokens back into one space-separated string.
# reduce_sequence_tweet and listToString are the helpers already defined in
# experiment 3; they are reused here rather than redefined as identical copies.
df['text'] = df['text'].apply(nltk.word_tokenize)
df['text'] = df['text'].apply(reduce_sequence_tweet)
df['text'] = df['text'].apply(listToString)

### Applying Stemming: 

In [None]:
# Tokenize each tweet, stem every token, then join the stems back into one
# space-separated string. If the column were left as token lists, the later
# `.astype(str)` would hand BERT Python list reprs such as "['foo', 'bar']"
# (brackets, quotes and commas included) instead of text.
stemm = SnowballStemmer('english')
df['text'] = df['text'].apply(
    lambda tweet: ' '.join(stemm.stem(token) for token in nltk.word_tokenize(tweet))
)

### Splitting data into Train and Test sets

In [None]:
# Cast both columns to str, one-hot encode the target, then hold out 20% of the rows for testing.
y = tf.keras.utils.to_categorical(df['target'].astype(str), num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].astype(str), y, test_size=0.2, random_state=3
)

### Training model:

In [None]:
# `model` was already fitted in the previous experiment, so fitting it again would
# continue from those weights and blur the comparison between setups.
# bert_encoder is a hub.KerasLayer created without trainable=True (the hub default
# is frozen), which leaves the 'output' Dense head as the only trained layer:
# re-draw the head and recompile (fresh Adam state) so this run starts from scratch.
head = model.get_layer('output')
head.set_weights([
    head.kernel_initializer(head.kernel.shape).numpy(),
    head.bias_initializer(head.bias.shape).numpy(),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=3, validation_split=0.1)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

### Results:

In [None]:
# Turn the one-hot test labels and the predicted class scores into class indices,
# then report the confusion matrix and the per-class precision/recall/F1.
y_test_arg = np.argmax(y_test, axis=1)
y_pred = np.argmax(model.predict(X_test), axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(classification_report(y_test_arg, y_pred))

### Accuracy vs Loss:

In [None]:
# This builds a graph with all the available metrics of the history.
# Each row of history.history is one epoch, so the x axis is the epoch index.
ax = pd.DataFrame(history.history).plot(figsize=(10,6))
ax.set(title='Training history: all preprocessing steps applied', xlabel='Epoch', ylabel='Metric value')
plt.show()