Now that we have established techniques to handle and preprocess data efficiently, we will explore the world of NLP, beginning with sentiment analysis on the Twitter dataset.

In [None]:
%%bash
# List the contents of the working directory. `&&` skips the listing when the
# directory cannot be entered, instead of silently listing the current
# directory and reporting success.
cd ../working/ && ls

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np

# Reading Data

In [None]:
# Read the training and test tweets from the input directory, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/twitter-sentiment-analysis-hatred-speech/train.csv',
        '../input/twitter-sentiment-analysis-hatred-speech/test.csv',
    )
)

# Pre Processing

Since the classes are highly imbalanced, we create a training dataset that contains a random sample with an equal number of hateful and non-hateful tweets.

In [None]:
# Keep every hateful tweet (label 1) and draw an equally sized random sample of
# non-hateful tweets (label 0) so both classes are balanced.
# The sample size is derived from the data instead of being hard-coded, and the
# fixed seed makes the draw repeatable across kernel restarts.
hateful = train[train['label'] == 1]
non_hateful = train[train['label'] == 0].sample(n=len(hateful), random_state=42)

In [None]:
# Stack both classes and shuffle the rows. `DataFrame.append` was removed in
# pandas 2.0, so `pd.concat` is used instead; `frac=1` shuffles every row
# without hard-coding the row count, and the seed keeps the order repeatable.
final_data = pd.concat([hateful, non_hateful]).sample(frac=1, random_state=42)

In [None]:
# Inspect the balanced sample: its rows and its summary statistics.
display(final_data)
display(final_data.describe())
# `info()` prints its report itself and returns None, so wrapping it in
# `display()` would only add a stray "None" to the cell output.
final_data.info()

# Train Test Split

In [None]:
def train_test_split(data, train_split=0.8, train_path='train_data.csv', eval_path='eval_data.csv'):
    """Split ``data`` into train/eval CSV files using a stable hash of the ``id`` column.

    Every row is assigned a bucket in ``[0, 10000)`` derived from its ``id``.
    Rows whose bucket is below ``train_split * 10000`` are written to the
    training file; all remaining rows are written to the evaluation file.

    The built-in ``hash()`` is salted per interpreter process for strings, so
    it yields a different split after every kernel restart.
    ``pd.util.hash_pandas_object`` is deterministic, which keeps the split
    reproducible, and it hashes the whole column at once instead of row by row.

    Args:
        data: DataFrame that contains an ``id`` column.
        train_split: Approximate fraction of rows sent to the training file.
        train_path: Destination of the training CSV.
        eval_path: Destination of the evaluation CSV.

    Returns:
        None. The two CSV files are written as a side effect.
    """
    id_hashes = pd.util.hash_pandas_object(data['id'].astype(str), index=False).to_numpy()
    train_mask = (id_hashes % 10000) < train_split * 10000
    # index=False keeps the DataFrame index from being written as an extra,
    # unnamed column that downstream CSV readers would treat as a feature.
    data.loc[train_mask, :].to_csv(train_path, index=False)
    # The evaluation rows are exactly the complement of the training rows.
    data.loc[~train_mask, :].to_csv(eval_path, index=False)
    
# Split the balanced sample (`final_data`) rather than the raw, imbalanced
# `train` frame; otherwise the balancing step never reaches the model.
train_test_split(final_data)

# Creating Data Generators

In [None]:
def generate_train_dataset(train_batch_size, epochs):
    """Build a batched dataset that reads ``train_data.csv``.

    Args:
        train_batch_size: Batch size handed to ``make_csv_dataset``.
        epochs: Value handed to ``make_csv_dataset`` as ``num_epochs``.

    Returns:
        The dataset produced by ``tf.data.experimental.make_csv_dataset`` with
        the ``label`` column used as the label.
    """
    csv_files = ['train_data.csv']
    dataset = tf.data.experimental.make_csv_dataset(
        file_pattern=csv_files,
        batch_size=train_batch_size,
        label_name='label',
        num_epochs=epochs,
    )
    return dataset
    
def generate_eval_dataset(eval_batch_size, epochs):
    """Build a batched dataset that reads ``eval_data.csv``.

    Args:
        eval_batch_size: Batch size handed to ``make_csv_dataset``.
        epochs: Value handed to ``make_csv_dataset`` as ``num_epochs``.

    Returns:
        The dataset produced by ``tf.data.experimental.make_csv_dataset`` with
        the ``label`` column used as the label.
    """
    csv_files = ['eval_data.csv']
    dataset = tf.data.experimental.make_csv_dataset(
        file_pattern=csv_files,
        batch_size=eval_batch_size,
        label_name='label',
        num_epochs=epochs,
    )
    return dataset

# TensorFlow Hub

In the world of NLP, we need to convert our words or sentences into embeddings. Embeddings are numerical representations of words or sentences. We can train our own embeddings, but this requires a lot of textual data and computational resources. TensorFlow provides us with pre-trained embeddings, which can be downloaded from https://tfhub.dev/. We will mostly use these pre-trained embeddings in our code. These embeddings have been trained on a large and varied textual corpus and are provided as open-source resources.

In [None]:
import os
# Set TFHUB_CACHE_DIR to the working directory before `tensorflow_hub` is
# imported and used.
# NOTE(review): presumably this makes TF Hub cache downloaded modules there —
# confirm against the TF Hub caching documentation.
os.environ["TFHUB_CACHE_DIR"] = '../working/'

import tensorflow_hub as hub
# `tensorflow` is already imported in the notebook's first import cell; this
# repeated import just rebinds the same `tf` name.
import tensorflow as tf

These lines of code show how the embedding layer converts text into a numerical format.

In [None]:
# Load the Universal Sentence Encoder module (version 4) from TF Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Run two sample sentences through the encoder to see their numeric form.
sample_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding",
]
embeddings = embed(sample_sentences)

print(embeddings)

# Generating feature columns

In [None]:
def generate_feature_column():
    """Return the model's feature columns.

    The list holds a single TF Hub text-embedding column that reads the
    ``tweet`` feature and uses the Universal Sentence Encoder (version 4)
    module.
    """
    tweet_embedding = hub.text_embedding_column_v2(
        "tweet",
        'https://tfhub.dev/google/universal-sentence-encoder/4',
    )
    return [tweet_embedding]

Code to demonstrate the feature columns and how they preprocess the dataset.

In [None]:
# Take the first batch of 10 rows from the training dataset and keep its
# first element (index 0) as the sample input for the demo.
example_batch = next(
    iter(generate_train_dataset(train_batch_size=10, epochs=1))
)[0]

def demo(feature_column):
    """Print what a ``DenseFeatures`` layer produces for ``example_batch``.

    Args:
        feature_column: Feature column(s) used to build the ``DenseFeatures`` layer.
    """
    dense_layer = tf.keras.layers.DenseFeatures(feature_column)
    transformed = dense_layer(example_batch)
    print(transformed.numpy())

In [None]:
# Pass the feature columns to the demo, which prints the transformed sample batch.
demo(feature_column=generate_feature_column())

# Model Training

In [None]:
# Binary classifier: feature-column layer -> 16-unit hidden layer -> one logit.
model = tf.keras.models.Sequential([
    tf.keras.layers.DenseFeatures(generate_feature_column()),
    tf.keras.layers.Dense(16, activation='relu'),
    # No activation here: the layer emits a raw logit, matching
    # `from_logits=True` in the loss below.
    tf.keras.layers.Dense(1)
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The model outputs logits, so the decision boundary is 0 (sigmoid(0) == 0.5).
    # The "acc" string shortcut would threshold the raw logits at 0.5 instead,
    # so an explicit metric with the same name and the right threshold is used.
    metrics=[tf.keras.metrics.BinaryAccuracy(name="acc", threshold=0.0)],
)

# Keras iterates over the whole dataset once per epoch, so the dataset is built
# for a single pass and `epochs` alone controls how many passes are made
# (repeating the dataset 5 times as well would train for 25 passes).
# `batch_size` is omitted because the dataset is already batched, and `workers`
# is omitted because it only applies to generator / Sequence inputs.
model.fit(generate_train_dataset(10, 1), epochs=5, verbose=1)

# Evaluation

In [None]:
# Score the trained model on one pass over the evaluation file, in batches of 10.
model.evaluate(generate_eval_dataset(eval_batch_size=10, epochs=1))

# Predictions

In [None]:
def generate_test_dataset(eval_batch_size, epochs):
    """Build a batched, repeated dataset from the test CSV.

    Args:
        eval_batch_size: Number of rows per batch.
        epochs: Repeat count applied to the batched dataset.

    Returns:
        A dataset whose elements are dicts built from the CSV's columns.
    """
    test_frame = pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/test.csv')
    columns_by_name = dict(test_frame)
    sliced = tf.data.Dataset.from_tensor_slices(columns_by_name)
    batched = sliced.batch(eval_batch_size)
    return batched.repeat(epochs)

In [None]:
# Run the model over one pass of the test tweets, in batches of 10.
output = model.predict(generate_test_dataset(eval_batch_size=10, epochs=1))

In [None]:
# The model is compiled with `from_logits=True`, so `predict` returns raw
# logits. Convert them to probabilities first so that 0.8 acts as a
# probability cut-off rather than a logit cut-off.
# NOTE: this cell overwrites `output`; re-run the prediction cell before
# re-running this one, otherwise the sigmoid is applied to the 0/1 labels.
probabilities = tf.sigmoid(output).numpy().ravel()
# Vectorized threshold: 1 where the probability exceeds 0.8, otherwise 0.
output = (probabilities > 0.8).astype(int)

In [None]:
# Re-read the test tweets, attach the predicted labels as a `label` column
# and write the result to `predictions.csv`.
data_temp = pd.read_csv(
    '../input/twitter-sentiment-analysis-hatred-speech/test.csv'
).assign(label=pd.Series(output))
data_temp.to_csv(path_or_buf='predictions.csv')