In [0]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup  #to scrape all the XML etc from twitter

from google.colab import drive

In [26]:
try:
  %tensorflow_version 2.x  #force tf2 to be used
except Exception:
  pass
import tensorflow as tf

from tensorflow.keras import layers  
import tensorflow_datasets as tfds  #We can make use of a tokenizer that is available

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x  #force tf2 to be used`. This will be interpreted as: `2.x`.


TensorFlow is already loaded. Please restart the runtime to change versions.


In [27]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
os.chdir('/content/drive/My Drive/trainingandtestdata')

In [29]:
os.getcwd()

'/content/drive/My Drive/trainingandtestdata'

In [0]:
columns = ["sentiment", "id", "date", "query", "user", "text"]

train_data = pd.read_csv(
      "train.csv",
      header = None,
      names = columns,
      engine = "python",
      encoding = "latin1"
)


test_data = pd.read_csv(
      "test.csv",
      header = None,
      names = columns,
      engine = "python",
      encoding = "latin1"
)

In [31]:
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [32]:
train_data.tail()

Unnamed: 0,sentiment,id,date,query,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [0]:
data = train_data

In [0]:
data.drop(["id", "date", "query", "user"],
          axis = 1,
          inplace = True)

In [0]:
def clean_tweet(tweet):
  tweet = BeautifulSoup(tweet, "lxml").get_text() #Get the text from the xml form it is imported in
  tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet) # Removing the @
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet) # Removing the URL links
  tweet = re.sub(r"^[A-Za-z.!?']", ' ', tweet) # Keeping only letters
  tweet = re.sub(r" +", ' ', tweet) # Removing additional whitespaces
  return tweet

In [51]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 17330851999792843182, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 14968510682498025884
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 13759722370256752491
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 11150726272
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 1663793284795157824
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"]

In [52]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [0]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [0]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

In [0]:
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size = 2**16
)

data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [0]:
#Padding
max_len = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            padding = 'post',
                                                            value = 0,
                                                            maxlen = max_len)

In [0]:
test_idx = np.random.randint(0, 800000, 8000)
test_idx = np.concatenate((test_idx, test_idx + 800000))

In [0]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis = 0)
train_labels = np.delete(data_labels, test_idx)

In [0]:
class DCNN(tf.keras.Model):

  def __init__(self,
               vocab_size,
               emb_dim = 128,
               nb_filters = 50,
               FFN_units = 512,
               nb_classes = 2,
               dropout_rate = 0.1,
               training = False,
               name = "dcnn"):
    super(DCNN, self).__init__(name = name)

    self.embedding = layers.Embedding(vocab_size, emb_dim)

    self.bigram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 2,
                                padding = "valid",
                                activation = 'relu')
    
    self.trigram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 3,
                                padding = "valid",
                                activation = 'relu')

    self.fourgram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 4,
                                padding = "valid",
                                activation = 'relu')     

    self.pool = layers.GlobalMaxPool1D()

    self.dense_1 = layers.Dense(units = FFN_units, activation = 'relu')
    self.dropout = layers.Dropout(rate = dropout_rate)
    if nb_classes == 2:
      self.last_dense = layers.Dense(units = 1,
                                     activation = 'sigmoid')
    else:
      self.last_dense = layers.Dense(units = nb_classes,
                                     activation = 'softmax')
      

  def call(self, inputs, training):
    x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool(x_2)
    x_3 = self.fourgram(x)
    x_3 = self.pool(x_3)        #Global max pooling leaves back (1, nb_filters), and there are batch_size number of inputs one by one in dim[0]

    merged = tf.concat([x_1, x_2, x_3], axis = -1) #Along the last axis, dim = (batch_size, 3 * nb_filters)
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training)
    output = self.last_dense(merged)

    return output

In [0]:
#Configuration of the input model

VOCAB_SIZE = tokenizer.vocab_size 
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5

In [0]:
Dcnn = DCNN(vocab_size = VOCAB_SIZE,
            emb_dim = EMB_DIM,
            nb_filters = NB_FILTERS,
            FFN_units = FFN_UNITS,
            nb_classes = NB_CLASSES,
            dropout_rate = DROPOUT_RATE)

In [0]:
if NB_CLASSES == 2:
  Dcnn.compile(loss = 'binary_crossentropy',
               optimizer = 'adam',
               metrics = ['accuracy']
              )
  
else:
  Dcnn.compile(loss = 'sparse_categorical_crossentropy',
               optimizer = 'adam',
               metrics = ['sparse_categorical_accuracy'])  

In [0]:
checkpoint_path = 'trainingandtestdata/ckpt'

ckpt = tf.train.Checkpoint(Dcnn = Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = 5)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Checkpoint restored")

In [55]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [56]:
Dcnn.fit(
    train_inputs,
    train_labels,
    batch_size = BATCH_SIZE,
    epochs = 1
)

ckpt_manager.save()



'trainingandtestdata/ckpt/ckpt-1'

In [57]:
#Evaluation

results = Dcnn.evaluate(test_inputs, test_labels, batch_size = BATCH_SIZE)
print(results)

[0.38312792778015137, 0.8307499885559082]


In [58]:
Dcnn(np.array([tokenizer.encode("I hate you")]), training = False).numpy()

array([[0.46457976]], dtype=float32)

In [61]:
Dcnn(np.array([tokenizer.encode("I am so happy")]), training = False).numpy()

array([[0.94309497]], dtype=float32)

In [63]:
Dcnn(np.array([tokenizer.encode("I am tired")]), training = False).numpy()

array([[0.46457976]], dtype=float32)

In [60]:
Dcnn(np.array([tokenizer.encode("I do not want to go through this again bro")]), training = False).numpy()

array([[0.017404]], dtype=float32)

In [62]:
tokenizer.encode("bad")

[510]