In [None]:
#import all dependencies
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random
from google.colab import drive

In [None]:
#download bert packages
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/35/5c/6439134ecd17b33fe0396fb0b7d6ce3c5a120c42a4516ba0e9a2d6e43b25/bert-for-tf2-0.14.4.tar.gz (40kB)
[K     |████████                        | 10kB 28.7MB/s eta 0:00:01[K     |████████████████▏               | 20kB 2.8MB/s eta 0:00:01[K     |████████████████████████▎       | 30kB 3.8MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 2.7MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [None]:
try:
  %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Input
from keras.layers import Bidirectional, GlobalMaxPool1D, LSTM, GRU, SimpleRNN
from keras.initializers import Constant
from keras.models import Model
import matplotlib.pyplot as plt
import os
import sys

Using TensorFlow backend.


# loading files

In [None]:
#get data from personal google drive folders
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
data = pd.read_csv("/content/drive/My Drive/twitterfakenews/train.csv", engine="python", encoding = "latin1")

In [140]:
test = pd.read_csv("/content/drive/My Drive/twitterfakenews/test.csv", engine="python", encoding = "latin1")

In [None]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
data.drop(["keyword", "location", "id"], axis = 1, inplace = True)

In [None]:
test.drop(["keyword", "location"], axis = 1, inplace = True)

In [None]:
data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


# Cleaning tweets

In [None]:
#clean tweet
def clean_tweet(tweet):
  tweet = BeautifulSoup(tweet, "lxml").get_text()
  tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet) #remove mentions
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
  tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
  tweet = re.sub(r" +", ' ', tweet)
  tweet = tweet.lower()
  return tweet

In [None]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [None]:
data_labels = data.target.values #get in numpy format

In [None]:
test_clean = [clean_tweet(tweet) for tweet in test.text]

# Tokenization

In [None]:
Fulltokenizer = bert.bert_tokenization.FullTokenizer
#bert layer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/1", trainable = False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = Fulltokenizer(vocab_file, do_lower_case)

In [None]:
tokenizer.tokenize("Samantha loves final fantasy")

['[UNK]', 'love', '##s', 'final', 'fantasy']

In [None]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Samantha loves final fantasy"))

[100, 8451, 8118, 10591, 12436]

In [None]:
def encode_sentence(sent):
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [None]:
data_inputs = [encode_sentence(sent) for sent in data_clean]

# Dataset creation

In [None]:
#padding tokens per batch all the sentences do not need to have the same length
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
#random.shuffle(data_with_len) #not always necessary
data_with_len.sort(key = lambda x: x[2]) #sort based on length get access to the third element
sorted_all = [(sent_lab[0], sent_lab[1])
              for sent_lab in data_with_len] #remove short sentences

In [None]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all, output_types=(tf.int32, tf.int32))

In [None]:
all_test = tf.data.Dataset.from_generator(lambda: test_inputs, output_types=(tf.int32, tf.int32))

In [None]:
next(iter(all_dataset))

(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([10353,  9558], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [None]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [None]:
NB_BATCHES =math.ceil(len(sorted_all)/BATCH_SIZE)

In [None]:
NB_BATCHES

238

In [None]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 4), dtype=int32, numpy=
 array([[10353,  9558,     0,     0],
        [ 8811, 10841,     0,     0],
        [ 8413, 10447,     0,     0],
        [12139,  8542,     0,     0],
        [10451,  8180,     0,     0],
        [ 9193,  8451,     0,     0],
        [ 9978,  8310, 11338,     0],
        [ 8792,  8902,  8798,     0],
        [ 8174,  9931,   106,     0],
        [10100,  8613,  9069,     0],
        [10100,  8613,  9069,     0],
        [ 9867, 13214,  8118,     0],
        [13158,  8197,   163,     0],
        [10951, 12670,  8303,     0],
        [ 8363, 11486,  8118,     0],
        [ 8363, 11486,  8118,     0],
        [10092,  8179, 11229,     0],
        [12289,  9917,  8863,     0],
        [12289,  9917,  8863,     0],
        [12289,  9917,  8863,     0],
        [12605,  8458,  8487,     0],
        [  151, 10235, 11229,     0],
        [ 8450,  9238, 11140,     0],
        [11515, 11714,  8221,     0],
        [11515, 11714,  8221,     0],
  

In [None]:
NB_BATCHES_TEST = NB_BATCHES //10

In [None]:
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

# Model Building

In [None]:
class LSTM(tf.keras.Model):
  
  def __init__(self, vocab_size,
               emb_dim = 128,
               LSTM_units = 50,
               dropout_rate = 0.1,
               training = False,
               name = "lstm"):
    super(LSTM, self).__init__(name=name)

    self.embedding = layers.Embedding(vocab_size, emb_dim)
    self.dense_1 = layers.Bidirectional(layers.LSTM(LSTM_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))
    self.pool = layers.GlobalAveragePooling1D()
    self.dropout = layers.Dropout(rate =dropout_rate)
    self.last_dense = layers.Dense(units =1, activation= "sigmoid")

  def call(self, inputs, training):
    x = self.embedding(inputs)
    x_1 = self.dense_1(x)
    x_2 = self.pool(x_1)
    x_3 = self.dropout(x_2)
    output = self.last_dense(x_3)

    return output


In [None]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
LSTM_UNITS = 50
NB_CLASSES = 2
DROPOUT_RATE = 0.1
NB_EPOCHS = 10

In [None]:
lstm = LSTM(vocab_size= VOCAB_SIZE,
            emb_dim = EMB_DIM,
            LSTM_units = LSTM_UNITS,
            dropout_rate = DROPOUT_RATE)













In [None]:
Dcnn.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])


# Results

In [None]:
lstm.fit(train_dataset, epochs = NB_EPOCHS, validation_split = .1)

In [None]:
def get_prediction(sent):
  tokens = encode_sentence(sent)
  inputs = tf.expand_dims(tokens, 0)

  output = Dcnn(inputs, training = False)

  target = math.floor(output*2)
  return output

In [123]:
range(len(preds))

range(0, 3263)

In [None]:
preds = [get_prediction(sent) for sent in test_clean]

In [114]:
y_preds = []
for i in range(len(preds)): 
  y_hat = preds[i].numpy()
  y_preds.append(y_hat)

[<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.9999869]], dtype=float32)>]

In [133]:
y_hat = np.stack(y_preds, axis=0)

In [137]:
y_pred = np.matrix(y_hat)

In [138]:
pred = pd.DataFrame(y_pred)

In [144]:
pred = pred.T

In [145]:
ids = pd.DataFrame(test[['id']])
df = pd.concat([ids, pred], axis = 1)
df.columns = ['id', 'target']
df.set_index('id', inplace=True)

In [146]:
df['target'] = df['target'].apply(lambda x: 0 if x < .5 else 1)

In [148]:
df.head

<bound method NDFrame.head of        target
id           
0           1
2           1
3           1
9           1
11          1
...       ...
10861       1
10865       1
10868       1
10874       1
10875       1

[3263 rows x 1 columns]>

In [147]:
#save
df.to_csv("/content/drive/My Drive/twitterfakenews/mysubmissio_bert.csv")

scores 0.74961 which is not bad