In [3]:
import string
import pandas as pd
import nltk
nltk.download('stopwords',quiet=True)
from nltk.corpus import stopwords
nltk.download('punkt',quiet=True)
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from transformers import AutoTokenizer,TFBertModel
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

In [4]:
#Model Training
import os
import tensorflow as tf
tf.config.experimental.list_physical_devices('GPU')
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, GlobalMaxPooling1D, Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Flatten, BatchNormalization, Input
from sklearn import metrics
import seaborn as sns

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
# Load the training and test tweet CSVs from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('twittertrain.csv', 'twittertest.csv')
)

In [6]:
# Preview the first five test rows (head() defaults to n=5).
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# Preview the first five training rows (head() defaults to n=5).
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Missing keywords become empty strings so they can be joined as text later.
df_train['keyword'] = df_train['keyword'].fillna(value='')

In [9]:
# Concatenate every keyword into one space-separated string.
long_string = ' '.join(df_train['keyword'].tolist())

In [10]:
# Preprocessing: look at the training rows before the text is cleaned.
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [11]:
# Remove Punctuation
punc_list = list(string.punctuation)
def remove_punctuation(text, punct_list=None):
    """Replace every punctuation mark found in ``text`` with a single space.

    Args:
        text: The string to clean. Missing values (``None`` or a float
            ``NaN``, which is how pandas represents an empty text cell) are
            treated as empty text instead of raising.
        punct_list: Iterable of substrings to blank out. Defaults to every
            character of ``string.punctuation``.

    Returns:
        The cleaned string. Each occurrence of an entry of ``punct_list`` is
        replaced by exactly one space, so adjacent marks produce several
        spaces. ``''`` is returned for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if punct_list is None:
        punct_list = string.punctuation
    # str.replace is already a no-op when the mark is absent, so no
    # membership pre-check is needed.
    for punc in punct_list:
        text = text.replace(punc, ' ')
    return text

In [12]:
# Work on copies so the loaded frames stay untouched, strip punctuation from
# the tweet text, and preview each cleaned frame.
df_train2 = df_train.copy()
df_train2['text'] = df_train2['text'].apply(remove_punctuation, args=(punc_list,))
display(df_train2.head())

df_test2 = df_test.copy()
df_test2['text'] = df_test2['text'].apply(remove_punctuation, args=(punc_list,))
display(df_test2.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this earthquake M...,1
1,4,,,Forest fire near La Ronge Sask Canada,1
2,5,,,All residents asked to shelter in place are ...,1
3,6,,,13 000 people receive wildfires evacuation or...,1
4,7,,,Just got sent this photo from Ruby Alaska as ...,1


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,Heard about earthquake is different cities s...
2,3,,,there is a forest fire at spot pond geese are...
3,9,,,Apocalypse lighting Spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [13]:
# Checkpoint shared by the tokenizer and the encoder.
BERT_CHECKPOINT = 'bert-large-uncased'

# text tokenizer
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)

# bert model
bert = TFBertModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [14]:
# Tokenize the cleaned training tweets with the BERT tokenizer.
x_train = tokenizer(
    text=df_train2['text'].tolist(),
    padding='longest',            # pad every sequence to the longest one
    return_tensors='tf',          # TensorFlow tensors ('pt' would give PyTorch)
    return_attention_mask=True,   # the attention mask is needed
    return_token_type_ids=False,  # token_type_ids are not needed here
    verbose=True,
)
print(x_train['input_ids'].shape, x_train['attention_mask'].shape)

(7613, 55) (7613, 55)


In [15]:
# Pull the token ids and the attention mask out of the tokenizer output.
input_ids, input_mask = x_train['input_ids'], x_train['attention_mask']

In [16]:
# Token ids of the first three tweets.
print(input_ids[:3])

tf.Tensor(
[[  101  2256 15616  2024  1996  3114  1997  2023  8372  2089 16455  9641
   2149  2035   102     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [  101  3224  2543  2379  2474  6902  3351 21871  2243  2710   102     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [  101  2035  3901  2356  2000  7713  1999  2173  2024  2108 19488  2011
   3738  2053  2060 13982  2030  7713  1999  2173  4449  2024  3517   102
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0    

In [17]:
# Attention masks of the first three tweets.
print(input_mask[:3])

tf.Tensor(
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(3, 55), dtype=int32)


In [18]:
# First three cleaned tweets, one per line.
print(*(df_train2['text'][row] for row in range(3)), sep='\n')

Our Deeds are the Reason of this  earthquake May ALLAH Forgive us all
Forest fire near La Ronge Sask  Canada
All residents asked to  shelter in place  are being notified by officers  No other evacuation or shelter in place orders are expected


In [19]:
# Training labels taken from the `target` column, echoed as the cell output.
y_train = df_train2['target'].values
y_train

array([1, 1, 1, ..., 1, 1, 1])

In [20]:
# Build the BERT-based binary classifier.

# The tokenizer pads to the longest tweet, so read the sequence length from the
# tokenized batch instead of hard-coding it; the input layers then always match
# the data passed to fit().
max_len = x_train['input_ids'].shape[1]

# define the input_ids and attention_mask as input layers
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# feed the input layers into the bert model
# (index 0 is the last hidden state, index 1 is the pooler_output)
embeddings = bert(input_ids, attention_mask=input_mask)[1]

# dropout layer to prevent overfitting
# rate=0.1 -- mind the decimal point: Dropout(0, 1) would mean rate=0 with
# noise_shape=1, i.e. no dropout at all
out = tf.keras.layers.Dropout(0.1)(embeddings)

# extra hidden layer to learn
out = Dense(128, activation='relu')(out)

# dropout layer to prevent overfitting
out = tf.keras.layers.Dropout(0.1)(out)

# output layer: one sigmoid unit for the binary target
y = Dense(1, activation='sigmoid')(out)

# define input and output layer for model
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)

# unfreeze the bert model parameters so the model can adapt to this current task
# (set on the layer object itself rather than on a positional index into model.layers)
bert.trainable = True

# model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 55)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  335141888   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                            

In [1]:
# Compile: binary cross-entropy on the model output, tracked with accuracy.
model.compile(
    optimizer=Adam(learning_rate=3e-05),
    loss=BinaryCrossentropy(),
    metrics=[BinaryAccuracy('accuracy')],
)

# Train for two epochs, holding out 20% of the rows for validation.
final = model.fit(
    x={
        'input_ids': x_train['input_ids'],
        'attention_mask': x_train['attention_mask'],
    },
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.2,
    shuffle=True,
)


NameError: ignored

In [21]:
#model.save_weights("bert_based.h5")

In [22]:
# Checkpoint shared by the DistilBERT tokenizer and classifier.
DISTILBERT_CHECKPOINT = "distilbert-base-uncased"

# DistilBERT tokenizer
distilTokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)

# DistilBERT sequence classifier configured with a single label
distilBert = TFDistilBertForSequenceClassification.from_pretrained(
    DISTILBERT_CHECKPOINT,
    num_labels=1,
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [23]:
# Tokenize the cleaned training tweets with the DistilBERT tokenizer.
# Note: this rebinds x_train, replacing the BERT encoding created earlier.
x_train = distilTokenizer(
    text=df_train2['text'].tolist(),
    padding='longest',            # pad every sequence to the longest one
    return_tensors='tf',          # TensorFlow tensors
    return_attention_mask=True,   # the attention mask is needed
    return_token_type_ids=False,  # token_type_ids are not needed here
    verbose=True,
)

print(x_train['input_ids'].shape, x_train['attention_mask'].shape)

(7613, 55) (7613, 55)


In [24]:
# Variable-length int32 input layers for the token ids and the attention mask.
distil_input_ids = Input(name='input_ids', shape=(None,), dtype=tf.int32)
distil_attention_mask = Input(name='attention_mask', shape=(None,), dtype=tf.int32)

# Model inputs, keyed the same way as the tokenizer output.
inputs = dict(input_ids=distil_input_ids, attention_mask=distil_attention_mask)

# Run DistilBERT on the inputs and keep its logits as the model output.
outputs = distilBert(inputs)
logits = outputs.logits

# Wrap everything in a Keras model.
distil_model = tf.keras.Model(inputs=inputs, outputs=logits)

# Model summary.
distil_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 tf_distil_bert_for_sequence_cl  TFSequenceClassifie  66954241   ['attention_mask[0][0]',         
 assification (TFDistilBertForS  rOutput(loss=None,               'input_ids[0][0]']              
 equenceClassification)         logits=(None, 1),                                                 
                                 hidden_states=None                                         

In [25]:
# Compile the model.
# distil_model outputs raw logits (no sigmoid), so:
#   * the loss is told to apply the sigmoid itself (from_logits=True);
#   * accuracy must threshold the logits at 0.0, which corresponds to a
#     probability of 0.5. The default threshold of 0.5 would be applied to the
#     logits directly and misreport accuracy.
distil_model.compile(
    optimizer=Adam(learning_rate=3e-5),
    loss=BinaryCrossentropy(from_logits=True),
    metrics=[BinaryAccuracy('accuracy', threshold=0.0)],
)

# Train for two epochs, holding out 20% of the rows for validation.
distil_final = distil_model.fit(
    x={'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},
    y=y_train,
    validation_split=0.2,
    epochs=2,
    batch_size=32,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


In [26]:
#distil_model.save_weights("distil_model.h5")