In [1]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import csv
import logging
import torch.nn as nn
from torch import optim
import torch.nn.functional as Func
import matplotlib.pyplot as plt
import scipy
import random
from ipyleaflet import Map, Marker
from geopy.geocoders import Nominatim

In [2]:
# check device available

if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1050 Ti


In [3]:
# load data and divide into training, validation, and testing dataset
file = pd.read_csv('https://raw.githubusercontent.com/khordoo/disaster-watch-classifier/master/datasets/combined.csv')
tweet = []
label = []
cat = []

for i in range(len(file)):
    tweet.append(file.iloc[i][2])
    label.append(file.iloc[i][3])
    cat.append(file.iloc[i][4])
    
labels = np.zeros(len(file))
d_type = np.zeros(len(file))
l = []
for i in range(len(file)):
    if label[i] == 'on-topic':
        labels[i] = 1
    else:
        labels[i] = 0
    if cat[i] in l:
        d_type[i] = l.index(cat[i])
    else:
        l.append(cat[i])
        d_type[i] = l.index(cat[i])
r = random.sample(range(len(tweet)), len(tweet))
tweet_tr = []
label_tr = []
cat_tr = []
tweet_vad = []
label_vad = []
cat_vad = []
tweet_te = []
label_te = []
cat_te = []
for i in range(len(r)):
    if i < 500:
        tweet_vad.append(tweet[r[i]].lower())
        label_vad.append(labels[r[i]])
        cat_vad.append(d_type[r[i]])
    elif i >= 500 and i < round(len(r)*0.17):
        tweet_te.append(tweet[r[i]].lower())
        label_te.append(labels[r[i]])
        cat_te.append(d_type[r[i]])
    else:
        tweet_tr.append(tweet[r[i]].lower())
        label_tr.append(labels[r[i]])
        cat_tr.append(d_type[r[i]])


label_tr = np.array(label_tr)
cat_tr = np.array(cat_tr)
label_vad = np.array(label_vad)
cat_vad = np.array(cat_vad)
label_te = np.array(label_te)
cat_te = np.array(cat_te)
label_tr = label_tr.reshape(len(label_tr),1)
cat_tr_0 = np.zeros(len(cat_tr))
cat_tr_1 = np.zeros(len(cat_tr))
cat_tr_2 = np.zeros(len(cat_tr))
cat_tr_3 = np.zeros(len(cat_tr))
cat_tr_4 = np.zeros(len(cat_tr))
cat_tr_5 = np.zeros(len(cat_tr))
for i in range(len(cat_tr)):
    if cat_tr[i] == 0:
        cat_tr_0[i] = 1  
    elif cat_tr[i] == 1:
        cat_tr_1[i] = 1  
    elif cat_tr[i] == 2:
        cat_tr_2[i] = 1
    elif cat_tr[i] == 3:
        cat_tr_3[i] = 1
    elif cat_tr[i] == 4:
        cat_tr_4[i] = 1
    else:
        cat_tr_5[i] = 1

In [4]:
# build and initilize the model, optimizer

class BERT_CUS(nn.Module):
    def __init__ (self):
        super(BERT_CUS, self).__init__()
        self.l0 = BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        
        self.l1 = nn.Sequential(
            nn.Dropout(0.3),
            #nn.BatchNorm1d(768),
            nn.Linear(768, 6),
            nn.Sigmoid()
        )
        
        self.l2 = nn.Sequential(
            nn.Dropout(0.3),
            #nn.BatchNorm1d(768),
            nn.Linear(768, 1),
            nn.Sigmoid()
        )
        
    def forward(self, input_ids, attention_mask, token_type_ids):
            _, output_0= self.l0(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
            output1 = self.l1(output_0)
            output2 = self.l2(output_0)
            return output1, output2
from transformers import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_fast=False, max_length=191)
model = BERT_CUS()  
for param in model.l0.parameters():
    param.requires_grad = True
model.to(device)
optimizer = torch.optim.Adam(params = model.parameters(), lr=1e-5)

In [5]:
# train the model

B = 5
EPOCH = 3
L = 3
cat_loss = nn.CrossEntropyLoss()
label_loss = nn.BCELoss()

for e in range(EPOCH):
    for i in range(0, len(tweet_tr), B):
        optimizer.zero_grad()
        encoded = tokenizer(tweet_tr[i:i+B], return_tensors='pt', padding= True)
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)
        token_type_ids = encoded['token_type_ids'].to(device)
        cat_pre,label_pre = model(input_ids, attention_mask, token_type_ids)
        l0 = label_loss(cat_pre[:,0], torch.tensor(cat_tr_0[i:i+B]).float().to(device))
        l1 = label_loss(cat_pre[:,1], torch.tensor(cat_tr_1[i:i+B]).float().to(device))
        l2 = label_loss(cat_pre[:,2], torch.tensor(cat_tr_2[i:i+B]).float().to(device))
        l3 = label_loss(cat_pre[:,3], torch.tensor(cat_tr_3[i:i+B]).float().to(device))
        l4 = label_loss(cat_pre[:,4], torch.tensor(cat_tr_4[i:i+B]).float().to(device))
        l5 = label_loss(cat_pre[:,5], torch.tensor(cat_tr_5[i:i+B]).float().to(device))
        ll = label_loss(label_pre, torch.tensor(label_tr[i:i+B]).float().to(device))
        loss = L*(l0 + l1 + l2 + l3 + l4 + l5) + ll
        loss.backward()
        optimizer.step()
        
    acc_cat = 0
    acc_label = 0
        
    for i in range(0, 500, B):
        encoded = tokenizer(tweet_vad[i:i+B], return_tensors='pt', padding= True)
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)
        token_type_ids = encoded['token_type_ids'].to(device)
        cat_pre,label_pre = model(input_ids, attention_mask, token_type_ids)
        for j in range(B):
            if(cat_vad[i+j] == torch.argmax(cat_pre[j])):
                acc_cat += 1/500
            if(label_vad[i+j] == torch.round(label_pre)[j]):
                acc_label += 1/500
    print(e)
    print("cat acc during vad:", acc_cat)
    print("label acc during vad:", acc_label)

0
cat acc during vad: 0.7700000000000006
label acc during vad: 0.9120000000000007
1
cat acc during vad: 0.8000000000000006
label acc during vad: 0.9220000000000007
2
cat acc during vad: 0.7740000000000006
label acc during vad: 0.9240000000000007


In [7]:
    # test the performance on the testing dataset
    
    acc_cat = 0
    acc_label = 0
        
    for i in range(0, len(tweet_te)):
        encoded = tokenizer(tweet_te[i], return_tensors='pt', padding= True)
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)
        token_type_ids = encoded['token_type_ids'].to(device)
        cat_pre,label_pre = model(input_ids, attention_mask, token_type_ids)
        if(cat_te[i] == torch.argmax(cat_pre)):
            acc_cat += 1/len(tweet_te)
        if(label_te[i] == torch.round(label_pre)):
            acc_label += 1/len(tweet_te)
            

In [10]:
print("cat acc during testing:", acc_cat)
print("label acc during testing:", acc_label)

cat acc during testing: 0.8009596587879542
label acc during testing: 0.9082992713701423


In [13]:
TP = 0
TN = 0
FN = 0
FP = 0
mertics = np.zeros((6, 3)) #tp, fp, fn

for i in range(0, len(tweet_te)):
        encoded = tokenizer(tweet_te[i], return_tensors='pt', padding= True)
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)
        token_type_ids = encoded['token_type_ids'].to(device)
        cat_pre,label_pre = model(input_ids, attention_mask, token_type_ids)
        if(cat_te[i] == torch.argmax(cat_pre)):
            mertics[int(cat_te[i])][0] += 1
        else:
            mertics[int(cat_te[i])][1] += 1
            mertics[int(torch.argmax(cat_pre).item())][2] += 1
            
        if(label_te[i] == torch.round(label_pre) and label_te[i] > 0):
            TP += 1
        elif(label_te[i] == torch.round(label_pre) and label_te[i] == 0):
            TN += 1
        elif(label_te[i] != torch.round(label_pre) and label_te[i] > 0):
            FN += 1
        else:
            FP += 1
        

In [17]:
label_precision = TP / (TP + FP)
label_recall = TP / (TP + FN)
cat_0_precision = mertics[0][0] /(mertics[0][0] + mertics[0][1])
cat_0_recall = mertics[0][0] /(mertics[0][0] + mertics[0][2])
cat_1_precision = mertics[1][0] /(mertics[1][0] + mertics[1][1])
cat_1_recall = mertics[1][0] /(mertics[1][0] + mertics[1][2])
cat_2_precision = mertics[2][0] /(mertics[2][0] + mertics[2][1])
cat_2_recall = mertics[2][0] /(mertics[2][0] + mertics[2][2])
cat_3_precision = mertics[3][0] /(mertics[3][0] + mertics[3][1])
cat_3_recall = mertics[3][0] /(mertics[3][0] + mertics[3][2])
cat_4_precision = mertics[4][0] /(mertics[4][0] + mertics[4][1])
cat_4_recall = mertics[4][0] /(mertics[4][0] + mertics[4][2])
cat_5_precision = mertics[5][0] /(mertics[0][0] + mertics[5][1])
cat_5_recall = mertics[5][0] /(mertics[0][0] + mertics[5][2])

print('label of relevance')
print(label_precision)
print(label_recall)
print('floods')
print(cat_0_precision)
print(cat_0_recall)
print('explosion')
print(cat_1_precision)
print(cat_1_recall)
print('earthquake')
print(cat_2_precision)
print(cat_2_recall)
print('hurricane')
print(cat_3_precision)
print(cat_3_recall)
print('tornado')
print(cat_4_precision)
print(cat_4_recall)
print('bombing')
print(cat_5_precision)
print(cat_5_recall)

label of relevance
0.9210169491525424
0.9016094242575079
floods
0.8263221153846154
0.8845287873914442
explosion
0.7223935842072795
0.7564599483204134
earthquake
0.9808350444900753
0.9965229485396384
hurricane
0.7978468899521531
0.7065677966101694
tornado
0.7548711502199874
0.6450053705692803
bombing
0.3504833177424384
0.3703459637561779


In [34]:
# show on GeoMap

def ShowGeoMap(loc, tweet):
    
    encoded = tokenizer(tweet.lower(), return_tensors='pt', padding= True)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    token_type_ids = encoded['token_type_ids'].to(device)
    cat_pre,label_pre = model(input_ids, attention_mask, token_type_ids)
    cat_pred = torch.argmax(cat_pre)
    label_pred = torch.round(label_pre)
    if (label_pred > 0):
        print('Alert, it might be a diseaster')
    else:
        print('It might not be a diseaster')
    print('Probability of related to true diseasters:', str(label_pre.item()*100) +' %')
    print('Diseaster Type:', l[cat_pred])
    locator = Nominatim(user_agent='myGeocoder')
    location = locator.geocode(loc)
    center = (location.latitude, location.longitude)
    m = Map(center=center, zoom=5)
    marker = Marker(location=center, draggable=True)
    m.add_layer(marker);
    display(m)

In [87]:
# news on diseasters for demo
p0 = 'Uttarakhand'
t0 = 'The 2021 Uttarakhand flood began on 7 February 2021 in the environs of the Nanda Devi National Park, a UNESCO World Heritage Site[1] in the outer Garhwal Himalayas in Uttarakhand state'
p1 = 'Starved Rock State Park'
t1 = '3 Dead Following Explosion Near Starved Rock State Park'
p2 = 'Truckee'
t2 = 'A magnitude 4.7 quake, the largest of the three, occurred at 9:35 p.m. PT about 12 miles northwest of Truckee, California'
p3 = 'Florida'
t3 = 'DeSantis: $111M infrastructure awards for communities impacted by Hurricane Michael'
p4 = 'Texas'
t4 = 'Report: Tornado frequency dropping in Texas as ‘tornado alley’ shifts'
p5 = 'Boston'
t5 = 'After twin blasts shook Boston – killing three and wounding more than 260 others – investigators sprung into action looking for those responsible.'
p6 = 'san francisco'
t6 = 'Thousands of Californians received notifications for an earthquake they didnt feel. What happened?'

In [90]:
place = 
input_tweet = 
ShowGeoMap(place, input_tweet)

It might not be a diseaster
Probability of related to true diseasters: 0.6484932266175747 %
Diseaster Type: explosion


Map(center=[42.2625621, -71.8018877], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_titl…

In [91]:
# news on diseasters for demo
p0 = 'Uttarakhand'
t0 = 'The 2021 Uttarakhand flood began on 7 February 2021 in the environs of the Nanda Devi National Park, a UNESCO World Heritage Site[1] in the outer Garhwal Himalayas in Uttarakhand state'
p1 = 'Starved Rock State Park'
t1 = '3 Dead Following Explosion Near Starved Rock State Park'
p2 = 'Truckee'
t2 = 'A magnitude 4.7 quake, the largest of the three, occurred at 9:35 p.m. PT about 12 miles northwest of Truckee, California'
p3 = 'Florida'
t3 = 'DeSantis: $111M infrastructure awards for communities impacted by Hurricane Michael'
p4 = 'Texas'
t4 = 'Report: Tornado frequency dropping in Texas as ‘tornado alley’ shifts'
p5 = 'Boston'
t5 = 'After twin blasts shook Boston – killing three and wounding more than 260 others – investigators sprung into action looking for those responsible.'
p6 = 'san francisco'
t6 = 'Thousands of Californians received notifications for an earthquake they didnt feel. What happened?'

In [104]:
place = 'worcester'
input_tweet = 'I feel happy and great today!!!Yeah!!!'
ShowGeoMap(place, input_tweet)

It might not be a diseaster
Probability of related to true diseasters: 0.3091950202360749 %
Diseaster Type: explosion


Map(center=[42.2625621, -71.8018877], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_titl…