# Extreme weather detection

**MASTER THESIS PROJECT**

*Identification and Exploration of Extreme Weather Events From Twitter Data*

**OBS: Should be run in Google Colab to take advantage of GPU**

In this notebook, we load data from different sources. We build classifiers based on our labelled data set from https://crisislex.org/data-collections.html#CrisisLexT6 and with the use cases:

1. Alberta floods https://en.wikipedia.org/wiki/2013_Alberta_floods 
2. Queensland floods https://en.wikipedia.org/wiki/Cyclone_Oswald

The classifiers are built in three different categories of machine learning: classic ML algorithms, deep learning and transfer learning. This is followed by the design of a localisation algorithm that assigns a location to each tweet so that the tweets can be mapped. 

The performance for each classifier is evaluated and then used for labelling tweets from the unlabelled data set collected by de Bruijn et al. and used to build the Global Flood Monitor (https://www.globalfloodmonitor.org/). We filtered out a subset of English tweets in the period 2016-2018.

**Pipeline:**

1. Data collection 
2. Data pre-processing

Labelled data
3. Build classifiers (labelled)
4. Build localisation algorithm 
5. Build Visualisation

Unlabelled data
6. Classify tweets - use classifiers
7. Localise tweets - use algorithm
8. Visualise - explore and evaluate

In [None]:
from __future__ import division, print_function
import pandas as pd
import numpy as np
import os 
import json
import math
import random
import string 
import itertools
import requests
import re
import ast 
from ast import literal_eval
import urllib.request
from urllib.request import urlopen
import pathlib
import textwrap
import copy
import calendar
import datetime as dt
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

#!pip install fastai==1.0.61
from fastai.text import *

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk import word_tokenize
import collections
import spacy
#nltk.download('stopwords')
#nltk.download('wordnet')
#!python -m spacy download en_core_web_lg
#from gensim import models

# scikit learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.spatial import distance
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import haversine_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# keras
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding, SimpleRNN
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.backend import clear_session
from keras.models import model_from_json
from keras_preprocessing.text import tokenizer_from_json

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import color_palette
import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff

# mapping
#!pip install geograpy3
#import geograpy
import folium
from folium import FeatureGroup, LayerControl, plugins, Map, Marker
from folium.plugins import FastMarkerCluster, MarkerCluster
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# dash
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State
import dash_dangerously_set_inner_html
from jupyter_dash import JupyterDash

import warnings
warnings.filterwarnings("ignore")

# Data load and pre-processing

## Data clean functions

First load data, then run

- get user info
- get tokens
- add vars

In [None]:
# get user dataframe

def clean_ascii(text):
  """Return `text` with every character outside the 7-bit ASCII range dropped."""
  ascii_chars = filter(lambda char: ord(char) < 128, text)
  return ''.join(ascii_chars)


def get_userinfo(df_join1):
    """Attach flattened user fields to each tweet and keep English tweets only.

    Parameters
    ----------
    df_join1 : pandas.DataFrame
        Raw tweets with at least the columns ``id``, ``user`` (a dict per row),
        ``lang`` and ``full_text``.

    Returns
    -------
    pandas.DataFrame
        One row per unique ``tweet_id`` with ``lang == 'en'``, the extra columns
        ``user_id``, ``user_name``, ``user_realname``, ``user_location`` and
        ``text_clas`` (ASCII-only text with the stand-alone retweet marker
        ``RT`` removed). Empty strings are replaced by NaN.
    """
    df_join1 = df_join1.rename(columns={'id': 'tweet_id'})

    # Iterate over the values instead of indexing by label so the function
    # also works when the incoming frame does not have a 0..n-1 index.
    users = list(df_join1['user'])
    df_join1['user_id'] = [user['id'] for user in users]
    df_users = pd.DataFrame(users)

    cols = ['id', 'id_str', 'name', 'screen_name', 'location', 'description', 'url',
            'protected', 'followers_count', 'friends_count',
            'listed_count', 'created_at', 'favourites_count', 'utc_offset',
            'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang']

    df_users1 = df_users[cols].drop_duplicates().reset_index(drop=True)
    df_users1 = df_users1.rename(columns={'id': 'user_id',
                                          'location': 'user_location',
                                          'screen_name': 'user_name',
                                          'name': 'user_realname'})

    # merge user data with original data
    user_cols = ['user_id', 'user_name', 'user_realname', 'user_location']
    df_join2 = pd.merge(df_users1[user_cols], df_join1, how='inner', on='user_id').reset_index(drop=True)
    # np.nan (lower case) is the spelling that exists in every NumPy version
    df_join2 = df_join2.replace(r'', np.nan)
    df_join2 = df_join2.drop_duplicates(subset='tweet_id')

    # get only english tweets
    df_join2 = df_join2[df_join2['lang'] == 'en'].reset_index(drop=True)

    # Remove non-ASCII characters and the retweet marker. The word boundaries
    # keep "RT" inside other words intact (a plain substring replace turns
    # e.g. "ALBERTA" into "ALBEA" and "ALERT" into "ALE").
    df_join2['text_clas'] = (
        df_join2['full_text']
        .apply(clean_ascii)
        .str.replace(r'\bRT\b', '', regex=True)
    )

    return df_join2

In [None]:
# get tokens of full text

def get_tokens(df_join):
    """Add a ``tokens`` column with cleaned word tokens of ``text_clas``.

    Each text is stripped of punctuation, split on whitespace, lower-cased,
    filtered against the NLTK English stop words, lemmatized with WordNet and
    finally stripped of every token that contains a digit.

    Parameters
    ----------
    df_join : pandas.DataFrame
        Frame with a ``text_clas`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new ``tokens`` column. Rows whose text is
        empty or not a string keep the empty-string placeholder ``""``.
    """
    # Loop-invariant helpers are built once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words("english"))  # set gives O(1) membership tests
    lemmatizer = WordNetLemmatizer()

    all_tokens = []
    for content in df_join['text_clas']:
        if not isinstance(content, str) or not content:
            all_tokens.append("")
            continue

        words = [word.lower() for word in content.translate(punctuation_table).split()]
        words = [word for word in words if word not in stop_words]
        words = [lemmatizer.lemmatize(word) for word in words]
        # remove words with numbers
        words = [word for word in words if not any(char.isdigit() for char in word)]
        all_tokens.append(words)

    # Assign the whole column at once: the chained form df['tokens'][i] = ...
    # triggers SettingWithCopyWarning and is a no-op under pandas Copy-on-Write.
    df_join['tokens'] = pd.Series(all_tokens, index=df_join.index, dtype=object)

    return df_join

In [None]:
def add_vars(df):
    """Derive hashtag, date, link and retweet variables for every tweet.

    Tweets whose cleaned token string is identical are treated as one retweet
    group; the earliest tweet of a group (by ``created_at``) is its original.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``full_text``, ``created_at``, ``user_name``,
        ``tweet_id``, ``tokens``, ``text_clas``, ``retweeted`` and
        ``retweet_count``. The frame is modified in place up to and including
        the ``text_clean`` column (as before); use the returned frame.

    Returns
    -------
    pandas.DataFrame
        Sorted by cleaned text, with the added columns ``hashtags``, ``date``,
        ``tweet_link``, ``tweet`` (original text), ``retweet_count`` (size of
        the duplicate group, 0 for unique tweets), ``original_tweet_id``,
        ``retweeted``, ``first_tweet`` and ``type`` ('Retweet' / 'Tweet').
        ``text_clean`` and ``text_clas`` are renamed to ``text_DL`` and
        ``text_TL``; ``full_text`` is wrapped at 50 characters with ``<br>``.
    """
    # add hashtags variable
    df['hashtags'] = [re.findall(r"(#\w+)", text) for text in df['full_text']]

    # add date variable (calendar day as a timezone-naive datetime)
    df['date'] = pd.to_datetime(pd.to_datetime(df['created_at']).dt.date)

    # add link
    df['tweet_link'] = [
        f'https://twitter.com/{user_name}/status/{int(tweet_id)}'
        for user_name, tweet_id in zip(df['user_name'], df['tweet_id'])
    ]

    # keep original tweet text
    df['tweet'] = df['full_text']

    # wrap full text; whole-column assignment instead of a chained
    # df['full_text'][i] = ... which is unreliable in pandas
    wrapper = textwrap.TextWrapper(width=50)
    df['full_text'] = ["<br>".join(wrapper.wrap(text=text)) for text in df['full_text']]

    ##### retweet variables #####

    # drop 'wrong' retweeted variables; they are recomputed from duplicates below
    df.drop(columns=['retweeted', 'retweet_count'], inplace=True)

    # get clean text by joining tokens to string
    df['text_clean'] = [' '.join(tokens) for tokens in df['tokens']]
    df = df.sort_values(by=['text_clean'], kind='mergesort').reset_index(drop=True)

    # size of each group of identical cleaned texts; unique texts count as 0
    group_size = df.groupby('text_clean')['tweet_id'].transform('size')
    is_retweeted = group_size > 1
    df['duplicates'] = group_size.where(is_retweeted, 0).astype(float)

    # The earliest tweet of every group is the original. Computing this with
    # group operations keeps the ids as integers; the previous merge/fillna
    # route cast them to float, which cannot represent 18-digit tweet ids.
    by_time = df.sort_values(by=['text_clean', 'created_at'], kind='mergesort')
    df['original_tweet_id'] = by_time.groupby('text_clean')['tweet_id'].transform('first')
    df['retweeted'] = is_retweeted
    # first row of each group in time order; unique tweets are their own first tweet
    df['first_tweet'] = ~by_time.duplicated(subset='text_clean', keep='first')

    # rename vars
    df = df.rename(columns={'duplicates': 'retweet_count',
                            'text_clean': 'text_DL',
                            'text_clas': 'text_TL'})

    # add type variable
    df['type'] = np.where(df['retweeted'], 'Retweet', 'Tweet')

    # For non-retweets strip the retweet marker from the displayed text. The
    # word boundary prevents mangling words that merely end in "RT"
    # (e.g. "ALERT now" used to become "ALEnow").
    plain = ~df['retweeted']
    df.loc[plain, 'tweet'] = (
        df.loc[plain, 'tweet'].str.replace(r'\bRT ', '', regex=True).str.strip()
    )

    return df

## Labelled tweets


Adding variables
- hashtags
- retweet_count (= duplicates)
- retweeted (if duplicates exist, the tweet is marked retweeted=True)
- first tweet (if the first tweet in the group of duplicates OR if NOT retweeted, then TRUE)
- original_tweet_id (for retweets this refers to first tweet, else it is just a copy of tweet id)

Load data

In [None]:
# flooding use cases
floods = ['2013_Alberta_floods','2013_Queensland_floods']

# Load the JSON-lines file with tweets for each flood. The frames are collected
# first and concatenated once: concatenating onto an initially empty DataFrame
# re-copies the data on every iteration and is deprecated behaviour in recent
# pandas versions.
flood_frames = []

for flood in floods:

    df1 = pd.read_json(f'Labelled/{flood}_ids.json',lines=True)
    flood_frames.append(df1)

df = pd.concat(flood_frames).reset_index(drop=True)

print(f'Number of tweets: {len(df)}')

# add userinfo, tokens, retweet variables etc.
df2 = get_userinfo(df)
df3 = get_tokens(df2)
df4 = add_vars(df3)
df4.head()

In [None]:
# add label
# Load the csv files with labels. The tweet id is stored quoted inside the
# 'tweet id' column, so it is extracted from between the single quotes.
label_frames = []

for flood in floods:

    df1 = pd.read_csv(f'Labelled/{flood}.csv')
    df1['tweet_id'] = [ int(t[1]) for t in df1['tweet id'].str.split("'")]
    label_frames.append(df1)

# single concat instead of growing an initially empty DataFrame in the loop
df_org = pd.concat(label_frames)

# drop the first two csv columns, keep the label and the parsed tweet_id
df_org = df_org[df_org.columns[2:]]
df_org = df_org.rename(columns={' label': 'relevant'})
df_org = df_org.replace('off-topic',0).replace('on-topic',1)
df_org = df_org[df_org['tweet_id'].isin(df4['tweet_id'])]

# merge data sets
df_join1 = pd.merge(df4,df_org,on='tweet_id',how='inner')
# iterate over the values rather than df_join1['user'][i], which relies on
# the index being exactly 0..n-1
df_join1['user_id'] = [user['id'] for user in df_join1['user']]
df_join1 = df_join1.drop_duplicates(subset=['tweet_id']).reset_index(drop=True)

df = df_join1.copy()
df.head(3)

# export to csv
#df_join1.to_json('Alberta_Queensland_floods.json',orient='split')

In [None]:
# Examples: relevant tweets that mention 'flood', with their links
flood_examples = df[(df['relevant']==1) & (df['tweet'].str.contains('flood'))]
for i in range(1):
    print(i)
    example = flood_examples.iloc[i]
    print(example['tweet'])
    print(example['tweet_link'])

Prepare for classification



In [None]:
# get only first tweets - remove retweets

df_clas = df[df['first_tweet']==True].reset_index(drop=True)
df_clas = df_clas[['tweet_id','text_TL','text_DL','tokens','relevant']]

# 1. Original tweets
df_clas1 = df_clas.copy()

# 2. replace chosen words
# The terms are removed as substrings on purpose-compatible terms with the
# original cell (e.g. 'yycflood' becomes 'flood'). `regex` is passed
# explicitly because its default differs between pandas versions, and
# `flags` is only accepted together with regex=True in pandas >= 2.0.
df_clas2 = df_clas.copy()
locs = ['yyc','ab','queensland','calgary','alberta','australia','canada','qld','nsw','edmonton','brisbane','bigwet']
for loc in locs:
  df_clas2['text_DL']= df_clas2['text_DL'].str.replace(loc,'',regex=False)
  df_clas2['text_TL']= df_clas2['text_TL'].str.replace(re.escape(loc),'',flags=re.IGNORECASE,regex=True)

df_clas2['tokens'] = df_clas2['text_DL'].str.split()

# 3. replace with place
df_clas3 = df_clas.copy()

# a token must occur in more than 0.5% of the tweets to be considered
V = 0.005*len(df_clas3)
nlp =  spacy.load('en_core_web_lg')

ids = list(df_clas3['tokens'])

wordlist = list(itertools.chain.from_iterable(ids))
fd = FreqDist(wordlist)
# keep frequent tokens only, ordered by ascending count ...
fd = {k: v for k, v in sorted(fd.items(), key=lambda item: item[1]) if v>V}
# ... and reverse to get the most frequent tokens first
fd_items = list(fd.items())[::-1]
fd_keys = list(fd.keys())[::-1]

# run entity recognition on the frequent tokens and keep the GPE entities
places = nlp(' and '.join(fd_keys)).ents
locs = [ent.text.lower() for ent in places if ent.label_ == 'GPE']

for loc in locs:
  df_clas3['text_DL']= df_clas3['text_DL'].str.replace(loc,'',regex=False)
  df_clas3['text_TL']= df_clas3['text_TL'].str.replace(re.escape(loc),'',flags=re.IGNORECASE,regex=True)

df_clas3['tokens'] = df_clas3['text_DL'].str.split()


Define function to calculate performance metrics

In [None]:
# performance

def performance(y_test,y_pred):
    """Print confusion matrix, accuracy, precision, recall and F1 for 0/1 labels.

    Parameters
    ----------
    y_test : array-like
        True labels (0 = non-relevant, 1 = relevant).
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    numpy.ndarray
        The 2x2 confusion matrix ``[[TN, FP], [FN, TP]]``.
    """
    # Fixing the label order guarantees a 2x2 matrix even when only one class
    # occurs in the data; otherwise the indexing below raises IndexError.
    conf_mat = confusion_matrix(y_test, y_pred, labels=[0, 1])

    print('\nConfusion matrix')
    print(conf_mat)
    TN, FP, FN, TP = (int(value) for value in conf_mat.ravel())

    def ratio(numerator, denominator):
        # an undefined metric (empty denominator) is reported as 0.0 instead of NaN
        return numerator / denominator if denominator else 0.0

    acc = ratio(TP + TN, TP + TN + FP + FN)
    prec = ratio(TP, TP + FP)
    rec = ratio(TP, TP + FN)
    F1 = ratio(2 * rec * prec, rec + prec)

    print('Accuracy:',(acc))
    print('Precision: ', prec)
    print('Recall: ', rec)
    print('F1 Score: ', F1)

    return conf_mat

## Exploration

In [None]:
# Summary of the labelled data frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Distribution of retweet_count with a rug plot and a kernel density estimate.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
# the installed version still provides it.
plt.figure(figsize=(10, 5))
sns.distplot(df['retweet_count'], color='#05264c', rug=True, kde=True)
plt.xlabel('retweet_count',fontsize=14)
plt.ylabel('density',fontsize=14)
plt.tight_layout()

In [None]:
# Box plot of retweet_count
plt.figure(figsize=(10, 5))
sns.boxplot(y=df['retweet_count'],color='#05264c')
plt.ylabel('retweet_count',fontsize=14)
plt.tight_layout()

In [None]:
def bar_chart(col,num):
    """Draw a horizontal bar chart of the `num` most frequent values of df[col].

    Reads the module-level data frame `df`; the most frequent value is shown
    at the top of the chart.
    """
    top_counts = df[col].value_counts().head(num)
    top_counts.plot.barh(color='#05264c', figsize=(10, 5))
    # the y axis is flipped so the largest bar comes first
    plt.gca().invert_yaxis()
    plt.title(f'Top {num} {col}s')
    plt.xlabel('count')
    plt.ylabel(col)
    plt.tight_layout()

bar_chart('user_location',20)

In [None]:
# Pairwise scatter plots with KDE curves on the diagonal.
# NOTE(review): 'user_followers_count' is not created by any cell visible in
# this part of the notebook - confirm the column exists in df.
sns.set()
pairplot = sns.pairplot(df[['retweet_count','user_followers_count']], diag_kind='kde', palette='#05264c')
plt.show()

In [None]:
# number of tweets over time
# NOTE(review): 'created_at_month' is not created by any cell visible in this
# part of the notebook - confirm the column exists in df.
month_count = np.unique(df['created_at_month'],return_counts=True)
months = month_count[0]
plt.figure(figsize=(10, 5))
# 13 bin edges give 12 bins, one centred on each month number. The previous
# edges np.arange(1,13) produced only 11 bins, so November and December were
# counted in the same bar.
bins = np.arange(1,14) - 0.5
plt.hist(df['created_at_month'], bins=bins, color='#05264c')
plt.ylabel('count')
plt.xlabel('month')
plt.xticks(np.arange(1,13))
plt.show()

#  Classification 


First look at the distribution of tweets in the relevant/irrelevant


In [None]:
# Class balance of the first-tweet data set: counts, shares and a stacked bar
y = df_clas['relevant']
plt.figure(figsize=(10,4))

n_total = len(y)
n_relevant = sum(y)
n_non_relevant = n_total - n_relevant
share_non_relevant = n_non_relevant/n_total*100

print('Number of tweets:', len(df_clas))
print('Relevant:', n_relevant, 'that is:' , round(n_relevant/n_total*100,2), '%')
print('Non-relevant:', n_non_relevant, 'that is:' ,round(share_non_relevant,2),'%')

# full-width green bar with the non-relevant share drawn on top in red
p1 = plt.barh(1,100,color='lightgreen')
p2 = plt.barh(1,share_non_relevant,color='firebrick')

plt.legend((p1[0], p2[0]), ('Relevant', 'Non-relevant'),loc='upper center')
plt.yticks([1,1.8])
plt.title('Class balance',fontsize=16)
#plt.axis('off')


plt.show()

## Classic algorithms 

Here, we use the variable 'tokens'



In [None]:
# define tfidf
# The preprocessor joins its input with spaces before vectorizing; the cells
# below pass the 'tokens' column (token lists) as documents.
tfidf = TfidfVectorizer(preprocessor=' '.join)

# test-train split
# text_fit uses size[0] as test_size and size[1] as train_size
size = [0.2,0.8]

def text_fit(X, y, model,clf_model,size):
    """Vectorize `X`, train `clf_model` on a train split and report test metrics.

    `size` holds the test fraction at index 0 and the train fraction at
    index 1. Returns the fitted classifier, the vectorized full data set and
    the fitted vectorizer `model`.

    NOTE(review): the vectorizer is fitted on all documents before the split,
    so the test documents contribute to the vocabulary and IDF weights.
    """
    features = model.fit_transform(X)
    split_parts = train_test_split(
        features,
        y,
        random_state=0,
        test_size=size[0],
        train_size=size[1],
    )
    X_train, X_test, y_train, y_test = split_parts

    fitted_clf = clf_model.fit(X_train, y_train)
    test_predictions = fitted_clf.predict(X_test)
    performance(y_test.tolist(), test_predictions)

    return fitted_clf, features, model


def print_words(model,clf_model,out):
    """Print and return the most influential words of a fitted classifier.

    Parameters
    ----------
    model : fitted vectorizer
        Must provide ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    clf_model : fitted classifier
        A linear model exposing ``coef_`` (top/bottom 5 coefficients are
        printed) or a model exposing ``feature_importances_`` (top 10 scores
        are printed).
    out : any
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``Word`` and ``Coefficient`` / ``Score``, sorted by descending
        value and then alphabetically.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name
    # and fall back to the old one for older installations.
    feature_names = getattr(model, 'get_feature_names_out', None) or model.get_feature_names
    w = list(feature_names())

    # Branch on what the estimator offers rather than on identity with the
    # global `clf_log`, which only exists after the logistic regression cell.
    if hasattr(clf_model, 'coef_'):
        coef = clf_model.coef_.tolist()[0]
        STR = 'Coefficient'
        coeff_df = pd.DataFrame({'Word' : w, STR : coef})
        coeff_df = coeff_df.sort_values([STR, 'Word'], ascending=[False, True])

        print('')
        print('-Top 5 relevant-')
        print(coeff_df.head(5).to_string(index=False))
        print('')
        print('-Top 5 non-relevant-')
        print(coeff_df.tail(5).to_string(index=False))

    else:
        coef = clf_model.feature_importances_
        STR = 'Score'
        coeff_df = pd.DataFrame({'Word' : w, STR : coef})
        coeff_df = coeff_df.sort_values([STR, 'Word'], ascending=[False, True])
        print('')
        print('-Top 10 important-')
        print(coeff_df.head(10).to_string(index=False))

    return coeff_df
    



In [None]:
# define for each set 

# token lists (X) and relevance labels (y) for the three text variants
X1, y1 = df_clas1['tokens'], df_clas1['relevant']
X2, y2 = df_clas2['tokens'], df_clas2['relevant']
X3, y3 = df_clas3['tokens'], df_clas3['relevant']

*Logistic Regression*

In [None]:
# standard-library module used for persisting the model; imported here so the
# cell does not depend on another package re-exporting it
import pickle

print('1. Original tweets')
clf_log,X_c_log,tfidf_log = text_fit(X1, y1, tfidf, LogisticRegression(),size)
_ = print_words(tfidf,clf_log,True)

print('\n2. Remove keywords')
clf_log,X_c_log,tfidf_log = text_fit(X2, y2, tfidf, LogisticRegression(),size)
_ = print_words(tfidf,clf_log,True)
# Persist the keyword-free classifier and its vectorizer. The files are opened
# in a with-block so the handles are closed (and flushed) deterministically.
filename = 'LR.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf_log, model_file)
filename = 'LR_tfidf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_log, model_file)


print('\n3. Replace with place')
clf_log,X_c_log,tfidf_log = text_fit(X3, y3, tfidf, LogisticRegression(),size)
_ = print_words(tfidf,clf_log,True)

*Random Forest*

In [None]:
# standard-library module used for persisting the model; imported here so the
# cell does not depend on another package re-exporting it
import pickle

print('1. Original tweets')
clf_RF,X_c_RF,tfidf_RF = text_fit(X1, y1, tfidf, RandomForestClassifier(max_depth=9, random_state=0),size)
_ = print_words(tfidf,clf_RF,True)

print('\n2. Remove keywords')
clf_RF,X_c_RF,tfidf_RF = text_fit(X2, y2, tfidf, RandomForestClassifier(max_depth=9, random_state=0),size)
_ = print_words(tfidf,clf_RF,True)
# Persist the keyword-free classifier and its vectorizer. The files are opened
# in a with-block so the handles are closed (and flushed) deterministically.
filename = 'RF.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf_RF, model_file)
filename = 'RF_tfidf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_RF, model_file)


print('\n3. Replace with place')
clf_RF,X_c_RF,tfidf_RF = text_fit(X3, y3, tfidf, RandomForestClassifier(max_depth=9, random_state=0),size)
_ = print_words(tfidf,clf_RF,True)

## Deep learning

We use the variable 'text_DL'


### Pre-processing



In [None]:
# load google news Word2Vec model 
# NOTE(review): `models` is expected to be gensim's `models` package, but the
# line `from gensim import models` in the import cell is commented out -
# confirm it is enabled before running this cell.
# NOTE(review): the path is a remote URL; presumably the loader can stream it,
# otherwise download the file first - verify.
word2vec_path = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
def split(df_clas):
  """Split `df_clas` 80/20 into train and test frames and print vocabulary stats.

  For each part the total number of tokens, the number of distinct tokens and
  the longest token list (taken from the 'tokens' column) are printed.
  Returns (data_train, data_test).
  """
  data_train, data_test = train_test_split(df_clas, test_size=0.20, random_state=42)

  def describe(heading, frame):
    # print the token statistics of one part of the split
    print(heading)
    words = [word for tokens in frame["tokens"] for word in tokens]
    lengths = [len(tokens) for tokens in frame["tokens"]]
    vocabulary = sorted(set(words))
    print("%s words total, with a vocabulary size of %s" % (len(words), len(vocabulary)))
    print("Max sentence length is %s" % max(lengths))

  describe('Training vocabulary ', data_train)
  describe('\nTesting vocabulary ', data_test)

  return data_train,data_test

In [None]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Return the mean embedding of the tokens in `tokens_list`.

    `vector` maps a word to its k-dimensional embedding. Words missing from
    `vector` contribute a random vector (uniform in [0, 1)) when
    `generate_missing` is true and a zero vector otherwise. An empty token
    list yields a zero vector of length `k`.
    """
    if len(tokens_list) < 1:
        return np.zeros(k)

    # one fallback vector is created per missing word
    fallback = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for word in tokens_list:
        word_vectors.append(vector[word] if word in vector else fallback(k))

    total = np.sum(word_vectors, axis=0)
    return np.divide(total, len(word_vectors))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Return a list with one averaged embedding per row of clean_comments['tokens']."""
    return [
        get_average_word2vec(tokens, vectors, generate_missing=generate_missing)
        for tokens in clean_comments['tokens']
    ]

In [None]:

def final_embeddings(word2vec,data_train,data_test, MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,RANDOM_STATE):
  """Turn the 'text_DL' column into padded index sequences and build the embedding matrix.

  Parameters
  ----------
  word2vec : mapping from word to embedding vector
  data_train, data_test : data frames with the columns 'text_DL' and 'relevant'
  MAX_SEQUENCE_LENGTH : length every sequence is padded / truncated to
  EMBEDDING_DIM : dimension of the word vectors
  RANDOM_STATE : unused; kept so existing calls keep working

  Returns
  -------
  X_train, y_train, X_test, y_test, train_embedding_weights, vocab_size, tokenizer
  """
  # The averaged sentence embeddings that were computed here before were never
  # used, so that (expensive) step has been dropped.

  # Tokenize and transform to integer index; the tokenizer only sees the training texts
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(data_train['text_DL'])

  X_train = tokenizer.texts_to_sequences(data_train['text_DL'])
  X_test = tokenizer.texts_to_sequences(data_test['text_DL'])

  vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
  train_word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(train_word_index))

  # Add padding to ensure all vectors have same dimensionality
  X_train = pad_sequences(X_train, padding='post', maxlen=MAX_SEQUENCE_LENGTH)
  y_train = data_train['relevant']

  X_test = pad_sequences(X_test, padding='post', maxlen=MAX_SEQUENCE_LENGTH)
  y_test = data_test['relevant']

  # Embedding matrix: row i holds the vector of the word with index i.
  # Words without a pre-trained vector get a random vector; row 0 stays zero.
  train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))

  for word,index in train_word_index.items():
      train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)

  print(train_embedding_weights.shape)

  return X_train, y_train, X_test, y_test, train_embedding_weights, vocab_size, tokenizer

In [None]:
def predictions(data_test,model,tokenizer,max_sequence_length=30):
    """Predict 0/1 relevance labels for the 'text_DL' column of `data_test`.

    Parameters
    ----------
    data_test : data frame with a 'text_DL' column
    model : trained Keras model with a single sigmoid output
    tokenizer : the tokenizer that was fitted on the training texts
    max_sequence_length : sequence length used for padding (default 30, the
        value that was hard-coded before)

    Returns
    -------
    list of int
        One label per row; 1 when the predicted probability exceeds 0.5.
    """
    sequences = tokenizer.texts_to_sequences(data_test['text_DL'])

    # Pad at the end, exactly as final_embeddings does for the training data;
    # the default ('pre') would feed the model differently aligned sequences.
    new_data = pad_sequences(sequences, padding='post', maxlen=max_sequence_length)

    # Sequential.predict_classes no longer exists in current Keras releases;
    # thresholding the sigmoid output at 0.5 is what it did for binary models.
    probabilities = model.predict(new_data, batch_size=1024, verbose=1)
    prediction_labels = [int(row[0] > 0.5) for row in probabilities]

    return prediction_labels

In [None]:
def plot_history(history):
    """Plot training and validation accuracy and loss per epoch, side by side.

    `history` is the object returned by `model.fit`; its `history` dict must
    contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    logs = history.history
    panels = [
        (logs['accuracy'], logs['val_accuracy'],
         'Training acc', 'Validation acc', 'Training and validation accuracy'),
        (logs['loss'], logs['val_loss'],
         'Training loss', 'Validation loss', 'Training and validation loss'),
    ]
    epochs = range(1, len(panels[0][0]) + 1)

    plt.figure(figsize=(12, 5))
    for position, (train_values, val_values, train_label, val_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_values, 'b', label=train_label)
        plt.plot(epochs, val_values, 'r', label=val_label)
        plt.title(title)
        plt.legend()

In [None]:
#parameters
# Shared settings for the deep learning models below
MAX_SEQUENCE_LENGTH = 30 # longest text in train set
EMBEDDING_DIM = 300      # dimension of the word vectors / embedding layer
RANDOM_STATE = 42        # passed on to the training helpers
EPOCHS = 3               # number of training epochs
BS = 64                  # batch size

### CNN

In [None]:
# Define CNN architecture
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from keras.regularizers import l2


def train_CNN(df_clas,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,RANDOM_STATE,EPOCS,BS):
  """Train and evaluate a 1D-convolutional relevance classifier.

  Parameters
  ----------
  df_clas : data frame with the columns 'tokens', 'text_DL' and 'relevant'
  MAX_SEQUENCE_LENGTH : padded sequence length
  EMBEDDING_DIM : dimension of the word vectors
  RANDOM_STATE : passed on to final_embeddings
  EPOCS : number of training epochs (the spelling is kept for compatibility)
  BS : batch size

  Returns
  -------
  model, hist, tokenizer
      The trained model, its fit history and the fitted tokenizer.
  """
  data_train, data_test = split(df_clas)
  print('\n')
  X_train, y_train, X_test, y_test, train_embedding_weights,vocab_size,tokenizer = final_embeddings(word2vec,data_train,data_test,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,RANDOM_STATE)

  # Embedding (initialised with the pre-trained weights) -> Conv1D ->
  # global max pooling -> dense layers -> sigmoid output
  model = Sequential()
  model.add(layers.Embedding(vocab_size, EMBEDDING_DIM,weights=[train_embedding_weights], input_length=MAX_SEQUENCE_LENGTH))
  model.add(layers.Conv1D(128, 5, activation='relu',kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
  model.add(layers.GlobalMaxPooling1D())
  model.add(layers.Dense(32, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
  model.add(layers.Dense(10, activation='relu'))
  model.add(layers.Dropout(0.1))
  model.add(layers.Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

  print('\n')
  print(model.summary())

  # Fit model. The epoch count now comes from the EPOCS argument; before, the
  # body read the global EPOCHS and silently ignored the argument.
  num_epochs = EPOCS
  batch_size = BS

  hist = model.fit(X_train,
                  y_train,
                  epochs=num_epochs,
                  validation_split=0.2,
                  shuffle=True,
                  batch_size=batch_size)


  print('\n')
  loss, accuracy = model.evaluate(X_train, y_train, verbose=True)
  print("Training Accuracy: {:.4f}".format(accuracy))
  loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
  print("Testing Accuracy:  {:.4f}".format(accuracy))

  # Sequential.predict_classes no longer exists in current Keras releases;
  # thresholding the sigmoid output at 0.5 is what it did for binary models.
  probabilities = model.predict(X_test, batch_size=1024, verbose=1)
  prediction_labels = [int(row[0] > 0.5) for row in probabilities]

  performance(y_test,prediction_labels)

  return model, hist, tokenizer


In [None]:
# Train the CNN on the original tweets and plot its training history
print('1. Original tweets\n')
CNN1, hist1,tokenizer1 = train_CNN(df_clas1,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,RANDOM_STATE,EPOCHS,BS)
plt.style.use('ggplot')

%matplotlib inline
plot_history(hist1)


In [None]:
# Train the CNN on the tweets with the chosen keywords removed and plot its history
print('2. Remove keywords\n')

CNN2,hist2,tokenizer2 = train_CNN(df_clas2,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,RANDOM_STATE,EPOCHS,BS)
plt.style.use('ggplot')

%matplotlib inline
plot_history(hist2)

In [None]:
# Train the CNN on the tweets with detected place names removed and plot its history
print('3. Replace with place\n')
CNN3, hist3,tokenizer3 = train_CNN(df_clas3,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,RANDOM_STATE,EPOCHS,BS)
plt.style.use('ggplot')

%matplotlib inline
plot_history(hist3)

In [None]:
# save model
# Persist the keyword-free CNN (CNN2): architecture as JSON, tokenizer as JSON
# and the weights as HDF5

# serialize model to JSON
model_json = CNN2.to_json()
with open("CNN.json", "w") as json_file:
    json_file.write(model_json)

# serialize the matching tokenizer so new texts can be encoded the same way
tokenizer_json = tokenizer2.to_json()
with open('CNN_tokenizer.json','w') as f:
    f.write(tokenizer_json)

# serialize weights to HDF5
CNN2.save_weights("CNN.h5")
print("Saved model to disk")


In [None]:
from sklearn.manifold import TSNE

def tsne_plot(model):
    """Create a t-SNE model of the word vectors in `model` and plot it.

    Every word of `model.wv.vocab` is projected to two dimensions and drawn
    as an annotated point.
    """
    labels = list(model.wv.vocab)
    tokens = [model[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    projected = tsne_model.fit_transform(tokens)

    x = [point[0] for point in projected]
    y = [point[1] for point in projected]

    plt.figure(figsize=(16, 16))
    # one scatter call per word, followed by its label
    for x_pos, y_pos, label in zip(x, y, labels):
        plt.scatter(x_pos, y_pos)
        plt.annotate(label,
                     xy=(x_pos, y_pos),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
# Train a Word2Vec model on `corpus` and visualise its vocabulary with t-SNE.
# NOTE(review): `corpus` is not defined in any cell visible here, and `word2vec`
# was assigned the loaded KeyedVectors above rather than a module providing
# `Word2Vec` - confirm where both names are supposed to come from.
model = word2vec.Word2Vec(corpus, size=100, window=20, min_count=500, workers=4)
tsne_plot(model)

##  Transfer learning 

Using variable 'text_TL'

### ULMFiT 

Build with inspiration from the following:

https://towardsdatascience.com/transfer-learning-in-nlp-for-tweet-stance-classification-8ab014da8dde

https://github.com/floleuerer/fastai_ulmfit

In [None]:
def get_data(df_clas):
  """Split 'text_TL' and 'relevant' 80/20 and return (df_train, df_test).

  Both frames have the columns 'relevant' and 'text', in that order.
  """
  test_share, train_share = 0.2, 0.8

  parts = train_test_split(
      df_clas['text_TL'],
      df_clas['relevant'],
      random_state=0,
      test_size=test_share,
      train_size=train_share,
  )
  X_train, X_test, y_train, y_test = parts

  # label column first, text column second
  df_train = y_train.to_frame(name='relevant').assign(text=X_train)
  df_test = y_test.to_frame(name='relevant').assign(text=X_test)

  return df_train, df_test

In [None]:
# define for three different text variables

# train / test frames for the original, keyword-free and place-free texts
(df_train1, df_test1), (df_train2, df_test2), (df_train3, df_test3) = [
    get_data(variant) for variant in (df_clas1, df_clas2, df_clas3)
]

#### Language model fine-tuning 


In [None]:
# Language model data

def langmodel_data(df_train,df_test):
  """Build, show and save the language-model data bunch for the 'text' column.

  `df_train` is used as training set and `df_test` as validation set.
  Returns the data bunch.
  """
  lm_bunch = TextLMDataBunch.from_df(
      '',
      train_df=df_train,
      valid_df=df_test,
      min_freq=1,
      text_cols='text',
  )
  lm_bunch.show_batch()

  # Save the language model data for re-use
  lm_bunch.save()

  return lm_bunch

In [None]:
def langmodel_train(data_lm,enc_name):
    """Fine-tune an AWD_LSTM language model on `data_lm` and save its encoder.

    The learner is trained for one cycle, unfrozen and trained for eight more
    cycles; the encoder is saved under `enc_name`. Returns the learner.
    """
    lm_learner = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)

    # learning-rate range test and its plot
    lm_learner.lr_find(start_lr=1e-8, end_lr=1e2)
    lm_learner.recorder.plot()

    momentum = (0.8, 0.7)
    lm_learner.fit_one_cycle(cyc_len=1, max_lr=1e-2, moms=momentum)

    lm_learner.unfreeze()
    lm_learner.fit_one_cycle(cyc_len=8, max_lr=1e-2, moms=momentum)

    # Save the fine-tuned encoder
    lm_learner.save_encoder(enc_name)

    return lm_learner

#### Classification

In [None]:
# Classifier model data
def classifier_data(df_train,df_test,data_lm):
  """Build and save the classifier data bunch, re-using the vocabulary of `data_lm`.

  `df_train` is used as training set and `df_test` as validation set.
  Returns the data bunch.
  """
  shared_vocab = data_lm.train_ds.vocab
  clas_bunch = TextClasDataBunch.from_df(
      '',
      train_df=df_train,
      valid_df=df_test,
      vocab=shared_vocab,
      min_freq=1,
      bs=32,
  )
  clas_bunch.save()

  return clas_bunch

In [None]:
# Classifier

def classifier_train(data_clas,enc_name):
  """Train an AWD_LSTM text classifier on top of the saved encoder `enc_name`.

  After a first cycle the layer groups are unfrozen step by step
  (freeze_to(-2), freeze_to(-3), unfreeze), with one, one and three further
  cycles. Returns the trained learner.
  """
  learner = text_classifier_learner(data_clas, AWD_LSTM,drop_mult=0.5)
  learner.load_encoder(enc_name)

  # learning-rate range test and its plot
  learner.lr_find(start_lr=1e-8, end_lr=1e2)
  learner.recorder.plot()

  momentum = (0.8, 0.7)

  learner.fit_one_cycle(cyc_len=1, max_lr=1e-2, moms=momentum)
  learner.recorder.plot_losses()

  learner.freeze_to(-2)
  learner.fit_one_cycle(1, slice(1e-4,1e-2), moms=momentum)

  learner.freeze_to(-3)
  learner.fit_one_cycle(1, slice(1e-5,5e-3), moms=momentum)

  learner.unfreeze()
  learner.fit_one_cycle(3, slice(1e-5,1e-2), moms=momentum)

  return learner
  

#### Build models

*Original tweets*

In [None]:
# language learner 
# Build the language-model data for the original tweets, fine-tune the
# language model and save its encoder as 'ft_enc1'
data_lm1 = langmodel_data(df_train1,df_test1)
learn1 = langmodel_train(data_lm1,'ft_enc1')

In [None]:
# classification
# Build the classifier data with the language-model vocabulary and train the
# classifier on top of the saved encoder 'ft_enc1'
data_clas1 = classifier_data(df_train1,df_test1,data_lm1)
classifier1 = classifier_train(data_clas1,'ft_enc1')

In [None]:
# export model
# Write the trained classifier for the original tweets to 'ULMFiT1.pkl'
classifier1.export('ULMFiT1.pkl')


*Remove keywords*

In [None]:
# language learner 
# Same as above for the keyword-free tweets; the encoder is saved as 'ft_enc2'
data_lm2 = langmodel_data(df_train2,df_test2)
learn2 = langmodel_train(data_lm2,'ft_enc2')

In [None]:
# classification
# Train the classifier for the keyword-free tweets on top of encoder 'ft_enc2'
data_clas2 = classifier_data(df_train2,df_test2,data_lm2)
classifier2 = classifier_train(data_clas2,'ft_enc2')

In [None]:
# Write the trained classifier for the keyword-free tweets to 'ULMFiT2.pkl'
classifier2.export('ULMFiT2.pkl')

*Replace places*

In [None]:
# language learner 
# Same as above for the tweets with place names removed; the encoder is saved as 'ft_enc3'
data_lm3 = langmodel_data(df_train3,df_test3)
learn3 = langmodel_train(data_lm3,'ft_enc3')

In [None]:
# classification
# Train the classifier for the place-free tweets on top of encoder 'ft_enc3'
data_clas3 = classifier_data(df_train3,df_test3,data_lm3)
classifier3 = classifier_train(data_clas3,'ft_enc3')

In [None]:
# Write the trained classifier for the place-free tweets to 'ULMFiT3.pkl'
classifier3.export('ULMFiT3.pkl')

#### Performance

In [None]:
# get predictions
# Evaluate classifier1 (original tweets): register the test texts, predict
# class probabilities, take the most probable class and print the metrics
#cl = load_learner('','ULMFiT1.pkl')
classifier1.data.add_test(df_test1['text'])
preds, _ = classifier1.get_preds(ds_type=DatasetType.Test)
targets =df_test1['relevant']
predictions = np.argmax(preds, axis=1)

_ = performance(targets,predictions)

In [None]:
# get predictions
# Evaluate classifier2 (keywords removed) on its own test split
#cl = load_learner('','ULMFiT1.pkl')
classifier2.data.add_test(df_test2['text'])
preds, _ = classifier2.get_preds(ds_type=DatasetType.Test)
targets =df_test2['relevant']
predictions = np.argmax(preds, axis=1)

_ = performance(targets,predictions)

In [None]:
# get predictions
# Evaluate classifier3 (place names removed) on its own test split
#cl = load_learner('','ULMFiT1.pkl')
classifier3.data.add_test(df_test3['text'])
preds, _ = classifier3.get_preds(ds_type=DatasetType.Test)
targets =df_test3['relevant']
predictions = np.argmax(preds, axis=1)

_ = performance(targets,predictions)

# Localisation 



In this section, we look into getting a location for the tweets. The location sources are prioritised in the four levels listed below.

*Note that the “coordinates” attribute is formatted as [LONGITUDE, latitude], while the “geo” attribute is formatted as [latitude, LONGITUDE].

1. Geotagged coordinates
2. Geotagged place
3. Geoparsed from text
4. Registered user location


## Geo (coordinate) attribute  and Place attribute 

- get centers


In [None]:
def get_centers(df_rel):
    """Attach a location type and one representative coordinate to each tweet.

    Tweets that have a ``place`` but no ``geo`` get the place's bounding box
    as their ``geo``. Every tweet with a ``geo`` then receives:

    * ``location_type`` - the ``type`` field of its ``geo`` dict, and
    * ``centers`` - the ``coordinates`` field as-is, or for a ``'Polygon'``
      the mean of the first ring's vertices with the two components reversed.

    Place metadata (``place_type``, ``name``, ``full_name``, ``country``,
    ``bounding_box``) is joined on ``tweet_id``. A new frame is returned.
    """
    # tweets carrying a place object but no exact geo point
    only_place = df_rel['geo'].isna() & df_rel['place'].notna()
    tagged = df_rel[only_place].reset_index(drop=True)
    place_dicts = list(tagged['place'])
    tagged['place_id'] = [p['id'] for p in place_dicts]

    # one metadata row per distinct place
    place_meta = (
        pd.DataFrame(place_dicts)
        .rename(columns={'id': 'place_id'})
        .drop_duplicates(subset=['place_id'])
    )

    # bring the place details back onto the tweets that reference them
    tagged_full = pd.merge(tagged.drop(columns=['place']), place_meta,
                           on='place_id', how='inner')
    wanted = ['tweet_id', 'place_type', 'name',
              'full_name', 'country',
              'bounding_box']
    df_rel = pd.merge(df_rel, tagged_full[wanted],
                      on='tweet_id', how='left').reset_index(drop=True)

    # a missing geo falls back to the bounding box of the place
    df_rel['geo'] = df_rel['geo'].fillna(df_rel['bounding_box'])

    located = df_rel[df_rel['geo'].notna()].reset_index(drop=True)
    located['location_type'] = [g['type'] for g in located['geo']]
    located['coordinates2'] = [g['coordinates'] for g in located['geo']]

    # polygon -> mean of its first ring, reversed; anything else -> coordinates unchanged
    located['centers'] = [
        list(np.average(shape[0], axis=0))[::-1] if kind == 'Polygon' else shape
        for kind, shape in zip(located['location_type'], located['coordinates2'])
    ]

    return pd.merge(df_rel, located[['tweet_id', 'location_type', 'centers']],
                    on='tweet_id', how='left').reset_index(drop=True)

## Geo-parsing 

1) toponym recognition

Use spaCy named-entity recognition

In [None]:
def geoparsing(df_rel):
    """Extract place names (spaCy ``GPE`` entities) from each tweet's text.

    Adds two columns to ``df_rel`` in place and returns it:

    * ``geoparsing`` - array of the unique, lower-cased GPE entity texts found
      in ``df_rel['tweet']``, or ``['NAN']`` when none were found;
    * ``geoparsing_count`` - number of GPE mentions before de-duplication
      (the ``'NAN'`` placeholder counts as 1, as it always did for tweets
      that had entities but no GPE).

    The previous version set the ``['NAN']`` fallback inside the entity loop,
    so tweets without any entity kept an empty string. It also wrote cells
    through chained indexing, which pandas does not guarantee to write back.
    """
    #import en_core_web_sm
    #nlp = en_core_web_sm.load()

    nlp = spacy.load('en_core_web_lg')

    mentions = []
    for i, text in enumerate(df_rel['tweet']):

        # progress indicator
        if (i % 2500) == 0:
            print(i)

        locs = [ent.text.lower() for ent in nlp(text).ents if ent.label_ == 'GPE']
        mentions.append(locs if locs else ['NAN'])

    # build the columns in one go instead of writing cell by cell
    df_rel['geoparsing'] = pd.Series([np.unique(m) for m in mentions],
                                     index=df_rel.index, dtype=object)

    # get count of matches
    df_rel['geoparsing_count'] = [len(m) for m in mentions]

    return df_rel

2) Look up table for coordinates

- user location 
- geoparsed locations


In [None]:
# get unique user locations appearing at least `thres` times

def user_locs(df_rel,thres):
    """Return the frequently used registered user locations.

    Adds ``user_location_lower`` to ``df_rel`` in place: the lower-cased text
    of ``user_location`` before the first comma, or ``''`` when the value is
    not a string (e.g. missing).

    Parameters
    ----------
    df_rel : DataFrame with a ``user_location`` column.
    thres : minimum number of occurrences for a location to be kept.

    Returns
    -------
    Sorted list of the distinct non-empty locations that occur at least
    ``thres`` times.
    """
    # build the whole column at once; per-cell chained assignment is not
    # guaranteed to write back and silently ignored any failure
    df_rel['user_location_lower'] = [
        loc.lower().split(',')[0] if isinstance(loc, str) else ''
        for loc in df_rel['user_location']
    ]

    # get unique user locations mentioned at least `thres` times
    counts = df_rel['user_location_lower'].value_counts()
    frequent = counts[counts >= thres].index

    user_list = sorted(loc for loc in frequent if loc not in ('', 'NAN'))

    return user_list

   

In [None]:
def geo_locs(df_rel,thres):
    """Return the geoparsed place names mentioned at least ``thres`` times.

    All entries of ``df_rel['geoparsing']`` are flattened into one list of
    mentions and counted; the empty string and the ``'NAN'`` placeholder are
    excluded from the result.
    """
    # flatten the per-tweet collections into one list of mentions
    all_mentions = list(itertools.chain.from_iterable(df_rel['geoparsing']))

    # distinct names with their frequencies
    names, freq = np.unique(all_mentions, return_counts=True)
    frequent = names[freq >= thres]

    # drop the placeholders for "nothing found"
    is_real = (frequent != '') & (frequent != 'NAN')

    return list(frequent[is_real])



In [None]:
def lookup_table(user_list,gp_list):
    """Geocode every distinct location name once and return a lookup table.

    Parameters
    ----------
    user_list, gp_list : lists of location names (registered user locations
        and geoparsed locations); they are concatenated and de-duplicated.

    Returns
    -------
    DataFrame with columns ``location`` and ``coordinates``
    (``[latitude, longitude]``). Names that could not be geocoded are dropped.

    The previous version created the rate limiter but queried the geocoder
    directly, and did so twice per name (once for each of latitude and
    longitude). Each name is now looked up once, through the rate limiter.
    """
    # initialize geolocator
    geolocator = Nominatim(user_agent='my_app')
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

    # create look up table for unique locations - combining user and geoparsed
    df_lookup = pd.DataFrame()
    df_lookup['location'] = list(np.unique(user_list+gp_list))

    # get coordinates
    coords = []
    for i, loc in enumerate(df_lookup['location']):

        # progress indicator
        if (i % 100) == 0:
            print(i)

        match = None
        if str(loc) != 'nan':
            try:
                match = geocode(loc)
            except Exception:
                # best effort: a failed lookup just leaves this name unresolved
                match = None

        if match is None:
            coords.append(np.nan)
        else:
            coords.append([match.latitude, match.longitude])

    df_lookup['coordinates'] = pd.Series(coords, index=df_lookup.index, dtype=object)

    # keep only the names that resolved to coordinates
    df_lookup = df_lookup.dropna()

    return df_lookup


In [None]:
def add_coords(df_rel,df_lookup):
    """Join looked-up coordinates onto the tweets.

    Adds two columns:

    * ``gp_coords`` - list of the coordinates of every geoparsed location of
      the tweet that is present in ``df_lookup`` (missing if none is);
    * ``user_coordinates`` - coordinates of ``user_location_lower`` from
      ``df_lookup`` (missing if absent).

    Tweets with none of ``centers``, ``gp_coords`` and ``user_coordinates``
    are dropped. A new frame is returned; ``df_rel`` itself is not modified
    (the temporary ``index`` helper column lives on a copy only).
    """
    # add index to flatten out geoparsing - when more than one location
    df_rel = df_rel.assign(index=df_rel.index)

    # one row per (tweet, location); Series.items() replaces the iteritems()
    # method that was removed in pandas 2.0
    pairs = [[row, name]
             for row, names in df_rel['geoparsing'].items()
             for name in list(names)]
    b_flat = pd.DataFrame(pairs, columns=['I', 'location']).set_index('I')

    df_match = df_rel[['index']].merge(b_flat, left_index=True, right_index=True)

    # look up geoparse locations and join to main data set df_rel
    df_merge1 = pd.merge(df_match, df_lookup, on='location').sort_values(by='index')
    gp = pd.DataFrame(df_merge1.groupby(['index'])['coordinates'].apply(list)).reset_index()
    gp = gp.rename(columns={'coordinates': 'gp_coords'})
    df_rel2 = pd.merge(df_rel, gp, on='index', how='left')

    # get user coordinates using look up table
    df_rel2['location'] = df_rel2['user_location_lower']
    df_rel3 = pd.merge(df_rel2,
                       df_lookup.rename(columns={'coordinates': 'user_coordinates'}),
                       on='location', how='left')
    df_rel3 = df_rel3.drop(columns=['location', 'index'])

    # drop if does not have any coordinates available
    df_rel4 = df_rel3.dropna(subset=['centers', 'gp_coords', 'user_coordinates'],
                             how='all').reset_index(drop=True)

    return df_rel4



*2) toponym resolution*


Geoparse coords
    - if geoparse coords have two locations closer than 1500 km, randomly choose one
    - if geoparse coords have more than two then take the two with the shortest distance; if that is below 1500 km, randomly choose one
    - else just use coords



In [None]:
def _resolve_toponyms(coords, max_km, earth_km=6371.0):
    """Choose one coordinate among a tweet's geocoded place names.

    Returns ``(chosen, distance_km)``. With one candidate it is returned with
    distance 0. With several, the two closest (non-zero distance) candidates
    are compared: if they are less than ``max_km`` apart one of them is picked
    at random, otherwise ``'NAN'`` is returned.
    """
    if len(coords) == 0:
        return '', ''
    if len(coords) == 1:
        return coords[0], 0

    # haversine_distances works on [lat, lon] in radians and returns a
    # distance on the unit sphere; scale by the earth radius to get km
    in_radians = [[radians(c[0]), radians(c[1])] for c in coords]
    mat = np.round(haversine_distances(in_radians, in_radians) * earth_km)

    if len(coords) == 2:
        a, b = 0, 1
    else:
        # ignore zero entries (the diagonal) when searching for the closest pair
        masked = np.where(mat != 0, mat, mat.max() + 1)
        a, b = np.unravel_index(masked.argmin(), mat.shape)

    dist = mat[a, b]
    if dist < max_km:
        chosen = coords[(a, b)[np.random.choice([0, 1])]]
    else:
        chosen = 'NAN'
    return chosen, dist


def final_geocoords(df_rel4, max_km=1500):
    """Resolve each tweet's geoparsed coordinates to a single coordinate.

    Parameters
    ----------
    df_rel4 : DataFrame with ``tweet_id`` and ``gp_coords`` (list of
        ``[lat, lon]`` pairs, or missing).
    max_km : the two closest candidates must be less than this many km apart
        for one of them to be accepted (default 1500).

    Returns
    -------
    ``df_rel4`` joined on ``tweet_id`` with two new columns: ``gp_coords1``
    (the chosen coordinate, or ``'NAN'`` when the candidates are too far
    apart) and ``distance`` (km between the two closest candidates, 0 for a
    single candidate). Both are missing for tweets without ``gp_coords``.
    """
    # define dataframe with only geoparsing locations
    df_gp = df_rel4[df_rel4['gp_coords'].notna()].reset_index(drop=True)

    resolved = [_resolve_toponyms(candidates, max_km) for candidates in df_gp['gp_coords']]

    # assign whole columns; cell-wise chained .iloc writes may not write back
    df_gp['gp_coords1'] = pd.Series([r[0] for r in resolved], index=df_gp.index, dtype=object)
    df_gp['distance'] = pd.Series([r[1] for r in resolved], index=df_gp.index, dtype=object)

    # merge with final dataframe
    df_rel5 = pd.merge(df_rel4, df_gp[['tweet_id', 'gp_coords1', 'distance']],
                       on='tweet_id', how='left').reset_index(drop=True)

    return df_rel5



1. 'centers' if location_type = Point
2. 'centers' if location_type = Polygon
3. geoparse coords
4. user coords

In [None]:
def final_data(df_rel5):
    """Pick the final coordinate of every tweet by source priority.

    Priority (first match wins):

    1. ``centers`` when ``location_type`` is ``'Point'``   -> 'Geotagged coordinates'
    2. ``centers`` when ``location_type`` is ``'Polygon'`` -> 'Geotagged place'
    3. ``gp_coords1``                                      -> 'Geoparsed from Tweet'
    4. ``user_coordinates``                                -> 'Registered user location'

    Side effects on ``df_rel5`` (kept from the original): missing values in
    ``gp_coords1`` / ``user_coordinates`` are replaced by the string ``'NAN'``
    and the columns ``final_coords`` and ``localization`` are added.

    Returns a new frame holding only the tweets for which a coordinate was
    found, with a fresh index.
    """
    def _has_coords(value):
        # 'NAN' (and the empty string) are the placeholders for "no coordinate"
        return not (isinstance(value, str) and value in ('NAN', ''))

    # replace missing values with 'NAN' so every cell can be tested uniformly
    for col in ('gp_coords1', 'user_coordinates'):
        as_object = df_rel5[col].astype(object)
        df_rel5[col] = as_object.where(as_object.notna(), 'NAN')

    coords, sources = [], []
    rows = zip(df_rel5['location_type'], df_rel5['centers'],
               df_rel5['gp_coords1'], df_rel5['user_coordinates'])
    for loc_type, center, gp, user in rows:
        if loc_type == 'Point':
            coords.append(center)
            sources.append('Geotagged coordinates')
        elif loc_type == 'Polygon':
            coords.append(center)
            sources.append('Geotagged place')
        elif _has_coords(gp):
            coords.append(gp)
            sources.append('Geoparsed from Tweet')
        elif _has_coords(user):
            coords.append(user)
            sources.append('Registered user location')
        else:
            coords.append('NAN')
            sources.append('NAN')

    # whole-column assignment instead of chained per-cell .iloc writes
    df_rel5['final_coords'] = pd.Series(coords, index=df_rel5.index, dtype=object)
    df_rel5['localization'] = sources

    # filter on the plain string column; final_coords mixes lists and strings
    df_rel5 = df_rel5[df_rel5['localization'] != 'NAN'].reset_index(drop=True)

    return df_rel5



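Some coordinates are shared by several tweets. So that these tweets remain distinguishable on the map, we add random Gaussian noise (jitter) to every duplicated coordinate.

Each duplicate is drawn from $\mathcal{N}(\mu, \Sigma)$ with $\mu$ = the original coordinates, $\Sigma = \begin{pmatrix} 0.7 & 0 \\ 0 & 0.7 \end{pmatrix}$, and $N$ = the number of duplicates.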

In [None]:

def addnoise_coords(final_df, variance=0.7):
    """Jitter coordinates that are shared by more than one tweet.

    Every tweet whose ``final_coords`` also occurs on another tweet gets a
    new coordinate drawn from a bivariate normal distribution centred on the
    original one, with covariance ``variance * I``. Unique coordinates are
    left untouched.

    Parameters
    ----------
    final_df : DataFrame with a ``final_coords`` column of ``[lat, lon]``.
    variance : variance (in squared degrees) of the noise on each axis.

    Returns
    -------
    A new DataFrame in the input row order with the original coordinates in
    ``org_coords``, the (possibly jittered) ones in ``final_coords`` and the
    separate ``lat`` / ``lon`` columns. Intermediate localisation columns are
    removed when present. ``final_df`` itself is not modified.

    Notes
    -----
    The earlier implementation converted coordinates to strings and parsed
    them back with ``literal_eval``, which fails when the lists hold numpy
    scalars whose repr is not a plain number. Duplicates are now detected on
    the numeric values directly.
    """
    original = list(final_df['final_coords'])

    # duplicates of coordinates, compared as plain (lat, lon) float tuples
    keys = pd.Series([(float(c[0]), float(c[1])) for c in original], dtype=object)
    is_dup = keys.duplicated(keep=False).to_numpy()
    n_dup = int(is_dup.sum())
    share = np.round(n_dup / len(original) * 100, 1) if original else 0.0
    print(f'Number of duplicate coordinates: {n_dup} corresponding to {share} %')

    # randomize coords of the duplicated rows only
    jittered = list(original)
    if n_dup:
        centres = np.array([keys[i] for i in range(len(original)) if is_dup[i]], dtype=float)
        cov = [[variance, 0], [0, variance]]
        noise = np.random.multivariate_normal([0.0, 0.0], cov, n_dup)
        moved = iter((centres + noise).tolist())
        jittered = [next(moved) if dup else coord
                    for coord, dup in zip(original, is_dup)]

    out = final_df.rename(columns={'final_coords': 'org_coords'}).reset_index(drop=True)

    # intermediate columns of the localisation steps; ignore any that are absent
    out = out.drop(columns=['geo', 'coordinates', 'place', 'location_type', 'centers',
                            'geoparsing', 'geoparsing_count', 'user_location_lower',
                            'gp_coords', 'user_coordinates', 'gp_coords1', 'distance',
                            'final_coords_str'],
                   errors='ignore')

    out['final_coords'] = pd.Series(jittered, index=out.index, dtype=object)
    out['lat'] = [c[0] for c in jittered]
    out['lon'] = [c[1] for c in jittered]

    return out


## Final algorithm 

In [None]:
def localization(df_rel,thres):
    """Run the complete localisation pipeline on a frame of tweets.

    Stages, in order: ``get_centers`` -> ``geoparsing`` (the result is also
    written to ``geoparsed.json``) -> look-up table from ``user_locs`` /
    ``geo_locs`` / ``lookup_table`` -> ``add_coords`` -> ``final_geocoords``
    -> ``final_data`` -> ``addnoise_coords``. The run time of each stage is
    printed in minutes.

    ``thres`` is the minimum number of mentions passed to ``user_locs`` and
    ``geo_locs``.

    Returns ``(df_lookup, df_rel5, final_df3)``: the look-up table, the frame
    produced by ``final_geocoords`` (after ``final_data`` has processed it),
    and the final frame restricted to the output columns.
    """
    t_total = time.time()

    # geotag centres and place names found in the text
    df_rel = geoparsing(get_centers(df_rel))
    df_rel.to_json('geoparsed.json',orient='split')

    # look up table
    t_stage = time.time()
    frequent_user_locs = user_locs(df_rel,thres)
    frequent_gp_locs = geo_locs(df_rel,thres)
    df_lookup = lookup_table(frequent_user_locs,frequent_gp_locs)
    print('Lookup table DONE - ',(time.time()-t_stage)/60,' minutes')

    # add coordinates
    t_stage = time.time()
    df_rel5 = final_geocoords(add_coords(df_rel,df_lookup))
    print('Added coords DONE - ',(time.time()-t_stage)/60, ' minutes')

    # get final coordinates
    t_stage = time.time()
    with_noise = addnoise_coords(final_data(df_rel5))
    print('Final coords DONE - ',(time.time()-t_stage)/60, ' minutes')

    # get only relevant columns
    keep_cols = ['tweet_id','created_at','date','user_location','user_name','source','retweeted','type','retweet_count','hashtags','full_text','tweet','lat','lon','localization','org_coords','relevant_LR', 'relevant_RF',
       'relevant_CNN', 'relevant_ULM']
    final_df3 = with_noise[keep_cols]

    print('Complete time - ', (time.time()-t_total)/60, ' minutes')

    return df_lookup,df_rel5, final_df3

### Use algorithm

In [None]:
# get relevant tweets
df_rel2 = df[df['relevant']==1]
print('Relevant tweets:',len(df_rel2), '/', len(df))

#subset
size = len(df_rel2)
df_rel = df_rel2.sample(size).reset_index(drop=True)

In [None]:
# Summary of which location sources are available for the relevant tweets.
geo_df = df_rel[~df_rel['geo'].isna()]
#filter only relevant tweets
# NOTE(review): the cell above builds df_rel from rows with relevant==1, so this filter looks redundant - confirm
geo_df = geo_df[geo_df['relevant']==1].reset_index(drop=True)
geo_df['coords'] = [geo_df['geo'][i]['coordinates'] for i in range(len(geo_df))]
print('Geotagged tweets:',len(geo_df), '/', len(df_rel))


# tweets with a registered user location
userloc_df = df_rel[~df_rel['user_location'].isna()]
# tweets with no user location, no geo and no place
noloc_df = df_rel[(df_rel['user_location'].isna()) & (df_rel['geo'].isna())  & (df_rel['place'].isna()) ]
# tweets with neither geo nor place; not used further in this cell
noloc_df2 = df_rel[(df_rel['geo'].isna()) & (df_rel['place'].isna()) ]

print(f'total : {len(df_rel)}')
print(30*'_' + '\n')

print(f'geotagged : {len(geo_df)}')
#print(f'places : {len(place_df2)}')
print(f'user location : {len(userloc_df)}')
print(f'no location : {len(noloc_df)}')


In [None]:
import time

start = time.time()
df_lookup,df_rel5, final_df = localization(df_rel,10)
end = time.time()

elapse = end-start
print(elapse)

In [None]:
#final_df.to_csv('final_tweets.csv',index = False)

In [None]:
# map tweets
fig = px.scatter_geo(lat = final_df['lat'],lon=final_df['lon'],hover_name=final_df['tweet'])
fig.show()

In [None]:
# map look up table

df_lookup['lat'] = [c[0] for c in df_lookup['coordinates']]
df_lookup['lon'] = [c[1] for c in df_lookup['coordinates']]

fig = px.scatter_geo(lat = df_lookup['lat'],lon=df_lookup['lon'],hover_name=df_lookup['location'])
fig.show()

# Visualisation



In [None]:
df = pd.read_csv('data/eng_labelled_tweets.csv') # final_tweets, final_coords_tweets


def _parse_geo(value):
    """Turn the CSV text of a geo dict back into a dict.

    ``literal_eval`` only accepts Python literals, so unlike ``eval`` it
    cannot execute code that happens to be stored in the file. Values that
    are not strings, or cannot be parsed, are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    try:
        return literal_eval(value)
    except (ValueError, SyntaxError):
        return value


# filter only relevant tweets that carry a geo attribute
geo_df = df[df['geo'].notna() & (df['relevant']==1)].reset_index(drop=True)
geo_df['geo'] = [_parse_geo(g) for g in geo_df['geo']]

# the "geo" attribute is ordered [latitude, longitude]
geo_df['lat'] = [g['coordinates'][0] for g in geo_df['geo']]
geo_df['lon'] = [g['coordinates'][1] for g in geo_df['geo']]

#geo_df.to_csv('data/geo_tweets.csv',index=False)

## Folium, Leaflet, OpenStreetMap

In [None]:
import html

# base map
m = folium.Map([20.416775, -3.70379], tiles=None, zoom_start=2)

# tile layers
folium.TileLayer('cartodbpositron', show=True, name="light").add_to(m)
folium.TileLayer('cartodbdark_matter', show=False, name="dark").add_to(m)
folium.TileLayer('openstreetmap', show=False, name="color").add_to(m)

# add location marker cluster (add_to already attaches it to the map)
mc = MarkerCluster(name='Tweets').add_to(m)

# create marker at locations
for lat, lon, user_location, full_text, created_at, retweet_count in zip(geo_df['lat'], geo_df['lon'], geo_df['user_location'],
                                     geo_df['full_text'], geo_df['created_at'], geo_df['retweet_count']):
    # script=True embeds the markup unescaped, so escape the tweet fields:
    # they are user-generated and could otherwise inject HTML into the popup
    safe = [html.escape(str(v)) for v in (full_text, user_location, created_at, retweet_count)]
    text = folium.Html('Tweet: {}<br> User location: {}<br> Created at: {}<br> Retweet count: {}<br>'.format(*safe), script=True)
    popup = folium.Popup(text, max_width=300)
    folium.CircleMarker(location = [lat, lon],
                        radius = 2,
                        weight = 5,
                        color = '#081d58',
                        fill_color = '#081d58',
                        fill = True,
                        popup = popup,
                        tooltip = 'Click on Tweet'
                        ).add_to(mc)

# add layer control
folium.LayerControl('topright', collapsed=True).add_to(m)

m

## Kepler.gl

https://medium.com/nightingale/how-to-create-eye-catching-maps-with-python-and-kepler-gl-e7e897eff8ac

In [None]:
viz_cols = ['lon','lat','created_at','full_text','user_location']
kepler_map = keplergl.KeplerGl(height=500)
kepler_map.add_data(data=geo_df[viz_cols], name="Extreme weather events")
#kepler_map.save_to_html(file_name="kepler_map.html")
kepler_map

In [None]:
#config = kepler_map.config
#config

## Plotly, Mapbox

In [None]:
# point map
scatter_map = px.scatter_mapbox(geo_df, lat="lat", lon="lon", hover_name="full_text", 
                        hover_data=["created_at", "user_location",'retweeted'],
                        color_discrete_sequence=["teal"], 
                        zoom=1, height=500)
scatter_map.update_layout(mapbox_style="carto-positron",margin={"r":0,"t":0,"l":0,"b":0})
scatter_map.show()

In [None]:
scatter_map = px.scatter_mapbox(
    geo_df, lat="lat", lon="lon", 
    hover_data=['full_text',"created_at","user_location"],
    color = 'retweet_count',
    color_continuous_scale='teal',
    zoom=1, height=500)
scatter_map.update_layout(mapbox_style="dark",
                          margin={"r":0,"t":0,"l":0,"b":0})
scatter_map.show()

In [None]:
# size by retweet_count
scatter_map = px.scatter_mapbox(
    geo_df, lat="lat", lon="lon", 
    size = 'retweet_count',
    size_max = 15,
    color='retweeted',
    hover_data=['full_text'],
    #color_discrete_sequence=["teal",""],
    zoom=1, height=500)
scatter_map.update_layout(mapbox_style="dark",
                          margin={"r":0,"t":0,"l":0,"b":0})
scatter_map.show()

In [None]:
hexabin_map = ff.create_hexbin_mapbox(data_frame=geo_df[['lat','lon']], lat="lat", lon="lon",
                                      nx_hexagon=25, opacity=0.5, labels={"color": "Relevant Tweets"},
                                      min_count=1, color_continuous_scale="Teal",
                                      show_original_data=True, height=500, zoom=0.95,
                                      original_data_marker=dict(size=5, opacity=0.7, color="Teal")
)
hexabin_map.update_layout(mapbox_style="carto-positron",
                          margin={"r":0,"t":0,"l":0,"b":0})
hexabin_map.show()

In [None]:
# find number of tweets by date
df['Date'] = pd.to_datetime(df['created_at']).dt.date

# Count per day straight from the groupby so each count stays attached to its
# own date. The previous version paired the date-sorted counts with the dates
# in order of first appearance, which mislabels the counts whenever the
# tweets are not already sorted by date.
time_df = df.groupby('Date').size().reset_index(name='Count')
time_df = time_df[['Date','Count']].sort_values(by='Date')

In [None]:
line_fig = px.line(time_df, x='Date', y='Count', title='Relevant Tweets over time')

line_fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            #dict(count=6, label="6m", step="month", stepmode="backward"),
            #dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)
line_fig.show()

## Plotly, Treemap

In [None]:
df = pd.read_csv('data/final_tweets.csv')
df.dropna(subset=['tokens'],inplace=True)
df['tokens'] = [literal_eval(s) for s in df['tokens']]

In [None]:
token_list = df['tokens'].tolist()
token_list = [token for sublist in token_list for token in sublist]

freq = FreqDist(token_list)
freq_df = pd.DataFrame(list(freq.items()), columns = ["Word","Occurrences"]) 
freq_df = freq_df[freq_df['Word']!=' ']
freq_df = freq_df[freq_df['Word']!='…']
freq_df = freq_df.sort_values('Occurrences',ascending=False)
freq_df[:20]

In [None]:
# the 20 most frequent words (fewer if the vocabulary is smaller)
top_words = freq_df.head(20)

fig = go.Figure(go.Treemap(
    labels=top_words['Word'].tolist(),
    values=top_words['Occurrences'].tolist(),
    # one (empty) parent per label: all tiles sit directly under the root
    parents=['']*len(top_words),
    marker_colorscale=px.colors.sequential.Teal, # Burg, Darkmint, Mint, PuBu, Teal, YlGnBu, deep, ice, tempo
    hovertemplate='<b>%{label} </b> <br> Occurrences: %{value}<extra></extra>',
    pathbar={"visible": False},
))

fig.show()

## Dash application

In [None]:
df = pd.read_json('data/eng_merged.json',orient='split')
df.rename(columns={'relevant_LR':'Logistic regression','relevant_RF':'Random forest','relevant_CNN':'CNN','relevant_ULM':'ULMFiT'},inplace=True)
df = df[df['date'].dt.year>2016]
df.drop(['tweet_id','retweeted','tweet'], axis='columns', inplace=True)

In [None]:
#df.to_json('data/eng_tweets_1718.json',orient="split")
df = pd.read_json('data/eng_tweets_1718.json',orient="split")
df.columns

In [None]:
# Initiate app
app = JupyterDash(
    __name__, external_stylesheets=[dbc.themes.BOOTSTRAP],
    meta_tags=[{"name":"viewport","content":"width=device-width,initial-scale=1,maximum-scale=1.0,user-scalable=no"}]
)

server = app.server
app.config.suppress_callback_exceptions = True

https://github.com/s153748/extreme-weather-detection/blob/main/app.py

In [None]:
app.run_server(mode='external', port=8060, use_reloader=False, debug=True)
#app._terminate_server_for_port("localhost", 8060)

# Evaluation of Pipeline

In this section we load the English unlabelled tweets 

## Unlabelled tweets

Load and clean unlabelled tweets.
This was done for one year at a time.

In [None]:
# load unlabelled tweets
years = ['2016','2017','2018']
year = years[0]

df1 = pd.read_json(f'Unlabbeled/eng_{year}.json', lines=True)

# add userinfo, tokens, retweet variables etc.
df2 = get_userinfo(df1)
df3 = get_tokens(df2)
df4 = add_vars(df3)

df_new=df4

# save to file
df_new.to_json('new.json',orient='split')

## Classify tweets

### Logistic Regression

In [None]:
def predict_relevance(new_tweet,model,clf_model):
    """Predict the relevance label(s) of one tweet or a collection of tweets.

    Parameters
    ----------
    new_tweet : a single tweet given as a string, or a collection of tweets.
    model : fitted vectoriser exposing ``transform``.
    clf_model : fitted classifier exposing ``predict``.

    Returns
    -------
    A single label when ``new_tweet`` is a string (or, as before, any input
    of length 1); otherwise whatever ``clf_model.predict`` returns for the
    collection.

    A string used to be treated as a single tweet only when it was exactly
    one character long; longer strings were handed to ``transform`` directly,
    which iterates over them character by character.
    """
    is_single = isinstance(new_tweet, str) or len(new_tweet) == 1
    if is_single:
        # wrap so the vectoriser sees exactly one document
        return clf_model.predict(model.transform([new_tweet]))[0]

    return clf_model.predict(model.transform(new_tweet))

In [None]:
# load the model from disk
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
# `with` closes the file handles, which pickle.load(open(...)) never did.
with open('Classifiers/LR.sav', 'rb') as f:
    clf_LR = pickle.load(f)
with open('Classifiers/LR_tfidf.sav', 'rb') as f:
    LR_tfidf = pickle.load(f)

labels = predict_relevance(df_new['tokens'],LR_tfidf,clf_LR)
df_new['relevant_LR'] = labels
print('Relevant tweets:', df_new['relevant_LR'].sum()/len(df_new)*100, '%')

### Random Forest


In [None]:
# load the model from disk
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
# `with` closes the file handles, which pickle.load(open(...)) never did.
with open('Classifiers/RF.sav', 'rb') as f:
    clf_RF = pickle.load(f)
with open('Classifiers/RF_tfidf.sav', 'rb') as f:
    RF_tfidf = pickle.load(f)

labels = predict_relevance(df_new['tokens'],RF_tfidf,clf_RF)
df_new['relevant_RF'] = labels
print('Relevant tweets:', df_new['relevant_RF'].sum()/len(df_new)*100, '%')

### CNN


In [None]:
def predictions(df,model,tokenizer,max_len=30):
    """Label the tweets in ``df['text_DL']`` with the CNN classifier.

    Parameters
    ----------
    df : DataFrame with a ``text_DL`` column.
    model : Keras model whose output has one value per tweet in its first
        column (the result is read as ``row[0]``).
    tokenizer : fitted tokenizer exposing ``texts_to_sequences``.
    max_len : length the sequences are padded/truncated to (default 30).

    Returns
    -------
    List with one 0/1 label per tweet (1 when the model output exceeds 0.5).

    ``model.predict_classes`` was removed from Keras in TensorFlow 2.6; for a
    single-output model it was defined as ``(predict(x) > 0.5)`` cast to int,
    which is what is computed here.
    """
    #tokenizer.fit_on_texts(df['full_text'].tolist())
    sequences = tokenizer.texts_to_sequences(df['text_DL'])

    new_data = pad_sequences(sequences, maxlen=max_len)
    scores = model.predict(new_data, batch_size=1024, verbose=1)

    prediction_labels = [int(row[0] > 0.5) for row in scores.tolist()]

    return prediction_labels

In [None]:
# load json and create model
# (`with` guarantees the files are closed even if reading fails)
with open('Classifiers/CNN.json', 'r') as json_file:
    CNN = model_from_json(json_file.read())

# load tokenizer
with open('Classifiers/CNN_tokenizer.json', 'r') as json_file:
    CNN_tokenizer = tokenizer_from_json(json_file.read())

# load weights into new model
CNN.load_weights("Classifiers/CNN.h5")
print("Loaded model from disk")

# compile the loaded model so it can be used
CNN.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])


# get labels
labels = predictions(df_new,CNN,CNN_tokenizer)
df_new['relevant_CNN'] = labels
print('Relevant tweets:', df_new['relevant_CNN'].sum()/len(df_new)*100, '%')

### ULMFiT

In [None]:
# load ULMFiT

cl = load_learner('','Classifiers/ULMFiT.pkl')
cl.data.add_test(df_new['text_TL'])
preds, _ = cl.get_preds(ds_type=DatasetType.Test)
# predicted class = column index of the highest score in each row.
# Stored under its own name: assigning to `predictions` would overwrite the
# CNN helper function `predictions()` and break re-running the CNN cell.
ulm_labels = np.argmax(preds, axis=1)

df_new['relevant_ULM'] = ulm_labels.tolist()
print('Relevant tweets:', df_new['relevant_ULM'].sum()/len(df_new)*100, '%')

In [None]:
# save to file 
#df_new.to_json('labelled.json',orient='split')

### Evaluation

In [None]:
print('Tweets:',len(df_new))
print('Relevant tweets LR:', df_new['relevant_LR'].sum()/len(df_new)*100, '%')
print('Relevant tweets RF:', df_new['relevant_RF'].sum()/len(df_new)*100, '%')
print('Relevant tweets CNN:', df_new['relevant_CNN'].sum()/len(df_new)*100, '%')
print('Relevant tweets ULMFiT:', df_new['relevant_ULM'].sum()/len(df_new)*100, '%')

print(60*'_')

df_new['sum_relevance'] = df_new[['relevant_LR','relevant_RF','relevant_CNN','relevant_ULM']].sum(axis=1)

print('At least one:',(df_new['sum_relevance']>0).sum()/len(df_new)*100, '%') 
print('Relevant tweets from all classifiers:', (df_new['sum_relevance']==4).sum()/len(df_new)*100, '%')
print('Irrelevant tweets from all classifiers:', (df_new['sum_relevance']==0).sum()/len(df_new)*100, '%')

## Get location

In [None]:
# get relevant tweets: flagged as relevant by at least one classifier

# drop() returns a new frame; the earlier in-place drop acted on a slice of
# df_new, which triggers SettingWithCopyWarning
df_rel = df_new[df_new['sum_relevance']!=0].drop(columns=['sum_relevance'])
print('Relevant tweets:',len(df_rel), '/', len(df_new))


# get geo coordinates
geo_df = df_rel[df_rel['geo'].notna()]

# keep only the geotagged tweets that the CNN labelled relevant
geo_df = geo_df[geo_df['relevant_CNN']==1].reset_index(drop=True)
geo_df['coords'] = [g['coordinates'] for g in geo_df['geo']]

print('Geotagged tweets:',len(geo_df), '/', len(df_rel))


In [None]:
import time
# minimum number of mentions passed to localization() as `thres`
threshold = 20
Look_up, df_last, final_df = localization(df_rel,threshold)


# save to file
#final_df.to_json('eng_2015.json',orient='split')
#Look_up.to_csv('lookup_2015.csv',index=False)

### Evaluate

In [None]:
# Share of each localisation source among the localised tweets.
# Counts are looked up by label: np.unique returns the labels in alphabetical
# order ('Geoparsed from Tweet' first), so positional indexing printed the
# wrong share under each heading and failed when a source was absent.
N = len(final_df)
shares = final_df['localization'].value_counts()

# printed heading -> label stored in the 'localization' column
sources = {
    'Geotagged coordinates: ': 'Geotagged coordinates',
    'Geotagged place: ': 'Geotagged place',
    'Geoparsed from text: ': 'Geoparsed from Tweet',
    'Registered user location: ': 'Registered user location',
}
for heading, label in sources.items():
    print(heading, shares.get(label, 0)/N*100, '%')

# share of relevant tweets that could not be localised at all
# (the denominator must be the number of rows, not the DataFrame itself)
print('No location:', (1-N/len(df_rel))*100, '%')

## Visualize

