**MASTER THESIS PROJECT**

*Identification and Exploration of Extreme Weather Events From Twitter Data*

# Imports

In [None]:
from __future__ import division, print_function
import pandas as pd
import numpy as np
from urllib.request import urlopen
import requests
import re
import ast 
from ast import literal_eval
import json
import os
import string  
import math
import random
import calendar
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
import seaborn as sns
from seaborn import color_palette
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
import collections
from googletrans import Translator
import altair as alt 
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
import folium
from folium import FeatureGroup, LayerControl, plugins, Map, Marker
from folium.plugins import FastMarkerCluster, MarkerCluster
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from nltk import word_tokenize
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding, SimpleRNN
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.backend import clear_session
import spacy
from spacymoji import Emoji
import keplergl
from keplergl import KeplerGl
import dash
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
from jupyter_dash import JupyterDash
from dash.dependencies import Input, Output, ClientsideFunction, State
from dash.exceptions import PreventUpdate
import pathlib
from flask import request
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
#from plotly.subplots import make_subplots
from ipywidgets import widgets
import warnings
warnings.filterwarnings("ignore")
%matplotlib notebook

# Load data

In [None]:
# load data (the pre-processed tweets exported at the end of the pre-processing section)
df = pd.read_csv('data/final_labelled_tweets.csv')

# rows without tokens are unusable; reset the index so labels run 0..n-1 again
df = df.dropna(subset=['tokens']).reset_index(drop=True)


def _parse_literal(value):
    """Parse a stringified Python literal read from the CSV.

    Returns NaN when the cell is missing (NaN float) or is not a valid literal.
    `literal_eval` is used instead of `eval` so file content is never executed.
    """
    try:
        return literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        return np.nan


# token lists and the geo/place dicts were written to the CSV as their string repr
df['tokens'] = [literal_eval(s) for s in df['tokens']]
for col in ['geo', 'place']:
    # whole-column assignment avoids chained indexing (df[col][i] = ...),
    # which addressed rows by label and silently skipped rows after dropna
    df[col] = df[col].map(_parse_literal)

# Data pre-processing

CrisislexT6: https://crisislex.org/data-collections.html#CrisisLexT6

1. Alberta floods https://en.wikipedia.org/wiki/2013_Alberta_floods 
2. Queensland floods https://en.wikipedia.org/wiki/Cyclone_Oswald


In [None]:
# load json with tweets
floods = ['2013_Alberta_floods','2013_Queensland_floods'] # use cases

# read one frame per flood and concatenate once, instead of growing a frame
# from an empty DataFrame inside the loop
flood_frames = []
for flood in floods:
    raw_tweets = pd.read_json(f'data/CrisisLexT6/{flood}_ids.json', lines=True)
    # rename
    renamed = raw_tweets.rename(columns={'id': 'tweet_id','is_quote_status':'quoted', 'in_reply_to_status_id':'reply_tweet_id'})
    # add variable for area: the second '_'-separated part of the flood name
    renamed['area'] = flood.split('_')[1]
    flood_frames.append(renamed)
tweets_df = pd.concat(flood_frames)

# drop irrelevant columns
tweets_df = tweets_df.drop(columns=['id_str','possibly_sensitive','entities','extended_entities','contributors','display_text_range','truncated','in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name'])

print(f'Number of tweets: {len(tweets_df)}')
tweets_df.head(1)

In [None]:
# column dtypes and non-null counts of the combined tweet frame
tweets_df.info()

In [None]:
# load csv with labels for tweets
# one frame per flood, concatenated once (no concat onto an empty DataFrame in the loop)
label_frames = []
for flood in floods:
    flood_labels = pd.read_csv(f'data/CrisisLexT6/{flood}.csv')
    # the id is the text between the first pair of single quotes in 'tweet id'
    flood_labels['tweet_id'] = [int(parts[1]) for parts in flood_labels['tweet id'].str.split("'")]
    label_frames.append(flood_labels)
labels_df = pd.concat(label_frames)

# keep the columns from the third one onward
# NOTE(review): positional selection; presumably this leaves ' label' and 'tweet_id' - verify against the CSV header
labels_df = labels_df[labels_df.columns[2:]]
labels_df = labels_df.rename(columns={' label': 'relevant'})
# encode the textual label as 0/1
labels_df = labels_df.replace('off-topic', 0).replace('on-topic', 1)
# keep only labels whose tweet is present in tweets_df
labels_df = labels_df[labels_df['tweet_id'].isin(tweets_df['tweet_id'])]

print(f'Number of tweets: {len(labels_df)}')
labels_df.head(3)

In [None]:
# merge tweets with their labels (inner join on tweet_id, fresh 0..n-1 index)
df_join = tweets_df.merge(labels_df, on='tweet_id', how='inner').reset_index(drop=True)
# pull the user id out of each nested user dict
df_join['user_id'] = [user['id'] for user in df_join['user']]

In [None]:
# inspect the nested user object of the first tweet
df_join['user'][0]

In [None]:
# get user information: one row per distinct user record
users = df_join['user'].tolist()
cols = ['id', 'name', 'screen_name', 'location', 'description', 'url', 'followers_count',
        'friends_count', 'favourites_count', 'verified', 'statuses_count']
# every kept field is renamed with a 'user_' prefix (id -> user_id, name -> user_name, ...)
prefixed = {field: 'user_' + field for field in cols}
df_users = pd.DataFrame(users)[cols]
df_users = df_users.drop_duplicates().reset_index(drop=True)
df_users = df_users.rename(columns=prefixed)
df_users.head(1)

In [None]:
# merge tweets with user information
df = pd.merge(df_join, df_users, how='inner', on='user_id').reset_index(drop=True)
# treat empty strings as missing values
# (np.nan instead of the np.NaN alias, which was removed in NumPy 2.0)
df = df.replace(r'', np.nan)
df.head(1)

In [None]:
# column dtypes and non-null counts after merging in the user information
df.info()

In [None]:
# remove duplicates of tweet_id
# (no try/except: drop_duplicates does not raise when nothing is duplicated, and the
# old handler left `df_drop` undefined if anything else went wrong)
before = len(df)
df_drop = df.drop_duplicates(subset='tweet_id').reset_index(drop=True)
removed = before - len(df_drop)
if removed:
    print(f'Number of duplicates removed: {removed}')
else:
    print('No duplicates found')
df = df_drop

In [None]:
# filter on English tweets only
is_english = df['lang'].eq('en')
df = df.loc[is_english].reset_index(drop=True)
print(f'Number of English tweets: {len(df)}')

In [None]:
# tokenization of tweet text
df_token = df.copy()  # work on a copy so `df` is not modified through an alias

# for tokenization of emojies
nlp_spacymoji = spacy.load("en_core_web_sm")
emoji = Emoji(nlp_spacymoji, merge_spans=True)
nlp_spacymoji.add_pipe(emoji, first=True)

# loop-invariant resources: built once instead of once per tweet
punctuation_table = str.maketrans('', '', string.punctuation)
sw = set(stopwords.words("english"))  # set gives O(1) membership tests
wordnet_lemmatizer = WordNetLemmatizer()


def _tokenize_tweet(content):
    """Return the cleaned token list for one tweet text.

    Steps: strip punctuation, tokenize with spaCy (emoji-aware), lower-case,
    remove English stopwords, lemmatize, drop tokens containing digits.
    Returns "" (the column's empty marker) for empty or non-string text such as NaN.
    """
    if not isinstance(content, str) or not content:
        return ""
    txt = content.translate(punctuation_table)  # remove punctuations
    doc = nlp_spacymoji(txt)  # handle emojies
    words = [token.text.lower() for token in doc]  # split to lower-cased tokens
    words = [t for t in words if t not in sw]  # remove stopwords
    words = [wordnet_lemmatizer.lemmatize(t) for t in words]  # lemmatize
    return [t for t in words if not any(c.isdigit() for c in t)]  # remove words with numbers


# assign the whole column at once; dtype=object keeps each token list as one cell
df_token['tokens'] = pd.Series([_tokenize_tweet(text) for text in df_token['full_text']],
                               index=df_token.index, dtype=object)

In [None]:
# continue with the tokenized frame
df = df_token 

In [None]:
# first three raw tweet texts
df.full_text[:3]

In [None]:
# token lists of the same three tweets
df.tokens[:3]

In [None]:
# check for NANs: number of rows whose 'tokens' value is missing
df['tokens'].isna().sum()

In [None]:
# duplicates of full_text: every row whose text occurs more than once
repeated = df.duplicated(subset='full_text',keep=False)
dupl = df[repeated].sort_values("full_text")
print(f'Number of duplicate tweets: {len(dupl)} corresponding to {np.round(len(dupl)/len(df)*100,1)} %')

# occurrences per distinct duplicated text
text_only = dupl[['full_text']]
group_sizes = text_only.groupby(text_only.columns.tolist()).size()
duplicates = group_sizes.reset_index().rename(columns={0:'duplicates'})
duplicates.sort_values("duplicates",ascending=False)

In [None]:
# merge duplicates counts on df
df = pd.merge(df,duplicates,on='full_text',how='outer').reset_index(drop=True)
# texts that are not duplicated get a count of 0; assign the result back instead of
# calling fillna(inplace=True) on a column selection, which may act on a temporary copy
df['duplicates'] = df['duplicates'].fillna(0)

*Tweet data dictionary:* https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet

Retweets: 
* `retweeted_status` Users can amplify the broadcast of Tweets authored by other users by retweeting. Retweets can be distinguished from typical Tweets by the existence of a retweeted_status attribute. This attribute contains a representation of the original Tweet that was retweeted. Note that retweets of retweets do not show representations of the intermediary retweet, but only the original Tweet. (Users can also unretweet a retweet they created by deleting their retweet.)
* `retweeted` Indicates whether this Tweet has been Retweeted by the authenticating user (using the retweet button)
* `retweet_count`: Number of times this Tweet has been retweeted

In [None]:
# number of tweets with retweeted as True (the API flag, before it is recomputed below)
len(df[df['retweeted']==True])

In [None]:
# number of tweets with retweeted_status
# mark missing statuses with the sentinel 'none'; assign back rather than using
# fillna(inplace=True) on a column selection, which may act on a temporary copy
df['retweeted_status'] = df['retweeted_status'].fillna('none')
len(df[df['retweeted_status']!='none'])

In [None]:
# example row (hard-coded position 48) showing the retweet-related fields
df[['tweet_id','full_text','retweet_count','retweeted']].iloc[48]

In [None]:
# retweeted_status: representation of the original Tweet that was retweeted
# (same hard-coded example row, addressed by index label 48)
df['retweeted_status'][48]

In [None]:
# number of tweets with full_text starting with "RT"
sum(1 for text in df['full_text'] if text.startswith('RT')==True)

In [None]:
# update retweeted variable and add original tweet id variable
# Whole-column assignments replace the row-by-row chained indexing (df[col][i] = ...),
# which pandas may apply to a temporary copy.

# a tweet carries a retweeted_status object (dict) when it is a retweet
has_status = df['retweeted_status'].map(lambda status: isinstance(status, dict))
# fallback: text that starts with "RT" (missing text counts as not a retweet)
starts_with_rt = df['full_text'].str.startswith('RT').fillna(False).astype(bool)

df['retweeted'] = has_status | starts_with_rt
# id of the original tweet where a retweeted_status exists, "" otherwise
df['original_tweet_id'] = [status['id'] if isinstance(status, dict) else ""
                           for status in df['retweeted_status']]

num = int(df['retweeted'].sum())
print(f'Number of retweets: {num} corresponding to {np.round(num/len(df)*100,1)} %')

In [None]:
# retweet count does not seem reliable as retweeted = False for cases with retweet_count > 0
# (shows every row with a positive retweet_count next to its retweet flags for comparison)
df[df['retweet_count']>0][['full_text','retweeted','retweeted_status','retweet_count']] 

Quotes:
* `quoted`: Indicates whether this is a Quoted Tweet

Favorites:
* `favorited`: Indicates whether this Tweet has been liked by the authenticating user
* `favorite_count`: Indicates approximately how many times this Tweet has been liked by Twitter users

Replies:
* `reply_tweet_id`: Contains the integer representation of the original Tweet’s ID

In [None]:
# number of tweets that are quoted, then drop the column
quoted_count = (df['quoted']==True).sum()
print(quoted_count)
df = df.drop(columns=['quoted'])

In [None]:
# number of tweets that are favorited, then drop the column
favorited_count = (df['favorited']==True).sum()
print(favorited_count)
df = df.drop(columns=['favorited'])

In [None]:
# number of tweets with at least one favorite, then drop the column
liked_count = (df['favorite_count']>0).sum()
print(liked_count)
df = df.drop(columns=['favorite_count'])

In [None]:
# number of replies
# mark tweets that are not replies with the sentinel 'none'
# (the original wrapped an inplace fillna in print(), which only printed "None")
df['reply_tweet_id'] = df['reply_tweet_id'].fillna('none')
len(df[df['reply_tweet_id']!='none'])

In [None]:
# add boolean variable for whether a tweet is a reply
# a tweet is a reply when reply_tweet_id is neither the 'none' sentinel nor missing;
# one vectorized assignment replaces the row-by-row chained indexing
df['is_reply'] = df['reply_tweet_id'].notna() & df['reply_tweet_id'].ne('none')

In [None]:
# split the columns by dtype: numeric ...
df_numeric = df.select_dtypes(include=[np.number])
# ... and everything else
df_non_numeric = df.select_dtypes(exclude=[np.number])

numeric_cols = df_numeric.columns.values
non_numeric_cols = df_non_numeric.columns.values

print(f'Numeric variables:\n {numeric_cols}')
print(f'Non-numeric variables:\n {non_numeric_cols}')

In [None]:
# find numbers and percentages of missing data
total_missing = df.isnull().sum().sum()
print(f'Total number of missing values: {total_missing}\n')

print('Number and percentage of missing values per variable:')
for col in df.columns:
    null_mask = df[col].isnull()
    pct_missing = np.mean(null_mask)*100
    if not pct_missing > 0:
        continue  # only report variables that have missing values
    print(f'{col}: {null_mask.sum()} - {np.round(pct_missing,2)} %')

In [None]:
# visualize the missing data with a heatmap (one cell per row/column, dark = missing)
sns.set()
plt.figure(figsize=(12, 6))
two_tone = sns.color_palette(['#f7fbff', '#05264c'])
null_mask = df[df.columns].isnull()
heatmap = sns.heatmap(null_mask, cmap=two_tone, cbar=False)
heatmap.set_xlabel('columns', fontsize=14)
heatmap.set_ylabel('rows', fontsize=14)
plt.show()

In [None]:
# create missing indicator for variables with missing data
# (adds one boolean '<col>_ismissing' column per variable that has at least one null)
for col in df.columns:
    missing = df[col].isnull()
    num_missing = np.sum(missing)
    if num_missing > 0:  
        df['{}_ismissing'.format(col)] = missing

# based on the indicator, plot the bar chart of missing values
# num_missing = number of missing variables in each row
ismissing_cols = [col for col in df.columns if 'ismissing' in col]
df['num_missing'] = df[ismissing_cols].sum(axis=1)
sns.set()
# NOTE(review): the 'index' / 'num_missing' column names produced by
# value_counts().reset_index() depend on the pandas version - confirm they match the installed one
df['num_missing'].value_counts().reset_index().sort_values(by='index').plot.bar(
    x='index', y='num_missing', figsize=(12, 6), color='#05264c')
plt.xticks(rotation=0)
plt.xlabel('Number of missing values per obervation',fontsize=14)
plt.ylabel('Number of obervations',fontsize=14)
plt.legend('')
plt.show()

In [None]:
# (disabled) for non-numeric variables fill missing values with empty strings
#for col in non_numeric_cols:
#    missing = df[col].isnull()
#    num_missing = np.sum(missing)
#    if num_missing > 0:
#        print('Filling missing values for: {}'.format(col))
#        df['{}_ismissing'.format(col)] = missing
#        df[col] = df[col].fillna('')

# drop the helper columns that were only used to detect missing values
helper_cols = list(ismissing_cols) + ['num_missing']
df = df.drop(columns=helper_cols)

In [None]:
# check repetitiveness: share of rows taken by each variable's most frequent value
repetitive_cols = []
print('Percentage of the same value per variable:\n')
n_rows = len(df.index)
for col in df.columns:
    shares = df[col].value_counts()/n_rows
    repetitive = shares.iloc[0]
    print('{0}: {1:.1f}%'.format(col, repetitive*100))
    # remember variables dominated (> 80 %) by a single value
    if repetitive > 0.8:
        repetitive_cols.append(col)

In [None]:
# three most frequent values of every highly repetitive variable
print('\nTop values of most repetitive variables:')
for col in repetitive_cols:
    top_values = df[col].value_counts()[:3]
    display(top_values)

In [None]:
# drop variable with only one distinct value
# ('lang' is constant because only English tweets were kept earlier)
df = df.drop(['lang'], axis=1)

In [None]:
# parse the creation timestamp and derive calendar features from it
df['created_at'] = pd.to_datetime(df['created_at'], format='%Y-%m-%d')
for part in ('year', 'month', 'weekday'):
    df['created_at_' + part] = getattr(df['created_at'].dt, part)
df.head(1)

In [None]:
# export to csv
# (this is the file read back by the "load data" cells of this notebook)
df.to_csv('data/final_labelled_tweets.csv',index=False)

# Data exploration

In [None]:
# load data (the pre-processed tweets exported above)
df = pd.read_csv('data/final_labelled_tweets.csv')

# rows without tokens are unusable; reset the index so labels run 0..n-1 again
df = df.dropna(subset=['tokens']).reset_index(drop=True)


def _parse_literal(value):
    """Parse a stringified Python literal read from the CSV.

    Returns NaN when the cell is missing (NaN float) or is not a valid literal.
    `literal_eval` is used instead of `eval` so file content is never executed.
    """
    try:
        return literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        return np.nan


# token lists and the geo/place dicts were written to the CSV as their string repr
df['tokens'] = [literal_eval(s) for s in df['tokens']]
for col in ['geo', 'place']:
    # whole-column assignment avoids chained indexing (df[col][i] = ...),
    # which addressed rows by label and silently skipped rows after dropna
    df[col] = df[col].map(_parse_literal)

In [None]:
# column dtypes and non-null counts of the reloaded frame
df.info()

In [None]:
# distribution of retweet_count: histogram with KDE curve and rug marks
plt.figure(figsize=(10, 5))
sns.distplot(df['retweet_count'], color='#05264c', rug=True, kde=True)
plt.xlabel('retweet_count',fontsize=14)
plt.ylabel('density',fontsize=14)
plt.tight_layout()

In [None]:
# box plot of retweet_count
plt.figure(figsize=(10, 5))
sns.boxplot(y=df['retweet_count'],color='#05264c')
plt.ylabel('retweet_count',fontsize=14)
plt.tight_layout()

In [None]:
def bar_chart(col,num):
    """Plot a horizontal bar chart of the `num` most frequent values of `df[col]`.

    Reads the notebook-level frame `df`; the most frequent value is drawn at the top.
    """
    top_counts = df[col].value_counts()[:num]
    axes = top_counts.plot.barh(color='#05264c', figsize=(10, 5))
    axes.set_title(f'Top {num} {col}s')
    axes.invert_yaxis()  # largest count first
    axes.set_xlabel('count')
    axes.set_ylabel(col)
    plt.tight_layout()

bar_chart('user_location',20)

In [None]:
# pairwise relationship between retweet_count and user_followers_count (KDE on the diagonal)
sns.set()
pairplot = sns.pairplot(df[['retweet_count','user_followers_count']], diag_kind='kde', palette='#05264c')
plt.show()

In [None]:
# number of tweets over time (per calendar month)
# 13 edges at 0.5, 1.5, ..., 12.5 give one bin per month centred on its number;
# np.arange(1, 13) as edges produced only 11 bins and merged November and December
month_ticks = np.arange(1, 13)
month_edges = np.arange(1, 14) - 0.5
plt.figure(figsize=(10, 5))
plt.hist(df['created_at_month'], bins=month_edges, color='#05264c')
plt.ylabel('count')
plt.xlabel('month')
plt.xticks(month_ticks)
plt.show()

# Localization



In this section, we look into getting a location for each tweet. We use the 'geo' attribute first, then 'place', then 'user_location'. In all cases, we also try to extract a location from the tweet text itself via geo-parsing, based on the TAGGS algorithm (https://link.springer.com/content/pdf/10.1007/s41651-017-0010-6.pdf).

*Note that the “coordinates” attributes is formatted as [LONGITUDE, latitude], while the “geo” attribute is formatted as [latitude, LONGITUDE].

- Coordinates
- Place
- User location
- Geo-parsing

Geo (coordinate) attribute 

- Only relevant tweets

In [None]:
# filter only relevant tweets that carry a 'geo' value
has_geo = df['geo'].notna()
geo_df = df[has_geo]
geo_df = geo_df[geo_df['relevant']==1].reset_index(drop=True)
# coordinate pair stored in each geo dict
geo_df['coords'] = [geo['coordinates'] for geo in geo_df['geo']]

print(len(geo_df))

In [None]:
# split the coordinate pair: first element -> 'lat', second -> 'lon'
geo_df['lat'] = [pair[0] for pair in geo_df['coords']]
geo_df['lon'] = [pair[1] for pair in geo_df['coords']]
#geo_df.to_csv('data/geo_tweets.csv',index=False)

In [None]:
# clustered marker map of the geotagged tweets, initially centred on Madrid
madrid = [40.416775, -3.703790]
locationlist = list(geo_df['coords'])
m = folium.Map(location=madrid, tiles='cartodbpositron', zoom_start=2)
marker_cluster = MarkerCluster().add_to(m)

# one marker per tweet, with the tweet text as popup
for coords, tweet_text in zip(locationlist, geo_df['full_text']):
    folium.Marker(coords, popup=tweet_text).add_to(marker_cluster)
m

Place attribute


In [None]:
#users with places
has_place = df_join['place'].notna()
place_df = df_join[has_place]
print(len(place_df))

#places and NOT geo
# NOTE(review): this filters by user_id, so it removes every tweet of a user who
# appears in geo_df, not only the geotagged tweets themselves - confirm this is intended
geo_users = list(geo_df['user_id'])
from_geo_user = place_df['user_id'].isin(geo_users)
place_df = place_df[~from_geo_user].reset_index(drop=True)
print(len(place_df))

# add place_id
place_df['place_id'] = [place['id'] for place in place_df['place']]

# get dataframe with places metadata (one row per distinct place_id)
places = list(place_df['place'])
df_places = pd.DataFrame(places).rename(columns={'id':'place_id'})
df_places = df_places.drop_duplicates(subset=['place_id'])

# merge to get all place details
tweets_without_place_dict = place_df.drop(columns=['place'])
place_df2 = pd.merge(tweets_without_place_dict, df_places, on='place_id', how='inner')

print(len(place_df2))

In [None]:
# `user_location` only exists on `df` (it is merged in from the user table);
# `df_join` has no such column, so indexing it there raised a KeyError
userloc_df = df[df['user_location'].notna()]

# tweets with neither a geo nor a place value ...
no_geo_or_place = df['geo'].isna() & df['place'].isna()
# ... and, in addition, no user location
noloc_df = df[df['user_location'].isna() & no_geo_or_place]
noloc_df2 = df[no_geo_or_place]
len(noloc_df2)

In [None]:
# summary of how many tweets have each kind of location information
# NOTE(review): 'total' counts df_join while geo_df is derived from df (relevant tweets only),
# so the lines below are not parts of the same total - confirm which base is intended
print(f'total : {len(df_join)}')
print(30*'_' + '\n')

print(f'geotagged : {len(geo_df)}')
print(f'places : {len(place_df2)}')
print(f'user location : {len(userloc_df)}')
print(f'no location : {len(noloc_df)}')

TAGGS algorithm



In [None]:
# tweets without a user location
# (`user_location` is a column of `df`, not of `df_join`, which raised a KeyError)
dff = df[df['user_location'].isna()]
len(dff)

In [None]:
# distinct values of the 'place_type' field in the place metadata
df_places['place_type'].unique()

In [None]:
# print the text of the first ten tweets that have no location information at all
for i in range(10):
    print(noloc_df['full_text'].iloc[i])

#  Text classification

1. Basic NLP algorithms; tf-idf; logistic classifier, naive bayes, SVM
2. Deep learning; word2vec; CNN
3. Transfer learning; ULMFiT, GPT-3

## Basic models

In [None]:
# relevant variables: raw text, token list and the 0/1 relevance label
df_rel = df[['full_text','tokens','relevant']]
df_rel.head()                                                               

In [None]:
# features (token lists) and target (0/1 relevance label)
X = df_rel['tokens']
y = df_rel['relevant']

n_total = len(y)
n_relevant = sum(y)
n_non_relevant = n_total-n_relevant

print('Relevant:', n_relevant, 'that is:' , round(n_relevant/n_total*100,2), '%')
print('Non-relevant:', n_non_relevant, 'that is:' ,round(n_non_relevant/n_total*100,2),'%')

# stacked single bar: full-width green bar with the non-relevant share drawn over it in red
plt.figure(figsize=(10,4))
p1 = plt.barh(1,100,color='lightgreen')
p2 = plt.barh(1,n_non_relevant/n_total*100,color='firebrick')

plt.legend((p1[0], p2[0]), ('Relevant', 'Non-relevant'),loc='upper center')
plt.yticks([1,1.8])
plt.title('Class balance',fontsize=16)
plt.show()

In [None]:
def text_fit(X, y, model, clf_model):
    """Vectorize the documents, fit a classifier and print hold-out metrics.

    Parameters
    ----------
    X : documents passed to the vectorizer.
    y : binary labels; assumed to be encoded as 0 (non-relevant) / 1 (relevant).
    model : vectorizer providing fit_transform / transform (e.g. TfidfVectorizer).
    clf_model : classifier providing fit / predict.

    Returns
    -------
    (clf, X_c) : the fitted classifier and the document-term matrix of all of X,
        built with the vocabulary learned from the training split.
    """
    # split the raw documents first, so the vectorizer (vocabulary and idf weights)
    # is fitted on training data only and nothing leaks from the test split
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=0)
    X_train = model.fit_transform(X_train_raw)
    X_test = model.transform(X_test_raw)

    print('* features: {}'.format(X_train.shape[1]))
    print('* train records: {}'.format(X_train.shape[0]))
    print('* test records: {}'.format(X_test.shape[0]))

    clf = clf_model.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # fixing the label order guarantees a 2x2 matrix even if a class is absent
    conf_mat = confusion_matrix(y_test.tolist(), pred, labels=[0, 1])

    print('\nConfusion matrix')
    print(conf_mat)
    TN, FP, FN, TP = (int(v) for v in conf_mat.ravel())

    total = TP + TN + FP + FN
    acc = (TP + TN) / total if total else float('nan')
    # precision / recall are undefined when nothing was predicted / is positive
    prec = TP / (TP + FP) if (TP + FP) else float('nan')
    rec = TP / (TP + FN) if (TP + FN) else float('nan')
    print('Accuracy:',(acc))
    print('Precision: ', prec)
    print('Recall: ', rec)

    X_c = model.transform(X)
    return clf, X_c

Source: [https://www.kaggle.com/laowingkin/amazon-fine-food-review-sentiment-analysis](https://www.kaggle.com/laowingkin/amazon-fine-food-review-sentiment-analysis)

In [None]:
# tf-idf over the token lists: the 'preprocessor' joins each token list back into one string
tfidf = TfidfVectorizer(preprocessor=' '.join, stop_words='english', lowercase=True)

# the same vectorizer object is re-fitted inside every text_fit call
print('Dummy classifier')
clf_base, X_c_base = text_fit(X, y, tfidf, DummyClassifier())
print(60*'__')

print('\nLogistic regression')
clf_log, X_c_log = text_fit(X, y, tfidf, LogisticRegression())
print(60*'__')

print('\nNaive Bayes')
clf_NB, X_c_NB = text_fit(X, y, tfidf, MultinomialNB())

print(60*'__')

In [None]:
def print_words(model, clf_model, out):
    """Rank the vectorizer's features by the classifier's coefficients.

    Parameters
    ----------
    model : fitted vectorizer exposing get_feature_names_out() or get_feature_names().
    clf_model : fitted linear classifier; the first row of `coef_` is used.
    out : when equal to 1, print the 20 highest and 20 lowest coefficients.

    Returns
    -------
    DataFrame with columns 'Word' and 'Coefficient', sorted by coefficient
    (descending) and then by word (ascending).
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support both so the notebook runs on either version
    if hasattr(model, 'get_feature_names_out'):
        w = list(model.get_feature_names_out())
    else:
        w = model.get_feature_names()
    coef = clf_model.coef_.tolist()[0]
    coeff_df = pd.DataFrame({'Word': w, 'Coefficient': coef})
    coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[False, True])
    if out == 1:
        print('')
        print('*Top 20 relevant*')
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print('*Top 20 non-relevant*')
        print(coeff_df.tail(20).to_string(index=False))

    return coeff_df

In [None]:
# words with the largest / smallest logistic-regression coefficients
print('Logistic regression:')

_ = print_words(tfidf, clf_log, 1)

## CNN 
Source: https://towardsdatascience.com/cnn-sentiment-analysis-1d16b7c5a0e7

Our problem is a binary classification, so we need to pass our model a two-dimensional output vector. For that, we add two one-hot encoded columns to our data frame.

In [None]:
# relevant variables
df_rel = df[['full_text','tokens','relevant']]
data = df_rel.rename(columns={'relevant': 'label'})

# label -> (relevant, not_relevant); labels other than 0/1 are skipped, as before
one_hot = {0: (0, 1), 1: (1, 0)}
encoded = [one_hot[l] for l in data.label if l in one_hot]
rel = [pair[0] for pair in encoded]
notrel = [pair[1] for pair in encoded]

data['relevant'] = rel
data['not_relevant'] = notrel
data.head()

In [None]:
# splitting data into test and train (10 % held out for testing, fixed seed for reproducibility)
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

In [None]:
# build training vocabulary: all tokens, per-tweet lengths and the sorted set of distinct tokens
all_training_words = []
training_sentence_lengths = []
for tokens in data_train["tokens"]:
    all_training_words.extend(tokens)
    training_sentence_lengths.append(len(tokens))
TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

In [None]:
# build testing vocabulary: all tokens, per-tweet lengths and the sorted set of distinct tokens
all_test_words = []
test_sentence_lengths = []
for tokens in data_test["tokens"]:
    all_test_words.extend(tokens)
    test_sentence_lengths.append(len(tokens))
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

In [None]:
# load google news Word2Vec model (binary word2vec format)
# NOTE(review): the path is a remote URL, so this presumably fetches the file on every run -
# confirm, and consider pointing it at a local copy
word2vec_path = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embedding vectors of the given tokens.

    Tokens that are not in `vector` contribute a random vector
    (`generate_missing=True`) or a zero vector (default) of length `k`.
    An empty token list yields a zero vector of length `k`.
    """
    if len(tokens_list)<1:
        return np.zeros(k)
    # fallback used for out-of-vocabulary tokens
    fallback = np.random.rand if generate_missing else np.zeros
    vectorized = []
    for word in tokens_list:
        vectorized.append(vector[word] if word in vector else fallback(k))
    return np.divide(np.sum(vectorized, axis=0), len(vectorized))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Return one averaged embedding per row of `clean_comments['tokens']`, as a list."""
    return [get_average_word2vec(tokens, vectors, generate_missing=generate_missing)
            for tokens in clean_comments['tokens']]

In [None]:
# get embeddings: averaged word2vec vector per training tweet (random vectors for unknown tokens)
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [None]:
# parameters
MAX_SEQUENCE_LENGTH = 40  # length the token-id sequences are padded/truncated to
EMBEDDING_DIM = 300  # width of the embedding matrix rows

In [None]:
# tokenize and pad sequences (the tokenizer is fitted on the training texts only)
train_texts = data_train['full_text'].tolist()
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# embedding matrix: row i holds the vector of the word with tokenizer index i (row 0 stays zero)
n_rows = len(train_word_index)+1
train_embedding_weights = np.zeros((n_rows, EMBEDDING_DIM))

for word,index in train_word_index.items():
    if word in word2vec:
        train_embedding_weights[index,:] = word2vec[word]
    else:
        # word unknown to word2vec: use a random vector
        train_embedding_weights[index,:] = np.random.rand(EMBEDDING_DIM)

print(train_embedding_weights.shape)

In [None]:
# encode and pad the test texts with the tokenizer fitted on the training texts
test_sequences = tokenizer.texts_to_sequences(data_test["full_text"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Defining CNN
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build and compile a multi-kernel 1D CNN text classifier.

    Parameters
    ----------
    embeddings : weight matrix used to initialise the frozen embedding layer.
    max_sequence_length : length of the padded input sequences.
    num_words : number of rows of the embedding matrix.
    embedding_dim : width of the embedding vectors.
    labels_index : number of output units.

    Returns
    -------
    The compiled Keras model (its summary is printed as a side effect).
    """
    # Import locally under an alias: in this notebook the module-level name `Input`
    # is rebound by the later `from dash.dependencies import Input`, so calling the
    # bare name here would invoke Dash's Input instead of the Keras layer.
    from keras.layers import Input as KerasInput

    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=False)

    sequence_input = KerasInput(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # one convolution + global max-pooling branch per kernel size
    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200,
                        kernel_size=filter_size,
                        activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [None]:
# one-hot label columns, in the order of the model's output units
# ('not_relevant' is the column name created with the one-hot encoding above;
# the previous 'notrelevant' raised a KeyError)
label_names = ['relevant','not_relevant']

x_train = train_cnn_data
y_train = data_train[label_names].values

x_test = test_cnn_data
y_test = data_test[label_names].values

In [None]:
# build the CNN: the embedding matrix has one row per tokenizer index plus the padding row,
# and there is one output unit per label column
model = ConvNet(train_embedding_weights, 
                MAX_SEQUENCE_LENGTH, 
                len(train_word_index)+1, 
                EMBEDDING_DIM, 
                len(list(label_names)))

In [None]:
# Training CNN
num_epochs = 3
batch_size = 32

# clear_session() is intentionally not called here: it resets Keras' global state,
# and the model to be trained has already been built at this point. If a reset is
# needed, call it before building the model.

# 10 % of the training data is held out for validation
hist = model.fit(x_train,
                 y_train,
                 epochs=num_epochs,
                 validation_split=0.1,
                 shuffle=True,
                 batch_size=batch_size)

In [None]:
# predict on the test set and map the strongest output unit back to a 0/1 label
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)
labels = [1, 0]  # output unit 0 -> label 1, output unit 1 -> label 0

prediction_labels = [labels[np.argmax(p)] for p in predictions]

# share of test tweets whose predicted label equals the true label
sum(data_test.label == prediction_labels)/len(prediction_labels)

In [None]:
# class distribution of the test set, as a reference for the accuracy above
data_test.label.value_counts()

In [None]:
# accuracy as reported by Keras on the training data ...
loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
print("Training accuracy: {:.4f}".format(accuracy))

# ... and on the held-out test data
loss, accuracy = model.evaluate(x_test, y_test, verbose=False)
print("Testing accuracy:  {:.4f}".format(accuracy))

In [None]:
# training curves: accuracy (left) and loss (right) per epoch, training vs. validation
sns.set()

acc = hist.history['acc']
val_acc = hist.history['val_acc']
loss = hist.history['loss']
val_loss = hist.history['val_loss']

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
x = range(1, len(acc) + 1)  # epoch numbers, starting at 1
plt.plot(x, acc, 'b', label='Training acc')
plt.plot(x, val_acc, 'r', label='Validation acc')
plt.xticks(x)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(x, loss, 'b', label='Training loss')
plt.plot(x, val_loss, 'r', label='Validation loss')
plt.xticks(x)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# confusion matrix of the CNN on the test set, computed from the true 0/1 labels and the
# predicted 0/1 labels (`y_pred_class` was never defined, and `y_test` is one-hot encoded,
# which confusion_matrix does not accept)
cm = confusion_matrix(data_test['label'], prediction_labels)
print(cm)

# Visualization

In [None]:
# filter only relevant tweets
geo_df = df[~df['geo'].isna()].reset_index(drop=True)
geo_df = geo_df[geo_df['relevant']==1].reset_index(drop=True)


def _as_geo_dict(value):
    """Parse `value` when it is still the string repr of a dict; otherwise return it unchanged.

    `literal_eval` is used instead of `eval` so file content is never executed;
    strings that are not valid literals are left as they are.
    """
    if isinstance(value, str):
        try:
            return literal_eval(value)
        except (ValueError, SyntaxError):
            return value
    return value


# whole-column assignment instead of row-by-row chained indexing
geo_df['geo'] = geo_df['geo'].map(_as_geo_dict)

#geo_df['coords'] = [geo_df['geo'][i]['coordinates'] for i in range(len(geo_df))]
# first coordinate -> 'lat', second -> 'lon'
geo_df['lat'] = [geo['coordinates'][0] for geo in geo_df['geo']]
geo_df['lon'] = [geo['coordinates'][1] for geo in geo_df['geo']]

#geo_df.to_csv('data/geo_tweets.csv',index=False)

## Folium, Leaflet, OpenStreetMap

In [None]:
# aliased imports: the bare name `html` is bound to dash_html_components in this notebook
from html import escape as html_escape, unescape as html_unescape


def _popup_text(value):
    """Return `value` as text that is safe to embed in the popup's raw HTML.

    Existing HTML entities are decoded first so they are not escaped twice.
    """
    return html_escape(html_unescape(str(value)))


# base map
m = folium.Map([20.416775, -3.70379], tiles=None, zoom_start=2)

# tile layers
folium.TileLayer('cartodbpositron', show=True, name="light").add_to(m)
folium.TileLayer('cartodbdark_matter', show=False, name="dark").add_to(m)
folium.TileLayer('openstreetmap', show=False, name="color").add_to(m)

# add location marker cluster (added to the map once, here)
mc = MarkerCluster(name='Tweets').add_to(m)

# create marker at locations
for lat, lon, user_location, full_text, created_at, retweet_count in zip(geo_df['lat'], geo_df['lon'], geo_df['user_location'],
                                     geo_df['full_text'], geo_df['created_at'], geo_df['retweet_count']):
    # script=True embeds the string as raw HTML, so the user-generated fields
    # (tweet text, profile location) are escaped before being inserted
    text = folium.Html('Tweet: {}<br> User location: {}<br> Created at: {}<br> Retweet count: {}<br>'.format(
        _popup_text(full_text), _popup_text(user_location), created_at, retweet_count), script=True)
    popup = folium.Popup(text, max_width=300)
    folium.CircleMarker(location = [lat, lon],
                        radius = 2,
                        weight = 5,
                        color = '#081d58',
                        fill_color = '#081d58',
                        fill = True,
                        popup = popup,
                        tooltip = 'Click on Tweet'
                        ).add_to(mc)

# add layer control
folium.LayerControl('topright', collapsed=True).add_to(m)

m

## kepler.gl

https://medium.com/nightingale/how-to-create-eye-catching-maps-with-python-and-kepler-gl-e7e897eff8ac

In [None]:
# kepler.gl map of the geotagged relevant tweets, restricted to the columns used for display
viz_cols = ['lon','lat','created_at','full_text','user_location']
kepler_map = keplergl.KeplerGl(height=500)
kepler_map.add_data(data=geo_df[viz_cols], name="Extreme weather events")
#kepler_map.save_to_html(file_name="kepler_map.html")
kepler_map

In [None]:
#config = kepler_map.config
#config

## Plotly, Mapbox

In [None]:
# Mapbox access token: read from the environment, or from a local (untracked)
# `.mapbox_token` file, instead of hard-coding the credential in the notebook
mapbox_token = os.environ.get('MAPBOX_TOKEN')
if not mapbox_token and os.path.exists('.mapbox_token'):
    with open('.mapbox_token') as token_file:
        mapbox_token = token_file.read().strip()
if not mapbox_token:
    raise RuntimeError('Mapbox token not found: set the MAPBOX_TOKEN environment variable '
                       'or create a .mapbox_token file')
px.set_mapbox_access_token(mapbox_token)

In [None]:
# point map: one teal dot per geotagged relevant tweet, tweet text as hover title
scatter_map = px.scatter_mapbox(geo_df, lat="lat", lon="lon", hover_name="full_text", 
                        hover_data=["created_at", "user_location",'retweeted'],
                        color_discrete_sequence=["teal"], 
                        zoom=1, height=500)
scatter_map.update_layout(mapbox_style="carto-positron",margin={"r":0,"t":0,"l":0,"b":0})
scatter_map.show()

In [None]:
# same point map on the dark style, coloured by retweet_count on a continuous teal scale
scatter_map = px.scatter_mapbox(
    geo_df, lat="lat", lon="lon", 
    hover_data=['full_text',"created_at","user_location"],
    color = 'retweet_count',
    color_continuous_scale='teal',
    zoom=1, height=500)
scatter_map.update_layout(mapbox_style="dark",
                          margin={"r":0,"t":0,"l":0,"b":0})
scatter_map.show()

In [None]:
# size by retweet_count (marker size capped at 15), colour by the retweeted flag
scatter_map = px.scatter_mapbox(
    geo_df, lat="lat", lon="lon", 
    size = 'retweet_count',
    size_max = 15,
    color='retweeted',
    hover_data=['full_text'],
    #color_discrete_sequence=["teal",""],
    zoom=1, height=500)
scatter_map.update_layout(mapbox_style="dark",
                          margin={"r":0,"t":0,"l":0,"b":0})
scatter_map.show()

In [None]:
# color by retweet_count
# Dash app with a dropdown of all named Plotly colour scales and a map that uses the selection
colorscales = px.colors.named_colorscales()
#mapbox_token = open(".mapbox_token").read()

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = JupyterDash(__name__, external_stylesheets=external_stylesheets,
                  meta_tags=[{"name": "viewport", "content": "width=device-width, initial-scale=1"}])

server = app.server
#app.title = tabtitle
app.config.suppress_callback_exceptions = True

# layout: label, colour-scale dropdown (default 'teal') and the map graph
app.layout = html.Div([
    html.P("Color Scale"),
    dcc.Dropdown(id='colorscale',
                 options=[{"value": x, "label": x} for x in colorscales],
                 value='teal'),
    dcc.Graph(id="scatter_map"),
])

@app.callback(
    Output("scatter_map", "figure"),
    [Input("colorscale", "value")])
def change_colorscale(scale):
    fig = px.scatter_mapbox(geo_df, lat="lat", lon="lon",
                            color='retweet_count',
                            hover_data=["full_text", "user_location"],
                            color_continuous_scale=scale,
                            zoom=1, height=500)
    fig.update_layout(mapbox_style="dark",
                      margin={"r": 0, "t": 0, "l": 0, "b": 0},
                      hovermode='closest',
                      mapbox=dict(accesstoken=mapbox_token,
                                  bearing=0,
                                  pitch=0))
    return fig

app.run_server(mode='external', port=8060, use_reloader=False)

In [None]:
# Hexagonal density map of tweet locations, with the raw points overlaid
hexabin_map = ff.create_hexbin_mapbox(
    data_frame=geo_df[['lat', 'lon']],
    lat="lat",
    lon="lon",
    nx_hexagon=25,
    opacity=0.5,
    labels={"color": "Relevant Tweets"},
    min_count=1,
    color_continuous_scale="Teal",
    show_original_data=True,
    height=500,
    zoom=0.95,
    original_data_marker=dict(size=5, opacity=0.7, color="Teal"),
)
hexabin_map.update_layout(
    mapbox_style="carto-positron",
    margin=dict(r=0, t=0, l=0, b=0),
)
hexabin_map.show()

In [None]:
# find number of tweets by date
df['Date'] = pd.to_datetime(df['created_at']).dt.date

# Aggregate in one step so each count stays attached to its own date.
# (Pairing groupby counts, which are ordered by date, with drop_duplicates rows,
# which are in first-appearance order, mislabels counts whenever df is not
# already sorted by date.)
time_df = (
    df.groupby('Date')
      .size()
      .reset_index(name='Count')
      .sort_values(by='Date')
)
# Kept for backward compatibility with cells that read the raw count array
count_dates = time_df['Count'].values

In [None]:
# Daily tweet counts as a line chart with a range slider and quick-range buttons
line_fig = px.line(time_df, x='Date', y='Count', title='Relevant Tweets over time')

range_selector = dict(
    buttons=[
        dict(count=1, label="1m", step="month", stepmode="backward"),
        #dict(count=6, label="6m", step="month", stepmode="backward"),
        #dict(count=1, label="1y", step="year", stepmode="backward"),
        dict(step="all"),
    ]
)
line_fig.update_xaxes(rangeslider_visible=True, rangeselector=range_selector)
line_fig.show()

## Dash

In [None]:
# Minimal Dash app that serves one pre-built figure, styled with the Bootstrap theme.
# NOTE(review): `fig` is not assigned at module level anywhere above this cell in the
# visible part of the notebook (only inside `change_colorscale`), so this cell relies on
# kernel state from elsewhere — confirm which figure is meant (e.g. `scatter_map`, `line_fig`).
app = JupyterDash(external_stylesheets=[dbc.themes.BOOTSTRAP])
app.layout = html.Div([
    dcc.Graph(figure=fig)
])
# Same port (8060) as the colour-scale app started in an earlier cell.
app.run_server(mode='external', port=8060, use_reloader=False) # debug=True

In [None]:
# Stop the Dash server bound to localhost:8060 so the port can be reused.
# NOTE(review): this calls an underscore-prefixed (private) JupyterDash method — verify
# it exists in the installed jupyter_dash version.
app._terminate_server_for_port("localhost", 8060)

In [None]:
# Scratch check of slider labels: month number -> "Mon YYYY", pairing the first twelve
# `created_at_year` values with the months 1..12
{
    month: f'{calendar.month_name[month][:3]} {str(year)[:4]}'
    for year, month in zip(geo_df['created_at_year'], range(1, 13))
}

In [None]:
# Example selection used to try out the area filter
area_select = ['Alberta', 'Queensland']
geo_data = geo_df  # alias, not a copy
geo_data.loc[geo_data["area"].isin(area_select)].head(1)

In [None]:
# Keep only tweets from the selected areas and peek at the first `retweeted` value
filtered_data = geo_data.loc[geo_data["area"].isin(area_select)]
filtered_data["lat"].tolist()
retweeted = filtered_data["retweeted"].tolist()
retweeted[0]

In [None]:
# Distinct area names and the matching dropdown option dicts
areas = geo_df["area"].unique()
options = [{"label": i, "value": i} for i in areas]
# NOTE(review): eval() executes every area value as Python source. For plain names such
# as 'Alberta' this would raise NameError, and it is unsafe on data loaded from a file.
# If the values are quoted literals, prefer ast.literal_eval — confirm the intent.
print([eval(areas[i]) for i in range(len(areas))])

In [1]:
import ast 
import calendar
import os
from ast import literal_eval

import dash
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from dash.dependencies import Input, Output, State #ClientsideFunction
from dash.exceptions import PreventUpdate
from jupyter_dash import JupyterDash # only in nb

In [2]:
# Initiate app
# app = dash.Dash()
app = JupyterDash(
    __name__,
    meta_tags=[{
            "name": "viewport",
            "content": "width=device-width, initial-scale=1, maximum-scale=1.0, user-scalable=no",
    }],
)
server = app.server
# Callbacks below target components that are created or replaced at runtime
app.config.suppress_callback_exceptions = True

#external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
githublink='https://github.com/s153748/extreme-weather-detection'

# The Mapbox access token is a credential: read it from the environment rather than
# committing it to the notebook. Export MAPBOX_ACCESS_TOKEN before starting Jupyter.
mapbox_access_token = os.environ.get('MAPBOX_ACCESS_TOKEN', '')
if not mapbox_access_token:
    print('MAPBOX_ACCESS_TOKEN is not set; the Mapbox-styled map will not render.')

# Load data
df = pd.read_csv('data/final_labelled_tweets.csv')


def _parse_literal(value):
    """Parse a stringified Python literal (e.g. a dict) read from the CSV.

    Returns the parsed object, or NaN when the cell is missing or is not a valid
    literal. ``literal_eval`` is used instead of ``eval`` so that file contents are
    never executed as code.
    """
    if not isinstance(value, str):
        return np.nan
    try:
        return literal_eval(value)
    except (ValueError, SyntaxError):
        return np.nan


# Data prep
df = df.dropna(subset=['tokens'])
df['tokens'] = df['tokens'].map(literal_eval)
# Parse whole columns at once: row-wise `df[col][i] = ...` is chained assignment and
# looks rows up by label, which skips or misses rows once dropna has left index gaps.
df['geo'] = df['geo'].map(_parse_literal)
df['place'] = df['place'].map(_parse_literal)

# Relevant tweets that carry a parsed geo object
geo_df = df[df['geo'].notna() & (df['relevant'] == 1)].reset_index(drop=True)

# Get coordinates: element 0 of each geo 'coordinates' pair is stored as lat, element 1 as lon
geo_points = [geo['coordinates'] for geo in geo_df['geo']]
geo_df['lat'] = [point[0] for point in geo_points]
geo_df['lon'] = [point[1] for point in geo_points]

# Find number of tweets by date
df['Date'] = pd.to_datetime(df['created_at']).dt.date

# Count rows per date, then look each date's count up by value. Assigning the
# date-sorted groupby counts positionally onto drop_duplicates rows (which are in
# first-appearance order) mislabels counts whenever df is not sorted by date.
date_counts = df['Date'].value_counts()
count_dates = date_counts.sort_index().values  # kept for backward compatibility
time_df = (
    df.drop_duplicates(subset="Date")
      .assign(Count=lambda frame: frame['Date'].map(date_counts))
      .sort_values(by='Date')
)

# Set graph options: the available map types and the distinct areas in the geo data
graph_list = ["Point map", "Hexagon map"]
area_list = pd.unique(geo_df["area"])

def build_upper_left_panel():
    """Build the control panel shown in the upper-left of the dashboard.

    The panel holds an instruction line, the graph-type dropdown, and the area
    controls (a "select all" checklist plus a multi-select dropdown whose options
    and value are supplied by callbacks).

    Returns
    -------
    html.Div
        The assembled panel with id ``upper-left``.
    """
    instructions = html.P(
        className="section-title",
        children="Choose graph type or specific areas to inspect for the lists below",
    )

    # Graph type selector, defaulting to the first entry of graph_list
    graph_dropdown = dcc.Dropdown(
        id="graph-select",
        options=[{"label": graph, "value": graph} for graph in graph_list],
        value=graph_list[0],
    )
    graph_row = html.Div(
        className="control-row-1",
        children=[
            html.Div(
                id="graph-select-outer",
                children=[html.Label("Select Graph Type"), graph_dropdown],
            ),
        ],
    )

    # Area selection: "select all" checkbox plus the multi-select dropdown
    select_all_box = html.Div(
        id="checklist-container",
        children=dcc.Checklist(
            id="area-select-all",
            options=[{"label": "Select All Areas", "value": "All"}],
            value=[],
        ),
    )
    area_dropdown = html.Div(
        id="area-select-dropdown-outer",
        children=dcc.Dropdown(id="area-select", multi=True, searchable=True),
    )
    area_row = html.Div(
        id="area-select-outer",
        className="control-row-2",
        children=[html.Label("Select Area"), select_all_box, area_dropdown],
    )

    return html.Div(
        id="upper-left",
        className="four columns",
        children=[instructions, graph_row, area_row],
    )

def generate_geo_map(geo_data, month_select, graph_select, area_select):
    """Build the map figure for one month, one graph type and a set of areas.

    Parameters
    ----------
    geo_data : pandas.DataFrame
        Geotagged tweets; the columns read here are ``created_at_month``, ``area``,
        ``lat``, ``lon``, ``retweet_count`` and ``full_text``.
    month_select : int
        Month number to keep (compared with ``created_at_month``).
    graph_select : str
        ``'Point map'`` draws a scatter map; any other value draws a hexbin map.
    area_select : list of str or None
        Areas to keep. ``None`` (the dropdown's state before any selection) is
        treated as an empty selection.

    Returns
    -------
    plotly figure
        The styled map. When no rows match the filters, an empty map centred on
        (0, 0) is returned instead of plotting an empty frame.
    """
    # The area dropdown reports None until a value exists; isin(None) would raise.
    if area_select is None:
        area_select = []

    in_month = geo_data["created_at_month"] == month_select
    in_area = geo_data["area"].isin(area_select)
    filtered_data = geo_data[in_month & in_area]

    if filtered_data.empty:
        # Nothing to draw: use an empty map trace and a fixed centre, because the
        # mean of an empty column is NaN and is not a valid map centre.
        fig = go.Figure(go.Scattermapbox())
        center_lat, center_lon = 0, 0
    else:
        center_lat = filtered_data.lat.mean()
        center_lon = filtered_data.lon.mean()
        if graph_select == 'Point map':
            fig = px.scatter_mapbox(filtered_data,
                                    lat="lat",
                                    lon="lon",
                                    color='retweet_count',
                                    size='retweet_count',
                                    size_max=15,
                                    height=500,
                                    width=800,
                                    hover_data=["full_text"],
                                    color_continuous_scale='teal')
        else:
            fig = ff.create_hexbin_mapbox(data_frame=filtered_data,
                                          lat="lat",
                                          lon="lon",
                                          nx_hexagon=25,
                                          opacity=0.5,
                                          labels={"color": "Relevant Tweets"},
                                          min_count=1,
                                          color_continuous_scale="teal",
                                          show_original_data=True,
                                          height=500,
                                          width=800,
                                          original_data_marker=dict(size=5, opacity=0.7, color="teal")
            )

    # Shared dark styling and camera, centred on the filtered tweets
    fig.update_layout(
        margin=dict(l=10, r=10, t=20, b=10, pad=5),
        plot_bgcolor="#171b26",
        paper_bgcolor="#171b26",
        clickmode="event+select",
        hovermode="closest",
        showlegend=False,
        mapbox=go.layout.Mapbox(
            accesstoken=mapbox_access_token,
            bearing=10,
            center=go.layout.mapbox.Center(lat=center_lat, lon=center_lon),
            pitch=5,
            zoom=2,
            style="mapbox://styles/plotlymapbox/cjvppq1jl1ips1co3j12b9hex",
        )
    )

    return fig

def generate_line_chart(time_data):
    """Build the tweets-over-time line chart.

    Parameters
    ----------
    time_data : pandas.DataFrame
        Frame with a ``Date`` column (x axis) and a ``Count`` column (y axis).

    Returns
    -------
    plotly figure
        Line chart with a range slider, 1m/6m/1y/all range buttons and the
        dashboard's dark background.
    """
    range_buttons = [
        dict(count=1, label="1m", step="month", stepmode="backward"),
        dict(count=6, label="6m", step="month", stepmode="backward"),
        dict(count=1, label="1y", step="year", stepmode="backward"),
        dict(step="all"),
    ]

    fig = px.line(time_data, x='Date', y='Count', title='Relevant Tweets Over Time')
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(buttons=range_buttons),
    )
    fig.update_layout(plot_bgcolor="#171b26", paper_bgcolor="#171b26", showlegend=False)
    return fig

# Set up the layout
# Page structure: a banner, an upper row (control panel + map with month slider) and a
# lower row (tweets-over-time line chart).
app.layout = html.Div(
    className="container scalable",
    children=[
        # Banner: dashboard title and a link to the project repository
        html.Div(
            id="banner",
            className="banner",
            children=[
                html.H6("Extreme Weather Event Detection"),
                #html.Img(src=app.get_asset_url("plotly_logo_white.png")),
                html.A('View on Github', href=githublink),
            ],
        ),
        # Upper row: controls on the left, map on the right
        html.Div(
            id="upper-container",
            className="row",
            children=[
                build_upper_left_panel(),
                html.Div(
                    id="geo-map-outer",
                    className="four columns",
                    children=[
                        # Title text is rewritten by the update_area_dropdown callback
                        html.P(
                            id="map-title",
                            children="Spatial Development of Relevant Tweets"
                        ),
                        html.Div(
                            id="geo-map-loading-outer",
                            children=[
                                dcc.Loading(
                                    id="loading",
                                    children=[
                                        # Empty dark placeholder; the figure is supplied
                                        # by the update_geo_map callback
                                        dcc.Graph(
                                            id="geo-map",
                                            figure={
                                                "data": [],
                                                "layout": dict(
                                                    plot_bgcolor="#171b26",
                                                    paper_bgcolor="#171b26",
                                                ),
                                            },
                                        ),
                                        # Month selector spanning the months present in geo_df.
                                        # Marks are keyed by month number only, so when several
                                        # years share a month the last row's year is the one shown.
                                        dcc.Slider(
                                            id='month-slider',
                                            min=geo_df['created_at_month'].min(),
                                            max=geo_df['created_at_month'].max(),
                                            value=geo_df['created_at_month'].min(),
                                            marks={int(month): f'{calendar.month_name[int(month)][:3]} {str(year)[:4]}' for year, month in zip(
                                                geo_df['created_at_year'], geo_df['created_at_month'])},
                                            step=None
                                        ),
                                    ]
                                )
                            ],
                        ),
                    ],
                ),
            ],
        ),
       # Lower row: line chart built once from time_df when the layout is created
       html.Div(
            id="lower-container",
            className="row",
            children=[
                html.Div(
                    id="line-chart-outer",
                    className="four columns",
                    children=[
                        html.P(
                            id="line-chart-title",
                            children="Temporal Development of Relevant Tweets"
                        ),
                        html.Div(
                            id="line-chart-loading-outer",
                            children=[
                                dcc.Loading(
                                    id="loading-line-chart",
                                    children=[
                                        dcc.Graph(
                                            id="line-chart",
                                            figure=generate_line_chart(time_df)
                                        )
                                    ]
                                )
                            ]
                        )
                    ]
                )
            ]
        )
    ]
)

@app.callback(
    [
        Output("area-select", "value"),
        Output("area-select", "options"),
        Output("map-title", "children"),
    ],
    [Input("area-select-all", "value"), Input("area-select", "value"),],
)
def update_area_dropdown(select_all, area_select):
    """Keep the area dropdown's options, value and the map title in sync.

    Parameters
    ----------
    select_all : list
        Value of the "Select All Areas" checklist (``["All"]`` when ticked).
    area_select : list or None
        Areas currently selected in the dropdown; ``None`` before any value exists.

    Returns
    -------
    tuple
        ``(value, options, title)``. ``value`` is ``dash.no_update`` whenever the
        user's own selection should be left untouched.
    """
    areas = geo_df["area"].unique()
    options = [{"label": i, "value": i} for i in areas]
    # The dropdown has no value on first load, so never iterate over a bare None
    current = list(area_select) if area_select is not None else []

    ctx = dash.callback_context
    trigger_id = ctx.triggered[0]["prop_id"].split(".")[0]

    if trigger_id == "area-select-all":
        if select_all == ["All"]:
            value = [i["value"] for i in options]
            shown = value
        else:
            value = dash.no_update
            shown = current
    elif trigger_id == "area-select":
        # The user edited the dropdown: keep their choice instead of resetting it
        value = dash.no_update
        shown = current
    else:
        # Initial load: preselect the first four areas
        value = list(areas[:4])
        shown = value

    # Title lists the areas that will actually be selected after this update
    title = "Relevant Tweets in {}".format(', '.join(str(area) for area in shown))
    return value, options, title

@app.callback(
    Output("checklist-container", "children"),
    [Input("area-select", "value")],
    [State("area-select", "options"), State("area-select-all", "value")],
)
def update_checklist(selected, select_options, checked):
    """Tick or untick the "Select All Areas" box to match the dropdown selection.

    Parameters
    ----------
    selected : list or None
        Areas currently selected in the dropdown.
    select_options : list or None
        All options offered by the dropdown.
    checked : list or None
        Current value of the "select all" checklist.

    Returns
    -------
    dcc.Checklist
        A replacement checklist, unticked when only part of the areas is selected
        and ticked otherwise.

    Raises
    ------
    PreventUpdate
        When the checklist already reflects the selection, or when the dropdown
        value/options have not been populated yet.
    """
    # Before the dropdown is populated its value/options are None; nothing to sync
    if selected is None or select_options is None:
        raise PreventUpdate()
    if checked is None:
        checked = []

    def _select_all_box(value):
        # Fresh checklist carrying the requested ticked state
        return dcc.Checklist(
            id="area-select-all",
            options=[{"label": "Select All Areas", "value": "All"}],
            value=value,
        )

    partial = len(selected) < len(select_options)
    complete = len(selected) == len(select_options)

    if partial and len(checked) == 0:
        raise PreventUpdate()
    elif partial and len(checked) == 1:
        return _select_all_box([])
    elif complete and len(checked) == 1:
        raise PreventUpdate()
    return _select_all_box(["All"])

@app.callback(
    Output('geo-map', 'figure'),
    [
        Input('month-slider', 'value'),
        Input("graph-select", "value"),
        Input("area-select", "value"),
    ],
)
def update_geo_map(month_select, graph_select, area_select):
    """Redraw the map whenever the month slider, graph type or area selection changes."""
    figure = generate_geo_map(
        geo_data=geo_df,
        month_select=month_select,
        graph_select=graph_select,
        area_select=area_select,
    )
    return figure

# Start the dashboard on port 8060 with the reloader disabled; mode='external' is the
# JupyterDash option used throughout this notebook.
app.run_server(mode='external', port=8060, use_reloader=False)
#app.run_server()



Dash app running on http://127.0.0.1:8060/


In [10]:
# Scratch check of the map title; an empty selection simply joins to an empty string
"Relevant Tweets in {}".format(', '.join(str(area) for area in area_select))

'Relevant Tweets in Alberta'

In [9]:
# Sample selection for the title check. NOTE(review): this cell is numbered In [9] but
# appears after the In [10] cell that reads `area_select`, i.e. the cells were run out of order.
area_select = ['Alberta']

In [None]:
# Two bare string expressions (candidate titles); nothing is assigned, and only the
# last one is echoed as the cell output.
"AI for Climate Adaptation (AI4CA)"
"Identification and Exploration of Extreme Weather Events From Twitter Data"

In [None]:
# Placeholder (label, value) pairs shared by the three demo inputs below
_DEMO_CHOICES = (
    ('Value One', 'value1'),
    ('Value Two', 'value2'),
    ('Value Three', 'value3'),
)
_CENTERED = {'textAlign': 'center'}

# Sidebar form: a multi-select dropdown, a checklist, radio items and a submit button
controls = dbc.FormGroup(
    [
        html.P('Dropdown', style={'textAlign': 'center'}),
        dcc.Dropdown(
            id='dropdown',
            options=[{'label': label, 'value': value} for label, value in _DEMO_CHOICES],
            value=['value1'],  # default value
            multi=True,
        ),
        html.Br(),
        html.P('Check Box', style=dict(_CENTERED)),
        dbc.Card([
            dbc.Checklist(
                id='check_list',
                options=[{'label': label, 'value': value} for label, value in _DEMO_CHOICES],
                value=['value1', 'value2'],
                inline=True,
            )
        ]),
        html.Br(),
        html.P('Radio Items', style=dict(_CENTERED)),
        dbc.Card([
            dbc.RadioItems(
                id='radio_items',
                options=[{'label': label, 'value': value} for label, value in _DEMO_CHOICES],
                value='value1',
                style={'margin': 'auto'},
            )
        ]),
        html.Br(),
        dbc.Button(
            id='submit_button',
            n_clicks=0,
            children='Submit',
            color='primary',
            block=True,
        ),
    ]
)


# Sidebar: heading, divider and the control form
sidebar = html.Div(
    children=[
        html.H2('Config', style=TEXT_STYLE),
        html.Hr(),
        controls,
    ],
    style=SIDEBAR_STYLE,
)

# Content rows. They are defined before `content`, which references them; defining
# `content` first raises NameError on a fresh kernel.

# First row: four equally wide placeholder cards. Only the first card carries ids.
content_first_row = dbc.Row(
    [
        dbc.Col(
            dbc.Card(
                [
                    dbc.CardBody(
                        [
                            html.H4(id='card_title_1', children=['Card Title 1'], className='card-title',
                                    style=CARD_TEXT_STYLE),
                            html.P(id='card_text_1', children=['Sample text.'], style=CARD_TEXT_STYLE),
                        ]
                    )
                ]
            ),
            md=3
        ),
    ]
    + [
        dbc.Col(
            dbc.Card(
                [
                    dbc.CardBody(
                        [
                            html.H4(card_title, className='card-title', style=CARD_TEXT_STYLE),
                            html.P('Sample text.', style=CARD_TEXT_STYLE),
                        ]
                    ),
                ]
            ),
            md=3
        )
        for card_title in ('Card Title 2', 'Card Title 3', 'Card Title 4')
    ]
)

# Second row: full-width map
content_second_row = dbc.Row(
    [
        dbc.Col(
            dcc.Graph(id='scatter_map'), md=12,
        )
    ]
)

# Third row: two half-width graphs
content_third_row = dbc.Row(
    [
        dbc.Col(
            dcc.Graph(id='line_map'), md=6
        ),
        dbc.Col(
            dcc.Graph(id='graph'), md=6
        )
    ]
)

# Content: page heading followed by the three rows
# (the comma after content_first_row was missing, which is a SyntaxError)
content = html.Div(
    [
        html.H2('Extreme Weather Events Dashboard', style=TEXT_STYLE),
        html.Hr(),
        content_first_row,
        content_second_row,
        content_third_row
    ],
    style=CONTENT_STYLE
)

In [None]:
# Colors used by the dashboard: page background and text
colors = dict(background='#262B3D', text='#FFF')

# Description
def description_card():
    """Return the intro card: dashboard heading plus a short usage hint, in the text colour."""
    text_style = {'color': colors['text']}
    heading = html.H3(children="Extreme Weather Event Detection", style=text_style)
    intro = html.Div(
        id="intro",
        style={'color': colors['text']},
        children="Explore the Tweets identified relevant to a extreme weather event. Click on the map to visualize Tweets at different time points.",
    )
    return html.Div(id="description-card", children=[heading, intro])

In [None]:
# Stand-alone overview map of the geotagged tweets, coloured by retweet count.
fig = px.scatter_mapbox(
    geo_data, lat="lat", lon="lon",
    hover_data=['full_text', "created_at", "user_location"],
    color='retweet_count',
    color_continuous_scale='teal',
    zoom=1,
    height=500
)

# Apply the dashboard styling directly to the figure. The original cell had an indented
# `layout = ...` / `return` at module level (a syntax error), and wrapped the Figure in
# {"data": fig, "layout": layout}, which is not a valid figure specification.
fig.update_layout(
    margin=dict(l=10, r=10, t=20, b=10, pad=5),
    plot_bgcolor="#171b26",
    paper_bgcolor="#171b26",
    clickmode="event+select",
    hovermode="closest",
    showlegend=False,
    mapbox=go.layout.Mapbox(
        accesstoken=mapbox_access_token,
        bearing=10,
        # NOTE(review): the map plots geo_data but is centred on filtered_data — confirm
        center=go.layout.mapbox.Center(
            lat=filtered_data.lat.mean(), lon=filtered_data.lon.mean()
        ),
        pitch=5,
        zoom=1,
        style="mapbox://styles/plotlymapbox/cjvppq1jl1ips1co3j12b9hex",
    ),
)
fig