# Library

## PIP

In [1]:
pip install -q spacy

In [2]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.1MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180945 sha256=439467dd468a78d996a4a00c04e738c1be09dd0081e099571df57bf27c8f3e79
  Stored in directory: /tmp/pip-ephem-wheel-cache-4ozd8h8j/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
pip install -q git+https://github.com/LIAAD/yake

[?25l[K     |███▌                            | 10kB 30.3MB/s eta 0:00:01[K     |███████                         | 20kB 34.9MB/s eta 0:00:01[K     |██████████▌                     | 30kB 37.3MB/s eta 0:00:01[K     |██████████████                  | 40kB 25.0MB/s eta 0:00:01[K     |█████████████████▌              | 51kB 21.3MB/s eta 0:00:01[K     |█████████████████████           | 61kB 22.5MB/s eta 0:00:01[K     |████████████████████████▌       | 71kB 21.0MB/s eta 0:00:01[K     |████████████████████████████    | 81kB 19.9MB/s eta 0:00:01[K     |███████████████████████████████▌| 92kB 21.4MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 10.3MB/s 
[?25h  Building wheel for yake (setup.py) ... [?25l[?25hdone
  Building wheel for segtok (setup.py) ... [?25l[?25hdone


In [4]:
pip install -q geopy

## Import

In [5]:
# System
from collections import Counter
from string import punctuation
import pandas as pd
import math
import statistics
import copy
import functools
import operator
import itertools
import re
import json
import string
import numpy as np
from datetime import datetime, date
from itertools import product, permutations
import time
import traceback

# NLP
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import spacy
import en_core_web_lg
import yake
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('universal_tagset')

# Locations
from geopy.geocoders import Nominatim

# Event Merging
from sklearn.cluster import DBSCAN

# Evaluation
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from geopy import distance

# Utilities
from pprint import pprint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


## Global Variables

In [6]:
spacy_nlp = en_core_web_lg.load()
geolocator = Nominatim(user_agent="thesis_shandy", timeout=None)
lmtzr = WordNetLemmatizer()
stop_words = stopwords.words('english')
event_merging_pairs = []

## Global Constants

In [25]:
PATH_LOC_CITIES = "./datasets/cities.json"
PATH_LOC_COUNTRIES = "./datasets/countries.csv"
PATH_DATASET_AYLIEN = "./datasets/aylien.json"
PATH_EVAL_HAILSTORM = "./evaluations/hailstorm.csv"

## Global Class

In [26]:
class LocationApi:
    # Constants
    PATH_CITIES = PATH_LOC_CITIES
    PATH_CENTROIDS = PATH_LOC_COUNTRIES
    df_cities = None
    df_countries = None

    def __init__(self):
        self.initialize()

    def initialize(self):
        # Initialize Cities
        self.df_cities = pd.read_json(self.PATH_CITIES, lines=True)

        # Initialize countries
        centroids = pd.read_csv(self.PATH_CENTROIDS)
        self.df_countries = centroids[['name', 'Longitude', 'Latitude']]

    # Utilities
    def csim_loc(self, type_loc, name):
        if type_loc == 'city':
            df = self.df_cities[self.df_cities['name'].str.startswith(name[:3].capitalize())]
        else: # type_loc == 'country':
            df = self.df_countries[self.df_countries['name'].str.startswith(name[:3].capitalize())]

        sentences = df['name'].tolist()
        sentences.append(name.capitalize())
        vectorizer = CountVectorizer().fit_transform(sentences)
        vectors = vectorizer.toarray()
        csim = cosine_similarity(vectors)
        csim = csim[-1].tolist()
        csim.pop()
        return df, csim

    def coord_loc(self, type_loc, name):
        df, csim = self.csim_loc(type_loc, name)
        df_res = df.iloc[csim.index(max(csim))]
        return df_res

    # Mains
    def is_country(self, name):
        df, csim = self.csim_loc('country', name)
        return any(x >= 0.5 for x in csim)

    def is_city(self, name):
        df, csim = self.csim_loc('city', name)
        return any(x >= 0.5 for x in csim)

    def get_coord_city(self, name):
        df = self.coord_loc('city', name)
        return {
            "city": df['name'],
            "country": df['country'],
            "lat": df['lat'],
            "lng": df['lng']
        }

    def get_coord_country(self, name):
        df = self.coord_loc('country', name)
        return {
            "city": "",
            "country": df['name'],
            "lat": df['Latitude'],
            "lng": df['Longitude']
        }

## Helper Functions

In [8]:
def preprocessing_text(text):
    text = text.lower() # Lowercase
    text = re.sub(r'\d+', '', text) # Remove number
    text = text.translate(str.maketrans(dict.fromkeys(string.punctuation)))  
    text = text.strip() # Remove whitespace
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens_raw = word_tokenize(text)
    tokens = [i for i in tokens_raw if not i in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens_lemma = []
    for word in tokens:
        tokens_lemma.append(lemmatizer.lemmatize(word))
    return " ".join(tokens_lemma)

In [9]:
def days_between(d1, d2):
    return abs((d2 - d1).days)

In [10]:
def ll_to_cartesian(lat, lon):
    x = np.cos(lat) * np.cos(lon)
    y = np.cos(lat) * np.sin(lon)
    z = np.sin(lat)
    return x, y, z

def cartesian_to_ll(x, y, z):
    lon = np.arctan2(y, x)
    hyp = np.sqrt( (x*x) + (y*y))
    lat = np.arctan2(z, hyp)
    lat = lat * 180 / 3.14
    lon = lon * 180 / 3.14
    return lat, lon

# Implementation

## TFIDF Dictionary

In [11]:
# Return feature names and tfidf
def create_tfidf_dict(df_news):
    corpus = [preprocessing_text(body) for body in df_news['body'].tolist()]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    tfidf_dict = {
        "features": vectorizer.get_feature_names(),
        "scores": X.toarray()
    }
    return tfidf_dict

## Keyword Extraction

In [12]:
def keywords_extraction_original(texts):
    keywords = []
    
    extractor = yake.KeywordExtractor(n=1, top=10)

    for text in texts:
        kw_yake = extractor.extract_keywords(text)
        kw_yake.reverse()
        kw_text = []
        for kw in kw_yake:
            kw_text.append(kw[0])
        keywords.append(kw_text)
    
    return keywords

def keywords_extraction_enhanced(texts, tfidf_dict):
    keywords = []
    
    extractor = yake.KeywordExtractor(n=1, top=10)

    for idx, text in enumerate(texts):

        # Extract keyword using Yake
        kw_yake = []
        for kw in extractor.extract_keywords(text):
            try:
                kw_text = kw[0]
                kw_index = tfidf_dict['features'].index(kw_text)
                tfidf_score = tfidf_dict['scores'][idx][kw_index]
                if tfidf_score > 0.1:
                    kw_yake.append(kw_text)
            except ValueError:
                continue
                
        keywords.append(kw_yake)

    return keywords

## Keyword Similarity

In [13]:
# word1 and word2 are string
def is_words_similar(word1, word2):
    # WUP
    try:
        syn1 = wn.synsets(word1)[0]
        syn2 = wn.synsets(word2)[0]

        # Return false if one of the two words has a/r (adjective/adverb) POS
        allowed_pos = ['n', 'v']
        if (syn1.pos() not in allowed_pos) or (syn2.pos() not in allowed_pos):
            return False

        # return false if the two words have different POS
        if syn1.pos() != syn2.pos():
            return False

        if syn1.wup_similarity(syn2) > 0.9:
            return True
        else:
            return False
    except IndexError: # One of the keywords doesn't has meaning (ex: kenneth)
        return False

## Hot Event Detection

In [14]:
def similarity_original(keywords_article, keywords_event):
    # Get the sum of weights in event keywords
    ev_sum = 0
    same_sum = 0

    for ev_key in keywords_event:
        ev_sum = ev_sum + ev_key['weight']

        for ar_key in keywords_article:
            if ar_key['keyword'] == ev_key['keyword']:
                same_sum = same_sum + ev_key['weight']
    
    try:
        return (same_sum/ev_sum)
    except ZeroDivisionError:
        return 0

def similarity_enhanced(keywords_article, keywords_event):
    # Get the sum of weights in event keywords
    ev_sum = 0
    same_sum = 0

    for ev_key in keywords_event:
        ev_sum = ev_sum + ev_key['weight']

        for ar_key in keywords_article:
            if is_words_similar(ev_key['keyword'], ar_key['keyword']):
                same_sum = same_sum + ev_key['weight']

    # Calculate similarity
    try:
        return(same_sum/ev_sum)
    except ZeroDivisionError:
        return 0

def hot_event_detection(df_article, threshold, attenuation, semantic_similarity):
    event_library = []
    event_index_df = []
    event_index_df_sim = []


    for article_index, article in df_article.iterrows():
        article_keywords = article['keywords']

        # First news is stored as an event in the event library
        if len(event_library) == 0:
            event_library.append(article_keywords)
            event_index_df.append(0)
            event_index_df_sim.append(1)
        else:
            max_sim_score = -999
            max_sim_event = -1
            
            # Count article similarity to each event 
            for event_index, event_keywords in enumerate(event_library):
                # Count similarity
                if semantic_similarity == True:
                  sim = similarity_enhanced(article_keywords, event_keywords)
                else:
                  sim = similarity_original(article_keywords, event_keywords)

                if sim > max_sim_score:
                    max_sim_score = sim
                    max_sim_event = event_index
            
            # Compare max similarity score to threshold
            if max_sim_score < threshold:
                # Store article as a new event
                event_library.append(article_keywords)
                event_index_df.append(len(event_library)-1)
                event_index_df_sim.append(1)
            else:
                # Adjust event's keywords weight
                changed_weight_index = [] # Get indexes of changed keywords
                event_keywords = event_library[max_sim_event]

                for i in range(len(event_keywords)):
                    # If a keyword in event is same with a keyword in article
                    # Increment the weight of that event keyword weight by 1
                    for article_keyword in article_keywords:
                        event_keyword_text = event_keywords[i]['keyword']
                        article_keyword_text = article_keyword['keyword']

                        if semantic_similarity == True:
                          if is_words_similar(event_keyword_text, article_keyword_text):
                              event_library[max_sim_event][i]['weight'] += 1 # Increment weight
                              changed_weight_index.append(i)
                        else:
                          if event_keyword_text == article_keyword_text:
                              event_library[max_sim_event][i]['weight'] += 1 # Increment weight
                              changed_weight_index.append(i)
                
                # Attenuate the rest of unchanged event keyword by attenuation rate
                for i in range(len(event_library[max_sim_event])):
                    if i in changed_weight_index:
                        continue
                    else:
                        event_library[max_sim_event][i]['weight'] -= attenuation

                # Remove the keywords that have weight under attenuation rate
                new_keywords = []
                for keyword in event_library[max_sim_event]:
                    if keyword['weight'] > attenuation:
                        new_keywords.append(keyword)
                event_library[max_sim_event] = new_keywords

                # Add event index to list for df
                event_index_df.append(max_sim_event)
                event_index_df_sim.append(max_sim_score)
                

    df_result = df_article.copy(deep=True)
    df_result['event'] = event_index_df
    df_result['similarity'] = event_index_df_sim
    return df_result, dict(enumerate(event_library))

## Locations and Dates

In [15]:
def ner(df_in):
    df_out = copy.deepcopy(df_in)
    locations_all = []

    # Spacy
    for index, article in df_in.iterrows():
        doc = spacy_nlp(article['body'])
        locs = []
        for ent in doc.ents:
            if ent.label_ == "GPE":
                locs.append(ent.text)
        locations_all.append(locs)
    
    df_out['locations'] = locations_all

    return df_out

In [16]:
def get_events_dates(df_news, events_locations):
    dates = {}

    for cluster_idx in events_locations.keys():
        # filter news per event
        df_cluster = df_news[df_news['event'] == int(cluster_idx)]

        potential_date = []
        for index, article in df_cluster.iterrows():
            potential_date.append(article['publication_time'].date())

        # Get the 5 highest frequency dates (not sorted)
        five_most_common_dates = Counter(potential_date).most_common(5)

        dates[cluster_idx] = []
        for com_date in five_most_common_dates:
            dates[cluster_idx].append(com_date)

    return dates

In [17]:
## Locations and Times
def get_events_locations(df_news, events_library):

    #
    # Get Locations From the 5 Highest Frequency
    #
    locations_potential = {}

    for cluster_idx in events_library.keys():
        # Get articles from cluster
        df_cluster = df_news[df_news['event'] == int(cluster_idx)]

        # Flatten the list then count the frequency
        locs = functools.reduce(operator.iconcat, df_cluster['locations'].tolist(), [])
        locs = [x.lower() for x in locs]
        locations_potential[cluster_idx] = Counter(locs).most_common(5)
    
    locations = {}
    locapi = LocationApi()
    for cluster_idx in locations_potential.keys():
        if locations_potential[cluster_idx] == []:
            continue
    
        # Convert locations text to {name, country, lat, lng}
        locations[cluster_idx] = []
        for location in locations_potential[cluster_idx]:
            loc = location[0].capitalize()
            try:
                if locapi.is_city(loc):
                    final_loc = locapi.get_coord_city(loc)
                    final_loc['count'] = location[1]
                    locations[cluster_idx].append(final_loc)
                elif locapi.is_country(loc):
                    final_loc = locapi.get_coord_country(loc)
                    final_loc['count'] = location[1]
                    locations[cluster_idx].append(final_loc)
                else:
                    georesult = geolocator.geocode(loc, language='en')
                    if georesult is not None:
                        locations[cluster_idx].append({
                            "city": loc,
                            "country": georesult.address.split(", ")[-1],
                            "lat": georesult.latitude,
                            "lng": georesult.longitude,
                            "count": location[1]
                        })
            except ValueError:
                continue

    return locations

## Event Merging

In [18]:
def get_events_location_merging(events_library):
    for ev_key in events_library.keys():
        # Get dates with highest freq
        highest_freq = events_library[ev_key]['locations'][0]['count'] # Locations already sorted from highest freq
        highest_freq_list = []
        for loc in events_library[ev_key]['locations']:
            if loc['count'] == highest_freq:
                highest_freq_list.append(loc)
        
        events_library[ev_key]['loc_lat'] = highest_freq_list[0]['lat']
        events_library[ev_key]['loc_lng'] = highest_freq_list[0]['lng']

    return events_library

def get_events_date_merging(df_news, events_library):
    for ev_key in events_library.keys():
        # Get dates with highest freq
        highest_freq = events_library[ev_key]['dates'][0][1] # Dates already sorted from highest freq
        highest_freq_list = []
        for date in events_library[ev_key]['dates']:
            if date[1] == highest_freq:
                highest_freq_list.append(date)
        
        # If date with highest freq only 1, use it
        if len(highest_freq_list) == 1:
            events_library[ev_key]['highest_freq_date'] = highest_freq_list[0][0]
        # More than one dates with the highest freq
        elif len(highest_freq_list) > 1:
            sim_freq = []
            for com_date in highest_freq_list:
                # Get all news in the cluster with the date
                df_cluster = df_news[df_news['event'] == int(ev_key)]
                df_date = df_cluster[df_cluster['publication_time'].dt.strftime('%Y-%m-%d') == com_date[0]]

                # Count the relevances mean
                sim_freq.append((com_date[0], com_date[1], df_date['similarity'].mean()))

            # Sort based on similarity inplace
            sim_freq.sort(key=lambda tup: tup[-1], reverse=True)
            events_library[ev_key]['highest_freq_date'] = sim_freq[0][0]

    return events_library

In [19]:
def detect_merge(event_lib):
    merge_potential = []
    event_lib = event_lib.sort_values('highest_freq_date')
    i = 0
    j = 1
    for idx1, row1 in event_lib.iterrows():
        for idx2, row2 in event_lib.iterrows():
            if idx1 != idx2:
                distance_date = days_between(row1['highest_freq_date'], row2['highest_freq_date'])
                loc1 = (row1['loc_lat'], row1['loc_lng'])
                loc2 = (row2['loc_lat'], row2['loc_lng'])
                distance_location = distance.distance(loc1, loc2).km
                if distance_date <= 2 and distance_location <= 50:
                    merge_potential.append((idx1,idx2))
                    event_merging_pairs.append({
                        "loc1": {
                            "lat": row1['loc_lat'],
                            "lng": row1['loc_lng'],
                            "date": row1['highest_freq_date']
                        },
                        "loc2": {
                            "lat": row2['loc_lat'],
                            "lng": row2['loc_lng'],
                            "date": row2['highest_freq_date']
                        },
                        "distance": {
                            "location": distance_location,
                            "date": distance_date
                        }
                    })


    return list(set(tuple(sorted(l)) for l in merge_potential))

In [20]:
def get_merge_potential(df_news, events_library):
    events_library = get_events_location_merging(events_library)
    events_library = get_events_date_merging(df_news, events_library)
    lib_temp = pd.DataFrame.from_dict(events_library, orient='index')

    # Cluster per location using KMeans with n_clusters/2
    ev_locs = []
    for ev_key in events_library.keys():
        lat = events_library[ev_key]['loc_lat']
        lng = events_library[ev_key]['loc_lng']
        ev_locs.append([lat, lng])
    
    # DBSCAN
    clustering = DBSCAN(eps=10, min_samples=1).fit(ev_locs)
    lib_temp['merging_locs'] = clustering.labels_

    # Cluster time per location cluster
    to_merge_list = []
    for loc in list(set(clustering.labels_)):
        event_temp = lib_temp[lib_temp['merging_locs'] == loc] # Filter per location cluster
        to_merge_list.append(detect_merge(event_temp))
    
    to_merge_list_flat = [item for sublist in to_merge_list for item in sublist] # Flatten to merge list
    to_merge_list_flat.sort()

    return to_merge_list_flat 

In [21]:
def ev_merging(df_news, events_library, events_locations):
    to_merge_pairs = get_merge_potential(df_news, events_library) # Get pairs of similar event clusters
    events_list = df_news['event'].tolist()

    # Renewing event column in df_news
    if to_merge_pairs != []:
        new_idxs = []
        for idx_pair in range(len(to_merge_pairs)-1, -1, -1):
            old_idx = to_merge_pairs[idx_pair][1]
            new_idx = to_merge_pairs[idx_pair][0]
            events_library[new_idx]['locations'] = events_library[new_idx]['locations'] + events_library[old_idx]['locations']
            events_library[new_idx]['dates'] = events_library[new_idx]['dates'] + events_library[old_idx]['dates']
            events_list = [new_idx if idx==old_idx else idx for idx in events_list]
        df_news['event'] = events_list
    
    # For news in events without locations, assign event with -1
    for idx, news in df_news.iterrows():
        if news['event'] not in events_locations.keys():
            df_news.at[idx, 'event'] = -1

    # Delete clusters without news in event library
    to_delete_clusters = []
    for cluster_idx in events_library.keys():
        if len(df_news[df_news['event'] == cluster_idx]) == 0:
            try:
                to_delete_clusters.append(cluster_idx)
            except KeyError:
                continue
    for cluster_idx in to_delete_clusters:
        try:
            del events_library[cluster_idx]
        except KeyError:
            continue

    return df_news, events_library

## Main

In [22]:
def main_enhanced(path, threshold, attenuation, query='', keywords_analysis=True, semantic_similarity=True, event_merging=True):
    # Starting perf counter
    time_start = time.perf_counter()

    # Read News Dataset
    print("PROCESS:\t Reading dataset...")
    df_news = pd.read_json(path, orient='records')

    # News' body filter feature
    print("PROCESS:\t Filtering news with query...")
    if query != '':
        df_news = df_news[df_news['body'].str.contains(pat=query, case=False)]
        df_news = df_news.reset_index(drop=True)

    print("RESULT:\t\t Detected", df_news.shape[0], " with", query, "topic")

    # Keywords Extraction
    if keywords_analysis == True:
      # Creating TFIDF Dictionary
      print("PROCESS:\t Extracting and Analyzing Keywords...")
      tfidf_dict = create_tfidf_dict(df_news)
      df_keywords = keywords_extraction_enhanced(df_news['body'], tfidf_dict=tfidf_dict)
    else:
      print("PROCESS:\t Extracting keywords...")
      df_keywords = keywords_extraction_original(df_news['body'])

    print("PROCESS:\t Initializing Keyword's weights...")
    df_keywords_weight = []
    for keywords in df_keywords:
        weighted_keywords = [] # {"keyword":<keyword>, "weight":<weight>}
        for keyword in keywords:
            weighted_keywords.append({
                "keyword": keyword,
                "weight": 1
            })
        
        df_keywords_weight.append(weighted_keywords)

    df_news['keywords'] = df_keywords_weight

    # Hot Event Detection (Modified)
    print("PROCESS:\t Detecting events...")
    df_news, events_library = hot_event_detection(df_news, threshold, attenuation, semantic_similarity)
    df_news['keywords'] = df_keywords
    if event_merging == True:
        print("RESULT:\t\t Detected", len(df_news['event'].unique())-1, "events (before event merging)")
    else:
        print("RESULT:\t\t Detected", len(df_news['event'].unique())-1, "events")

     # Getting locations and time
    print("RESULT:\t\t Getting locations and times...")
    df_news = ner(df_news) # Getting location(s) and time(s) per document
    events_locations = get_events_locations(df_news, events_library)
    
    # Remove events that doesn't have locations
    to_remove_events = []
    for ev_key in events_locations.keys():
        detected_events = df_news['event'].tolist()
        if events_locations[ev_key] == []:
            to_remove_events.append(ev_key)
            new_events = [-1 if x==ev_key else x for x in detected_events]
            df_news['event'] = new_events
    for ev_key in to_remove_events:
        try:
            del events_locations[ev_key]
        except KeyError:
            continue

    events_dates = get_events_dates(df_news, events_locations)
    events_library_v2 = {}
    for ev_key in events_locations.keys():
        events_library_v2[ev_key] = {
            "keywords" : events_library[ev_key],
            "dates": events_dates[ev_key],
            "locations": events_locations[ev_key]
        }

    if event_merging == True:
        print("PROCESS:\t Event Merging...")
        df_news, events_library_v3 = ev_merging(df_news, events_library_v2, events_locations)
        print("RESULT:\t\t Number of events after event merging: ", len(df_news['event'].unique())-1, "events (after event merging)")

    # # Stopping perf counter
    time_stop = time.perf_counter()
    print("== Code finished in {} second(s)".format(str(time_stop-time_start)), "==")

    if event_merging == True:
        return df_news, pd.DataFrame.from_dict(events_library_v3, orient='index')
    else:
        return df_news, pd.DataFrame.from_dict(events_library_v2, orient='index')

## Run

In [24]:
event_merging_pairs = []
enhanced_df, enhanced_library = main_enhanced(path=PATH_DATASET_AYLIEN,
                                            threshold=0.2,
                                            attenuation=0.1,
                                            query='hailstorm',
                                            keywords_analysis=True,
                                            semantic_similarity=True,
                                            event_merging=True)

PROCESS:	 Reading dataset...


ValueError: ignored

In [28]:
!pip freeze > requirements.txt

# Evaluation

## Metric

In [None]:
def evaluations(news_bodies, event_labels):
    tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing_text)
    tfidf = tfidf_vectorizer.fit_transform(news_bodies)

    print("Silhouette \t\t: ", silhouette_score(tfidf.toarray(), event_labels, metric='euclidean'), "\t (Best is 1, Worst is -1, Near 0 means overlapping clusters)")
    print("Calinski Harabasz \t: ", calinski_harabasz_score(tfidf.toarray(), event_labels), "\t (Higher is better) ")

In [None]:
# With Keyword Analysis (TFIDF) and Event Merging
evaluations(enhanced_df['body'], enhanced_df['event'])

Silhouette 		:  0.10820777407007456 	 (Best is 1, Worst is -1, Near 0 means overlapping clusters)
Calinski Harabasz 	:  2.2303812193838257 		 (Higher is better) 


## Time-Location

In [27]:
eval_hailstorm = pd.read_csv(PATH_EVAL_HAILSTORM, skipinitialspace=True, parse_dates=['time'])
eval_hailstorm['time'] = pd.to_datetime(eval_hailstorm['time'], format='%Y-%m-%d')
eval_hailstorm.head()

FileNotFoundError: ignored

In [None]:
detected = len(enhanced_library)
ground = len(eval_hailstorm)
intersect = 0

for lib_idx, lib_df in enhanced_library.iterrows():
    for gnd_idx, gnd_df in eval_hailstorm.iterrows():
        loc_gnd = gnd_df['location']

        is_location_true = False
        is_date_true = False
        for loc_df in lib_df['locations']:
            if loc_gnd == loc_df['city']:
                is_location_true = True 
                break
        
        for date_df in lib_df['dates']:
            if days_between(date_df[0], gnd_df['time'].date()) <= 3:
                is_date_true = True

        if is_location_true and is_date_true: # ( days_between(lib_df['highest_freq_date'], gnd_df['time'].date()) <= 3):
            intersect = intersect + 1
            print(lib_idx, loc_df, ":", lib_df['dates'], " - ", gnd_df['time'])

0 {'city': 'Potohar', 'country': 'Pakistan', 'lat': 33.6195827, 'lng': 73.0657654, 'count': 4} : [(datetime.date(2019, 5, 5), 1)]  -  2019-05-05 00:00:00
1 {'city': 'Las Vegas', 'country': 'Honduras', 'lat': 15.01667, 'lng': -87.45, 'count': 2} : [(datetime.date(2019, 5, 8), 1)]  -  2019-05-08 00:00:00
3 {'city': 'Sindh', 'country': 'Pakistan', 'lat': 25.5, 'lng': 69.0, 'count': 2} : [(datetime.date(2019, 5, 17), 1)]  -  2019-05-17 00:00:00
4 {'city': 'Jaisalmer', 'country': 'India', 'lat': 26.91763, 'lng': 70.90387, 'count': 4} : [(datetime.date(2019, 5, 17), 1)]  -  2019-05-17 00:00:00
5 {'city': 'Vermilion', 'country': 'Canada', 'lat': 53.35409, 'lng': -110.85849, 'count': 1} : [(datetime.date(2019, 5, 17), 1)]  -  2019-05-16 00:00:00
6 {'city': 'Delhi', 'country': 'India', 'lat': 28.65195, 'lng': 77.23149, 'count': 5} : [(datetime.date(2019, 5, 17), 1)]  -  2019-05-17 00:00:00
12 {'city': 'Abbottabad', 'country': 'Pakistan', 'lat': 34.1463, 'lng': 73.21168, 'count': 3} : [(datetime

In [None]:
print("ground \t\t: ", ground)
print("detected \t: ", detected)
print("intersect \t: ", intersect)

precision = intersect/detected
recall = intersect/ground
fscore = 2*( (precision*recall)/(precision+recall) )

print("precision \t: ", precision)
print("recall \t\t: ", recall)
print("F-Score \t: ", fscore)

ground 		:  61
detected 	:  100
intersect 	:  52
precision 	:  0.52
recall 		:  0.8524590163934426
F-Score 	:  0.6459627329192545


## Event Merging

In [None]:
sorted(event_merging_pairs, key=lambda x: x['distance']['location'], reverse=True)

[{'distance': {'date': 0, 'location': 49.62682665083389},
  'loc1': {'date': datetime.date(2019, 5, 25),
   'lat': 33.72148,
   'lng': 73.04329},
  'loc2': {'date': datetime.date(2019, 5, 25),
   'lat': 34.1463,
   'lng': 73.21168}},
 {'distance': {'date': 0, 'location': 49.62682665083389},
  'loc1': {'date': datetime.date(2019, 5, 25),
   'lat': 34.1463,
   'lng': 73.21168},
  'loc2': {'date': datetime.date(2019, 5, 25),
   'lat': 33.72148,
   'lng': 73.04329}},
 {'distance': {'date': 2, 'location': 20.403404836793243},
  'loc1': {'date': datetime.date(2019, 11, 15),
   'lat': 23.3880846,
   'lng': 70.173166},
  'loc2': {'date': datetime.date(2019, 11, 17),
   'lat': 23.3040626,
   'lng': 70.3507333}},
 {'distance': {'date': 2, 'location': 20.403404836793243},
  'loc1': {'date': datetime.date(2019, 11, 17),
   'lat': 23.3040626,
   'lng': 70.3507333},
  'loc2': {'date': datetime.date(2019, 11, 15),
   'lat': 23.3880846,
   'lng': 70.173166}},
 {'distance': {'date': 2, 'location': 0.0}