In [1]:
#!pip install wing
#!pip install umap-learn
#!pip install hdbscan
# !pip install plotly_express

In [2]:
import pandas as pd, numpy as np
import wing
from wing.pd_utils import da
from wing.preprocess import numericalize, train_cats, add_datepart, proc_df
from sklearn.preprocessing import scale
import umap
import plotly_express
import hdbscan
%matplotlib inline
path_data = "./data/"

## Preprocessing 

In [12]:
df = pd.read_csv(path_data + 'sipri-report-explosions.csv')
df.date = list(map(lambda x: ('19' + str(x)), df.date))
df.date = pd.to_datetime(df.date, format='%Y%m%d')
add_datepart(df, 'date', drop=True)

In [13]:
# Replace missing values with 0 for yield columns 
df.yield_1[df.yield_1.isna()] = df.yield_1.mean()
df.yield_u[df.yield_u.isna()] = df.yield_u.mean()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [14]:
df['hour'] = list(map(lambda x: str(x)[0:2],df.origin_t))
df['hour'] = list(map(lambda x: '0' if x.find('.') != -1  else x, df['hour']))
df['hour'] = list(map(lambda x: str(x[0]) if int(x) >= 24 else x, df['hour']))

In [15]:
atmospheric_c = ['ATMOSPH','AIRDROP','TOWER','BALLOON','SURFACE','BARGE','ROCKET',
                'CRATER', 'UW', 'SPACE', 'WATER SU','SHIP','WATERSUR']
underground_c = ['SHAFT', 'TUNNEL', 'SHAFT/GR', 'SHAFT/LG','UG','GALLERY','MINE']
def preprocess_type(x): 
    if   x in atmospheric_c: return 'ATMOSPHERIC'
    elif x in underground_c: return 'UNDERGROUND'
    else:                    return 'uncategorized'
df['type_cat'] = list(map(preprocess_type, df.type))

In [16]:
# indicator variables 
df['mb_I'] = list(map(lambda x: 1 if x > 0 else 0, df['mb']))
df['Ms_I'] = list(map(lambda x: 1 if x > 0 else 0, df['Ms']))

In [17]:
key = 'id_no'
df.set_index(key, inplace=True)
df1=df.copy()

In [18]:
train_cats(df)  # str --> category 
cols_to_numericalize = ['country', 'region', 'source', 'purpose', 'type_cat']
for c in cols_to_numericalize: 
    numericalize(df, df[c], c + "_code", None)
dropcols = cols_to_numericalize + \
    ['name', 'date_long', 'origin_t', '_Year', '_Is_month_end', '_Is_month_start', 
    '_Is_year_end', '_Is_year_start','_Is_quarter_end', '_Is_quarter_start', '_Elapsed']
df.drop(columns = dropcols, inplace=True)

In [19]:
# scientific viewpoint 
keepcols = ['mb', 'Ms', 'depth', 'yield_1', 'yield_u', 'year', 'country_code', 'type_cat_code']
df_env = df[keepcols]
df_env_s =  (df_env-df_env.mean())/df_env.std()

## Clustering 

In [24]:
fit = umap.UMAP(n_neighbors=100,min_dist=0)
%time u = fit.fit_transform(df_env_s)

CPU times: user 9.44 s, sys: 152 ms, total: 9.59 s
Wall time: 9.26 s


In [25]:
df1['x'],df1['y'] = u[:,0],u[:,1]

In [26]:
df1.columns

Index(['origin_t', 'country', 'region', 'source', 'latitude', 'longitude',
       'mb', 'Ms', 'depth', 'yield_1', 'yield_u', 'purpose', 'name', 'type',
       'date_long', 'year', '_Year', '_Month', '_Week', '_Day', '_Dayofweek',
       '_Dayofyear', '_Quarter', '_Is_month_end', '_Is_month_start',
       '_Is_quarter_end', '_Is_quarter_start', '_Is_year_end',
       '_Is_year_start', '_Elapsed', 'hour', 'type_cat', 'mb_I', 'Ms_I', 'x',
       'y'],
      dtype='object')

In [33]:
plotly_express.scatter(df1,x='x', y='y', color='region', 
                       hover_data=['year','country','type','type_cat','region','year'])

In [32]:
df1

Unnamed: 0_level_0,origin_t,country,region,source,latitude,longitude,mb,Ms,depth,yield_1,...,_Is_quarter_start,_Is_year_end,_Is_year_start,_Elapsed,hour,type_cat,mb_I,Ms_I,x,y
id_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45001,123000.0,USA,ALAMOGORDO,DOE,32.540,-105.570,0.0,0.0,-0.100,21.0,...,False,False,False,-771984000,12,ATMOSPHERIC,0,0,-6.857891,-16.797060
45002,231500.0,USA,HIROSHIMA,DOE,34.230,132.270,0.0,0.0,-0.600,15.0,...,False,False,False,-770256000,23,ATMOSPHERIC,0,0,-6.861694,-16.792982
45003,15800.0,USA,NAGASAKI,DOE,32.450,129.520,0.0,0.0,-0.600,21.0,...,False,False,False,-769910400,15,ATMOSPHERIC,0,0,-6.826782,-16.810555
46001,220100.0,USA,BIKINI,DOE,11.350,165.200,0.0,0.0,-0.200,21.0,...,False,False,False,-741830400,22,ATMOSPHERIC,0,0,-6.873917,-16.814632
46002,213500.0,USA,BIKINI,DOE,11.350,165.200,0.0,0.0,0.030,21.0,...,False,False,False,-739756800,21,ATMOSPHERIC,0,0,-6.816502,-16.842319
48001,181700.0,USA,ENEWETAK,DOE,11.300,162.150,0.0,0.0,-0.080,37.0,...,False,False,False,-685324800,18,ATMOSPHERIC,0,0,-6.895119,-16.823687
48002,180900.0,USA,ENEWETAK,DOE,11.300,162.150,0.0,0.0,-0.080,49.0,...,False,False,False,-683942400,18,ATMOSPHERIC,0,0,-6.892535,-16.822613
48003,180400.0,USA,ENEWETAK,DOE,11.300,162.150,0.0,0.0,-0.080,18.0,...,False,False,False,-682732800,18,ATMOSPHERIC,0,0,-6.792674,-16.851717
49001,12000.0,USSR,SEMI KAZAKH,DOE,48.000,76.000,0.0,0.0,0.000,22.0,...,False,False,False,-641952000,12,ATMOSPHERIC,0,0,-4.099156,-11.424265
51001,134500.0,USA,NTS,DOE,37.000,-116.000,0.0,0.0,-0.350,1.0,...,False,False,False,-597369600,13,ATMOSPHERIC,0,0,-6.890990,-16.823130
