In [30]:
!pip install -U tables scikit-learn gensim

Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: tables in /home/users/sljack/.local/lib/python3.7/site-packages (3.6.1)
Requirement already up-to-date: scikit-learn in /home/users/sljack/.local/lib/python3.7/site-packages (0.23.1)
Requirement already up-to-date: gensim in /home/users/sljack/.local/lib/python3.7/site-packages (3.8.3)


In [1]:
import requests
import bs4
import pandas as pd
import numpy as np
import gensim

from sklearn.feature_extraction.text import CountVectorizer

# Prepare Vocabulary Documents

We're going to pull some climate vocabularies from the web, which can be used to give context to the metadata from the CEDA archive.

## Climate & Forecast Metadata Conventions

The first vocabulary we're going to look at is the [CF conventions](https://en.wikipedia.org/wiki/Climate_and_Forecast_Metadata_Conventions), which are used to define variables found in NetCDF files. The CF conventions have the benefit that they come with a nice human-readable description alongside the variables.

The first thing we need to do is get the CF conventions from the web:

In [2]:
# Download the CF standard-name table (version 73) as XML text.
# The timeout stops the cell hanging on an unresponsive server, and
# raise_for_status() fails fast instead of handing an HTTP error page to the parser.
response = requests.get(
    'https://cfconventions.org/Data/cf-standard-names/73/src/cf-standard-name-table.xml',
    timeout=30,
)
response.raise_for_status()
cf_xml = response.text

We can use the BeautifulSoup library to parse the XML file and give us an object we can query. Here we output just the first entry in the CF conventions

In [3]:
# Build a queryable tree from the downloaded document.
# NOTE(review): 'lxml' selects BeautifulSoup's lxml HTML parser; 'xml' would be
# the XML-specific one — confirm the HTML parser is intended here.
cf_soup = bs4.BeautifulSoup(markup=cf_xml, features='lxml')

# Display the first <entry> element as a sample of the structure.
tag = cf_soup.find(name='entry')
tag

<entry id="acoustic_signal_roundtrip_travel_time_in_sea_water">
<canonical_units>s</canonical_units>
<grib></grib>
<amip></amip>
<description>The quantity with standard name acoustic_signal_roundtrip_travel_time_in_sea_water is the time taken for an acoustic signal to propagate from the emitting instrument to a reflecting surface and back again to the instrument. In the case of an instrument based on the sea floor and measuring the roundtrip time to the sea surface, the data are commonly used as a measure of ocean heat content.</description>
</entry>

Convert the CF entries into a pandas DataFrame with one row per entry, holding the CF variable name ID, its description and its canonical unit.

In [6]:
# Collect one record per <entry> element: its id attribute plus the text of its
# <description> and <canonical_units> children.
# find_all is the current BeautifulSoup name for the deprecated findAll alias.
cf_entries = cf_soup.find_all('entry')
cf_data = [
    dict(
        id=entry.get('id'),
        description=entry.find('description').text,
        unit=entry.find('canonical_units').text,
    )
    for entry in cf_entries
]
cf_df = pd.DataFrame(cf_data)

# Entries whose description is empty are deliberately kept in the saved table.
cf_df.to_hdf('data/vocabs/cf_conventions.h5', key='data')
cf_df.head(20)

Unnamed: 0,description,id,unit
0,The quantity with standard name acoustic_signa...,acoustic_signal_roundtrip_travel_time_in_sea_w...,s
1,The diameter of a spherical particle with dens...,aerodynamic_particle_diameter,m
2,"The ""aerodynamic_resistance"" is the resistance...",aerodynamic_resistance,m-1 s
3,"""Age of sea ice"" means the length of time elap...",age_of_sea_ice,year
4,"""Age of stratospheric air"" means an estimate o...",age_of_stratospheric_air,s
5,"""Age of surface snow"" means the length of time...",age_of_surface_snow,day
6,This flag is an algorithmic combination of the...,aggregate_quality_flag,1
7,,air_density,kg m-3
8,"The ""equivalent potential temperature"" is a th...",air_equivalent_potential_temperature,K
9,The equivalent temperature is the temperature ...,air_equivalent_temperature,K


Convert the vocabulary from the CF conventions to a document matrix, where the document name is the id of the CF convention and the document vector is defined by terms from the description of the CF entry.

In [42]:
# Tokenise each description: lowercase, drop English stop words and keep
# alphanumeric runs of three or more characters as unigrams.
# NOTE(review): the vectorizer is never fitted; only its analyzer is used, so
# min_df / max_df do not appear to filter any terms here — confirm the intent.
vectorizer = CountVectorizer(token_pattern=r'[a-zA-Z0-9]{3,}', ngram_range=(1, 1),
                             min_df=0., max_df=.98, stop_words='english', lowercase=True)
tokenizer = vectorizer.build_analyzer()
tokens = cf_df.description.map(tokenizer)

# Map every token to an integer id and count tokens per description.
id2word = gensim.corpora.Dictionary(tokens)
bow = [id2word.doc2bow(text) for text in tokens]

# corpus2csc returns a terms x documents sparse matrix. num_terms is passed
# explicitly so the term axis always matches the dictionary size, then the
# matrix is transposed to documents x terms and densified in one step.
cf_term_counts = gensim.matutils.corpus2csc(bow, num_terms=len(id2word)).T.toarray()

# Column labels in token-id order, matching the matrix's term axis.
columns = [id2word[token_id] for token_id in range(len(id2word))]
cf_embedding = pd.DataFrame(cf_term_counts, index=cf_df.id, columns=columns)
cf_embedding.to_hdf('data/embeddings/cf_embedding.h5', key='data')
cf_embedding.head()

Unnamed: 0_level_0,acoustic,based,case,commonly,content,data,emitting,floor,heat,instrument,...,flowing,bulb,brief,gust,gustiness,parametrised,sudden,timeseries,keeping,presumably
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
acoustic_signal_roundtrip_travel_time_in_sea_water,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aerodynamic_particle_diameter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aerodynamic_resistance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
age_of_sea_ice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
age_of_stratospheric_air,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
