In [1]:
import numpy as np
import pandas as pd
import altair as alt
from vega_datasets import data

# Data Cleaning

In [2]:
# Read the dinosaur table, discard rows that have no "lived_in" value, and
# remove the "named_by" column.
dinos = (
    pd.read_csv("dinosaurs.csv")
    .dropna(subset=["lived_in"])
    .drop(columns=["named_by"])
)
dinos.head()

Unnamed: 0,name,diet,period,lived_in,type,length,taxonomy,species,link
0,aardonyx,herbivorous,Early Jurassic 199-189 million years ago,South Africa,sauropod,8.0m,Dinosauria Saurischia Sauropodomorpha Prosauro...,celestae,https://www.nhm.ac.uk/discover/dino-directory/...
1,abelisaurus,carnivorous,Late Cretaceous 74-70 million years ago,Argentina,large theropod,9.0m,Dinosauria Saurischia Theropoda Neotheropoda C...,comahuensis,https://www.nhm.ac.uk/discover/dino-directory/...
2,achelousaurus,herbivorous,Late Cretaceous 83-70 million years ago,USA,ceratopsian,6.0m,Dinosauria Ornithischia Genasauria Cerapoda Ma...,horneri,https://www.nhm.ac.uk/discover/dino-directory/...
3,achillobator,carnivorous,Late Cretaceous 99-84 million years ago,Mongolia,large theropod,5.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,giganteus,https://www.nhm.ac.uk/discover/dino-directory/...
4,acrocanthosaurus,carnivorous,Early Cretaceous 115-105 million years ago,USA,large theropod,12.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,atokensis,https://www.nhm.ac.uk/discover/dino-directory/...


In [3]:
# Load the lookup table that pairs each country with its continent
# (columns "Continent" and "Country", as the preview below shows).
continents = pd.read_csv("countries_by_continents.csv")
continents.head()

Unnamed: 0,Continent,Country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina


In [4]:
# Country reference table: name, region, sub-region and numeric country code.
# Any cell whose whole value is "Northern Africa" is rewritten to "North Africa".
# A single replace is sufficient: once applied, no "Northern Africa" cell is left.
codes = pd.read_csv("country_codes.csv")[["name", "region", "sub-region", "country-code"]]
codes = codes.replace("Northern Africa", "North Africa")
codes.head()

Unnamed: 0,name,region,sub-region,country-code
0,Afghanistan,Asia,Southern Asia,4
1,Åland Islands,Europe,Northern Europe,248
2,Albania,Europe,Southern Europe,8
3,Algeria,Africa,North Africa,12
4,American Samoa,Oceania,Polynesia,16


In [5]:
# Same source file as `codes`, but with "name" renamed to "country" so later
# merges can join on right_on="country". Unlike `codes`, the sub-region labels
# are left exactly as they appear in the file.
countries = (
    pd.read_csv("country_codes.csv")
    .loc[:, ["name", "region", "sub-region", "country-code"]]
    .rename(columns={"name": "country"})
)
countries.head()

Unnamed: 0,country,region,sub-region,country-code
0,Afghanistan,Asia,Southern Asia,4
1,Åland Islands,Europe,Northern Europe,248
2,Albania,Europe,Southern Europe,8
3,Algeria,Africa,Northern Africa,12
4,American Samoa,Oceania,Polynesia,16


In [6]:
# Look up each dinosaur's continent from the country it lived in.
# "USA" is rewritten to "United States" first; the preview below shows those
# rows receiving a continent from the lookup table.
dinos["lived_in"] = dinos["lived_in"].replace({"USA": "United States"})
dinos = pd.merge(
    dinos,
    continents,
    how="left",
    left_on="lived_in",
    right_on="Country",
).drop(columns=["Country"])  # "Country" duplicates "lived_in" after the join
dinos.head()

Unnamed: 0,name,diet,period,lived_in,type,length,taxonomy,species,link,Continent
0,aardonyx,herbivorous,Early Jurassic 199-189 million years ago,South Africa,sauropod,8.0m,Dinosauria Saurischia Sauropodomorpha Prosauro...,celestae,https://www.nhm.ac.uk/discover/dino-directory/...,Africa
1,abelisaurus,carnivorous,Late Cretaceous 74-70 million years ago,Argentina,large theropod,9.0m,Dinosauria Saurischia Theropoda Neotheropoda C...,comahuensis,https://www.nhm.ac.uk/discover/dino-directory/...,South America
2,achelousaurus,herbivorous,Late Cretaceous 83-70 million years ago,United States,ceratopsian,6.0m,Dinosauria Ornithischia Genasauria Cerapoda Ma...,horneri,https://www.nhm.ac.uk/discover/dino-directory/...,North America
3,achillobator,carnivorous,Late Cretaceous 99-84 million years ago,Mongolia,large theropod,5.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,giganteus,https://www.nhm.ac.uk/discover/dino-directory/...,Asia
4,acrocanthosaurus,carnivorous,Early Cretaceous 115-105 million years ago,United States,large theropod,12.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,atokensis,https://www.nhm.ac.uk/discover/dino-directory/...,North America


In [7]:
# Fill in the continent by hand for the locations that still have a NaN
# continent after the merge.
manual_continents = {
    "Antarctica": "Antarctica",
    "North Africa": "Africa",
    "Wales": "Europe",
}
for location, continent_name in manual_continents.items():
    dinos.loc[dinos["lived_in"] == location, "Continent"] = continent_name

In [8]:
# Gets Periods
def getPeriod(period):
    """Return the first two whitespace-separated words of ``period``.

    Example: "Early Jurassic 199-189 million years ago" -> "Early Jurassic".
    Inputs with fewer than two words are returned with their whitespace
    normalised (a single word comes back unchanged, "" stays "").
    """
    leading_words = period.split()[:2]
    return " ".join(leading_words)

# Trim every "period" entry down to its first two words (e.g. "Early Jurassic").
dinos["period"] = dinos["period"].map(getPeriod)

In [9]:
def getOnlyPeriod(period):
    """Return the second whitespace-separated word of ``period``.

    Example: "Early Jurassic" -> "Jurassic".
    Raises IndexError when ``period`` contains fewer than two words.
    """
    words = period.split()
    return words[1]

def getPeriodTime(period):
    """Return the first whitespace-separated word of ``period``.

    Example: "Early Jurassic" -> "Early".
    Raises IndexError when ``period`` is empty or only whitespace.
    """
    words = period.split()
    return words[0]

# Order matters: "period_time" must be derived while "period" still holds both
# words, because the second statement overwrites "period" with its second word.
# NOTE: once "period" holds a single word, running these statements again makes
# getOnlyPeriod raise IndexError.
dinos["period_time"] = dinos["period"].map(getPeriodTime)
dinos["period"] = dinos["period"].map(getOnlyPeriod)

In [10]:
# Reorder and rename columns
# Lower-case the "Continent" column name, then keep only the columns listed
# here, in this order.
dinos = dinos.rename(columns={"Continent": "continent"})[
    [
        "name",
        "type",
        "species",
        "period",
        "period_time",
        "continent",
        "lived_in",
        "diet",
        "length",
        "taxonomy",
        "link",
    ]
]
dinos.head(3)

Unnamed: 0,name,type,species,period,period_time,continent,lived_in,diet,length,taxonomy,link
0,aardonyx,sauropod,celestae,Jurassic,Early,Africa,South Africa,herbivorous,8.0m,Dinosauria Saurischia Sauropodomorpha Prosauro...,https://www.nhm.ac.uk/discover/dino-directory/...
1,abelisaurus,large theropod,comahuensis,Cretaceous,Late,South America,Argentina,carnivorous,9.0m,Dinosauria Saurischia Theropoda Neotheropoda C...,https://www.nhm.ac.uk/discover/dino-directory/...
2,achelousaurus,ceratopsian,horneri,Cretaceous,Late,North America,United States,herbivorous,6.0m,Dinosauria Ornithischia Genasauria Cerapoda Ma...,https://www.nhm.ac.uk/discover/dino-directory/...


# EDA

In [11]:
# Included Periods
# Distinct period labels present in the data. Each value is passed through
# getPeriod, which keeps at most its first two words.
periods = {getPeriod(label) for label in dinos["period"]}
periods

{'Cretaceous', 'Jurassic', 'Triassic'}

In [12]:
# Included Continents
# Distinct values of the "continent" column, collected into a set.
Continent = set(dinos["continent"])
Continent

{'Africa',
 'Antarctica',
 'Asia',
 'Europe',
 'North America',
 'Oceania',
 'South America'}

# DinoDex

In [13]:
# Gets Counts
# Match each dinosaur's "lived_in" value to a row of the countries table.
# merge() returns a new frame, so `dinos` itself is left untouched.
dino_data = dinos.merge(countries, left_on="lived_in", right_on="country", how="left")

# "Wales" is set by hand so those rows are counted under the name "Wales".
# NOTE(review): the count is keyed by the name "Wales", so it only survives a
# later join on country names if that table has a "Wales" row - confirm.
is_wales = dino_data["lived_in"] == "Wales"
dino_data.loc[is_wales, "country-code"] = 826
dino_data.loc[is_wales, "country"] = "Wales"

# One entry (value 1) per country whose region is "Africa". Adding these to the
# dinosaur counts raises every African country's count by 1.
# NOTE(review): presumably intentional (e.g. to shade the whole continent) - confirm.
africa = countries[countries["region"] == "Africa"]["country"].value_counts()
dino_counts = dino_data.groupby("country")["name"].count()

# Sum both series per country. The index and value names are set explicitly
# rather than relying on reset_index()'s default "index"/0 labels, which depend
# on how the pandas version names the value_counts() index.
dino_counts = (
    pd.concat([dino_counts, africa], axis=0)
    .groupby(level=0)
    .sum()
    .rename_axis("name")
    .reset_index(name="count")
)
dino_counts.head()

Unnamed: 0,name,count
0,Algeria,1
1,Angola,1
2,Antarctica,1
3,Argentina,26
4,Australia,6


In [14]:
# Gets Codes
# Right join: keep every row of `codes`, attaching a count where one exists.
dino_data = pd.merge(dino_counts, codes, how="right", left_on="name", right_on="name")
# Countries without a count get 0 instead of NaN.
dino_data = dino_data.fillna({"count": 0})
dino_data.head()

Unnamed: 0,name,count,region,sub-region,country-code
0,Afghanistan,0.0,Asia,Southern Asia,4
1,Åland Islands,0.0,Europe,Northern Europe,248
2,Albania,0.0,Europe,Southern Europe,8
3,Algeria,1.0,Africa,North Africa,12
4,American Samoa,0.0,Oceania,Polynesia,16


In [15]:
# Set the embed options on Altair's renderer registry; per the Altair docs,
# actions=False hides the action menu normally attached to rendered charts.
alt.renderers.set_embed_options(actions=False)

RendererRegistry.enable('default')

In [16]:
def getCode(data):
    '''
    Adds country codes to data

    Builds the per-country dinosaur count table used by the map.

    Parameters
    ----------
    data : pandas.DataFrame
        Dinosaur records. Needs a "name" column (one entry per dinosaur) and
        a "lived_in" column holding a country name.

    Returns
    -------
    pandas.DataFrame
        One row per entry of "country_codes.csv", in that file's order, with
        columns "name", "count", "region", "sub-region" and "country-code".
        "count" is 0 for countries that received no count.

    Notes
    -----
    * Reads "country_codes.csv" relative to the current working directory.
    * Every country whose region is "Africa" has 1 added to its count, so an
      African country without any matching dinosaur ends up with count 1.
      NOTE(review): presumably intentional (e.g. to shade the whole
      continent) - confirm.
    * "Wales" rows are counted under the name "Wales"; they only reach the
      result if the codes file contains a row with that exact name.
    '''
    # Read the file once; both lookup tables below are derived from it.
    raw_codes = pd.read_csv("country_codes.csv")[["name", "region", "sub-region", "country-code"]]
    countries = raw_codes.rename(columns={"name": "country"})

    # Cleans dino data
    dino_data = data.merge(countries, left_on="lived_in", right_on="country", how="left")
    is_wales = dino_data["lived_in"] == "Wales"
    dino_data.loc[is_wales, "country-code"] = 826
    dino_data.loc[is_wales, "country"] = "Wales"

    africa = countries.loc[countries["region"] == "Africa", "country"].value_counts()
    dino_counts = dino_data.groupby("country")["name"].count()

    # Sum both series per country. Index and value names are set explicitly
    # instead of relying on reset_index()'s default "index"/0 labels, which
    # depend on how the pandas version names the value_counts() index.
    dino_counts = (
        pd.concat([dino_counts, africa], axis=0)
        .groupby(level=0)
        .sum()
        .rename_axis("name")
        .reset_index(name="count")
    )

    # Adds codes
    codes = raw_codes.replace("Northern Africa", "North Africa")

    # Right join keeps every country of the codes table; missing counts become 0.
    dino_data = dino_counts.merge(codes, on="name", how="right")
    dino_data["count"] = dino_data["count"].fillna(0)
    return dino_data

def getChart(dataset):
    '''
    Visualizes the location of dinosaurs

    Builds a three-layer world map: a light-blue sphere for the ocean, a
    uniformly filled country layer, and a country layer coloured by the
    "count" column that getCode() returns for ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Dinosaur records, passed unchanged to getCode().

    Returns
    -------
    The layered Altair chart (700x400, transparent background,
    "naturalEarth1" projection).
    '''

    dino_data = getCode(dataset)
    # `data` here is the module-level vega_datasets import.
    source = alt.topo_feature(data.world_110m.url, "countries")

    # Adds ocean background
    ocean = alt.sphere()
    ocean = alt.Chart(ocean).mark_geoshape(fill='lightblue')

    # Adds country background
    background = alt.Chart(source).mark_geoshape(
        fill="#F5CF65",
        stroke='gray'
    ).properties(width=700, height=400)

    # Adds country colors based on dinosaur count
    foreground = (
        alt.Chart(source)
        .mark_geoshape(stroke="black", strokeWidth=0.3)
        .encode(
            color=alt.Color(
                "count:Q", 
                scale=alt.Scale(scheme="goldorange"), 
                legend=alt.Legend(
                    orient='none',
                    title="Count of Dinosaurs",
                    legendX=200, 
                    legendY=400,
                    direction='horizontal',
                    titleAnchor='middle',
                    gradientLength=300)
                ),
            tooltip=[
                alt.Tooltip("name:N", title="Country"),
                alt.Tooltip("count:Q", title="Count of Dinosaurs"),
            ],
        )
        # Join each map feature's "id" to the "country-code" column and pull
        # in "count" and "name" for the colour scale and tooltip.
        .transform_lookup(
            lookup="id",
            from_=alt.LookupData(dino_data, "country-code", ["count", "name"]),
        )
    )

    # Puts map layers together
    final_map = (
        (ocean + background + foreground)
        .configure_view(strokeWidth=0)
        .properties(width=700, height=400, background='transparent')
        .project("naturalEarth1")
    )
    return final_map

# Build the dinosaur-count world map for the cleaned dataset; as the cell's last
# expression, the returned chart is what the notebook displays.
getChart(dinos)