In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy.stats import norm

In [2]:
# Load the filtered export and clean it in a single chain:
#   1. drop columns, then rows, that contain no values at all
#   2. keep only the first row for each speciesName (duplicates appeared after the
#      data was written to a csv during data processing)
critically_endangered = (
    pd.read_csv("critically_endangered_filtered.csv")
    .dropna(axis="columns", how="all")
    .dropna(axis="rows", how="all")
    .drop_duplicates(subset='speciesName', keep='first')
)

### Parse HTML Tags

In [3]:
# Columns whose cells are scanned for HTML tags.
cols=['Region',
 'scientificName',
 'kingdomName',
 'phylumName',
 'orderName',
 'className',
 'familyName',
 'genusName',
 'speciesName',
 'redlistCriteria',
 'rationale',
 'habitat',
 'threats',
 'population',
 'populationTrend',
 'range',
 'useTrade',
 'systems',
 'conservationActions',
 'realm',
 'yearLastSeen',
 'scopes']

# Strip every complete "<...>" tag from the string cells of each listed column.
#
# The replacement is written back through a boolean mask that is aligned on the
# row labels, so each cleaned value lands on the row it came from even though
# the index has gaps after the earlier dropna/drop_duplicates calls.
# Non-string cells (NaN, numbers) are skipped, and a stray "<" that is never
# closed by ">" is left untouched because the pattern only matches whole tags.
for col in cols:
    column = critically_endangered[col]
    is_text = column.map(lambda cell: isinstance(cell, str))
    if is_text.any():
        critically_endangered.loc[is_text, col] = column[is_text].str.replace(
            r"<[^>]*>", "", regex=True
        )

We removed the HTML tags from every text cell in the columns listed above.

### Red List Criteria Indicator Variables

In [4]:
# define helper function to create indicator variables for Red List criteria
# returns a dictionary of criteria as keys and series of indicators as values
def redlistcrit_helper(df=None):
    """Build 0/1 indicator Series for the Red List criteria categories A-E.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with a 'redlistCriteria' string column. Defaults to the
        notebook-level ``critically_endangered`` frame when omitted.

    Returns
    -------
    dict
        Maps each of 'A', 'B', 'C', 'D', 'E' to an integer Series (same index
        as ``df``) that is 1 where the criteria string contains that letter
        and 0 otherwise. Rows with a missing criteria value get 0 everywhere.
    """
    if df is None:
        df = critically_endangered

    criteria_text = df['redlistCriteria']

    indicator_redlistCrit = {}
    for letter in ('A', 'B', 'C', 'D', 'E'):
        # Literal, case-sensitive match: only the upper-case letter counts.
        # na=False turns missing values into False so astype(int) cannot fail on NaN.
        has_letter = criteria_text.str.contains(letter, regex=False, na=False)
        indicator_redlistCrit[letter] = has_letter.astype(int)

    return indicator_redlistCrit

In [5]:
# Combine the per-criterion indicator Series into one frame (one column per
# criterion) and preview its first five rows
redlistCriteria_dummies = pd.DataFrame(data=redlistcrit_helper())
redlistCriteria_dummies.head(n=5)

Unnamed: 0,A,B,C,D,E
0,0,1,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,0,1,0,0,0


In [6]:
# Append each criterion's indicator Series to the main frame as a column of the same name
for criteria, criteria_indicator in redlistCriteria_dummies[['A','B','C','D','E']].items():
    critically_endangered[criteria] = criteria_indicator

We created indicator variables for the Red List criteria categories, along with a helper function that builds them. The criteria column initially had many unique values because a species can be listed under multiple criteria. We assigned a 1 whenever a species was listed under a given criteria category and a 0 otherwise. We added these columns to the end of our dataframe.

### Region Indicator Variables

In [7]:
# display indicator variables for regions
# dtype=int keeps the indicators as 0/1 integers, matching the criteria and
# systems indicator columns; without it newer pandas versions emit True/False.
region_dummies = pd.get_dummies(critically_endangered['Region'], dtype=int)
region_dummies.head()

Unnamed: 0,North Africa,North America,North Asia,South America
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [8]:
# Append each region's indicator Series to the main frame as a column of the same name
region_names = ['North Africa', 'North America', 'North Asia', 'South America']
for region, region_indicator in region_dummies[region_names].items():
    critically_endangered[region] = region_indicator

We created indicator variables for the regions and added these columns to the end of our dataframe.

### Systems Indicator Variables

In [9]:
# define helper function to create indicator variables for systems
# returns a dictionary of systems as keys and series of indicators as values
def systems_helper(df=None):
    """Build 0/1 indicator Series for the three systems categories.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with a 'systems' string column. Defaults to the notebook-level
        ``critically_endangered`` frame when omitted.

    Returns
    -------
    dict
        Maps 'Terrestrial', 'Marine' and 'Freshwater' to an integer Series
        (same index as ``df``) that is 1 where the systems string contains the
        corresponding label and 0 otherwise. Rows with a missing systems value
        get 0 everywhere.
    """
    if df is None:
        df = critically_endangered

    systems_text = df['systems']

    # Output key -> literal text searched for in the systems column.
    # The labels are matched with regex=False, so the parentheses in the
    # freshwater label need no escaping.
    labels = {
        'Terrestrial': "Terrestrial",
        'Marine': 'Marine',
        'Freshwater': 'Freshwater (=Inland waters)',
    }

    indicator_systems = {}
    for system_name, label in labels.items():
        # na=False turns missing values into False so astype(int) cannot fail on NaN
        in_system = systems_text.str.contains(label, regex=False, na=False)
        indicator_systems[system_name] = in_system.astype(int)

    return indicator_systems

In [10]:
# Combine the per-system indicator Series into one frame (one column per
# system) and preview its first five rows
systems_dummies = pd.DataFrame(data=systems_helper())
systems_dummies.head(n=5)

Unnamed: 0,Terrestrial,Marine,Freshwater
0,1,0,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,0,1


In [11]:
# Append each system's indicator Series to the main frame as a column of the same name
for system, system_indicator in systems_dummies[['Terrestrial', 'Marine', 'Freshwater']].items():
    critically_endangered[system] = system_indicator

We created indicator variables using a helper function to filter through the systems column. Initially, there were 7 different systems in the column, so we separated these 7 into 3 specific categories for systems and added these columns to the end of our dataframe.

### New CSV

In [12]:
# Write the processed frame to disk for the Final Report.
# NOTE(review): this is an absolute, machine-specific path, while the input CSV is
# read from a relative path; confirm where the Final Report expects the file
# before changing it.
# NOTE(review): to_csv writes the row index as an unnamed first column by default.
processed_csv_path = '/Users/sawyer/Desktop/critically_endangered_processed.csv'
critically_endangered.to_csv(processed_csv_path)

We finally wrote the processed dataframe into a new CSV file that will be read in the Final Report.