# Random Species Generator

In [1]:
import pandas as pd
import geopandas as gpd
import json
from ast import literal_eval
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import random
#pip install geopandas
#pip install spacy
#python -m spacy download en_core_web_sm

In [None]:
# Load the news-article database (JSON Lines: one record per line).
df = pd.read_json('animal_news_database.json', lines=True)

# Load the semicolon-delimited tables that define the species subsets.
undoc = pd.read_csv("undoc_species.csv", sep=";")
charismatic = pd.read_csv("charismatic_20.csv", sep=";")
edge = pd.read_csv("non_charismatic_20.csv", sep=";")

# Bring both subset tables onto the `scientific_name` key used by `df` and
# keep only the columns needed downstream.
# NOTE(review): 'Scientifc Name' is presumably the (misspelled) header in the
# CSV files themselves -- verify before "correcting" it here.
name_column_fix = {'Scientifc Name': 'scientific_name'}
charismatic = charismatic.rename(columns=name_column_fix)[
    ['scientific_name', 'Ecosystem']
]
edge = edge.rename(columns=name_column_fix)[
    ['scientific_name', 'Ecosystem', 'Conservation Attention']
]

# Entries containing '@' are Twitter handles, not species names: keep only the
# rows where the check is explicitly False, then renumber the index from 0.
has_handle = df['scientific_name'].str.contains('@')
df = df[has_handle.eq(False)].reset_index(drop=True)

# To see how many distinct species remain: df['scientific_name'].nunique()

In [None]:
# Store the number of articles recorded for each row in a new column.
# Computed column-wise instead of via iterrows()/.at so the column is created
# even when `df` is empty, and large frames are not processed row by row.
# NOTE(review): assumes every `articles` entry is dict-like with one item per
# article (the previous code called .values() on it) -- confirm.
df['article_amt'] = df['articles'].map(len)

# Read in the CITES data to join with the top 10 (or 20) most reported on species
cites1 = pd.read_csv("cites1.csv", delimiter=';')

# Keep only the most relevant columns and rename the species column to the
# `scientific_name` key shared with the other tables.
cites_columns = [
    'Kingdom', 'Class', 'Order', 'Family', 'Genus', 'Species',
    'Scientific Name', 'Listing', 'NativeDistributionFullNames',
    'Extinct_Distribution',
]
cites1 = cites1[cites_columns].rename(
    columns={'Scientific Name': 'scientific_name'}
)

# Attach the CITES columns to the article data. The inner join keeps only
# species that appear in both tables.
df = df.merge(cites1, on='scientific_name', how='inner')

# Restrict the merged data to each subset (again via inner joins).
rizz = df.merge(charismatic, on='scientific_name', how='inner')
dg = df.merge(edge, on='scientific_name', how='inner')

# Total article count per species in the charismatic subset.
grouped = rizz.groupby('scientific_name')
article_totals_rz = grouped['article_amt'].sum().round()

# Total article count per species in the non-charismatic subset.
grouped = dg.groupby('scientific_name')
article_totals_dg = grouped['article_amt'].sum().round()

In [None]:
# Draw 8 mammal rows and 2 reptile rows from the charismatic subset.
# Sampling is without replacement, so no row is returned twice within a draw.
# This does not take the Ecosystem requirements into account.
# query() needs `==` and a quoted value; the class labels follow the Latin
# names used for the quota lists in the next cell ("Mammalia", "Reptilia").
mammal_sample = rizz.query("Class == 'Mammalia'").sample(n=8)
reptile_sample = rizz.query("Class == 'Reptilia'").sample(n=2)

# Show both draws together (a notebook cell only displays its last expression).
pd.concat([mammal_sample, reptile_sample])

In [None]:
# Quotas for the 10-species draw: 8 mammals + 2 reptiles, and
# 6 terrestrial + 4 aquatic. The two lists are shuffled independently so that
# class and ecosystem are paired at random.
species_class = ["Mammalia"] * 8 + ["Reptilia"] * 2
ecosystem = ["Terrestrial"] * 6 + ["Aquatic"] * 4
random.shuffle(ecosystem)  # randomly shuffle ecosystem
random.shuffle(species_class)

# Pick one row per (class, ecosystem) pair from the charismatic subset, never
# reusing a species that has already been picked.
# NOTE(review): an earlier draft of this loop read from `df1`, which is not
# defined in the visible notebook, and relied on DataFrame.append (removed in
# pandas 2.0); both drafts are replaced by this single loop.
picked_rows = []
picked_names = set()
for wanted_class, wanted_ecosystem in zip(species_class, ecosystem):
    candidates = rizz[
        (rizz["Class"] == wanted_class)
        & (rizz["Ecosystem"] == wanted_ecosystem)
        & ~rizz["scientific_name"].isin(picked_names)
    ]
    if candidates.empty:
        # The random pairing can ask for a combination that has no unused
        # species left; fail with an explicit message instead of a sampling
        # error so the cell can simply be re-run.
        raise ValueError(
            f"No unused species left for Class={wanted_class!r}, "
            f"Ecosystem={wanted_ecosystem!r}; re-run the cell to reshuffle."
        )
    choice = candidates.sample(n=1)
    picked_rows.append(choice)
    picked_names.update(choice["scientific_name"])

# One row per quota slot; rows keep their original index labels from `rizz`.
rizz_selected = pd.concat(picked_rows)
    
