In [2]:
import pandas as pd
import geopandas as gpd
import json
from ast import literal_eval
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# read in the dataset 
df = pd.read_json('animal_news_database.json', lines = True)
df

Unnamed: 0,_id,common_name,scientific_name,distribution,iucn_status,articles,date_saved
0,{'$oid': '5eb530eeccc46a6a739577a2'},Campbell's alligator lizard,Abronia campbelli,Guatemala,Critically Endangered,{'0': {'text': 'A 3-foot-long (1 meter) green ...,2020-06-18
1,{'$oid': '5eb53116ccc46a6a739577a6'},Southern plains gray langur,Semnopithecus dussumieri,India,Least Concern,"{'3': {'text': 'On Thursday, a Nilgai was foun...",2020-06-18
2,{'$oid': '5eb53156ccc46a6a739577ab'},Natal giant cycad,Encephalartos natalensis,South Africa,Near Threatened,{'0': {'text': 'A GUIDED walk entitled Interes...,2020-06-18
3,{'$oid': '5eb53162ccc46a6a739577ad'},Bengal hanuman langur,Semnopithecus entellus,"Bangladesh, India, Nepal, Pakistan",Least Concern,{'1': {'text': 'Advertisement From the tropic...,2020-06-18
4,{'$oid': '5eb53171ccc46a6a739577af'},Mongolian Wild Ass,Equus hemionus hemionus,"China, Kazakhstan, Mongolia, Russian Federation",,{'0': {'text': 'Exploding demand for cashmere ...,2020-06-18
...,...,...,...,...,...,...,...
2272,{'$oid': '5ed349928089be8ca72beb86'},Chimpanzee,Pan troglodytes,"Angola, Benin, Burkina Faso, Burundi, Cameroon...",EN,{'0': {'text': 'The Chimpanzee Sanctuary & Wil...,2020-06-18
2273,{'$oid': '5ed34a668089be8ca72beb87'},Common chimpanzee,Pan troglodytes,"Angola, Benin, Burkina Faso, Burundi, Cameroon...",EN,{},2020-06-18
2274,{'$oid': '5ed352f08089be8ca72beb8a'},@TRAFFIC_WLTrade,@TRAFFIC_WLTrade,,,{'0': {'text': 'Pangolins are trafficked for t...,2020-06-18
2275,{'$oid': '5ed355ec8089be8ca72beb8b'},@IlWildTrade,@IlWildTrade,,,"{'0': {'text': 'india Updated: May 03, 2016 1...",2020-06-18


In [4]:
# Drop Twitter handles as they are not species names
df = df[df.scientific_name.str.contains('@') == False]
df.reset_index(drop = True, inplace = True)

In [5]:
# Add new column for storing the values article amount found under each species type 

for i, row in df.iterrows():
    df.at[i, 'article_amt'] = len(row.articles.values())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.at[i, 'article_amt'] = len(row.articles.values())


In [10]:
# Two lists that contain the 5 charismatic species and 5 EDGE species
charismatic = ["Panthera tigris", "Loxodonta africana", "Ailuropoda melanoleuca", "Gavialis gangeticus", "Platanista gangetica"]
edge = ["Andrias davidianus", "Shinisaurus crocodilurus", "Houbaropsis bengalensis", "Manis pentadactyla", "Tapirus baridii"]


In [21]:
def list_to_df(lst, df):
    ''''''
    ## Create dfs out of list 
    ''''''
    # creating a blank pandas df
    blank = pd.DataFrame()
    for spec in lst:
        new = df.loc[df['scientific_name'].isin([spec])]
        blank = blank.append(new)
        
    
    # Group dataframe by the scientific name that remains consistent for all rows 
    grouped = blank.groupby('scientific_name')

    # Calculate the totals of the group species
    article_totals = grouped.article_amt.sum().round()
    
    # create a dataframe of the articles found under each species instance 
    # How to bring over the species names that have been "identified" for each article? **

    media = pd.DataFrame()

    for i in blank.articles:
        for x in i.values():
            nd = pd.DataFrame.from_dict(x)
            media = media.append(nd)
        
        
    # keep only unique article instances
    media = media.drop_duplicates(subset=['text'])
    
    return media

In [23]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [38]:
media = list_to_df(edge, df)


In [42]:
def categorise(row):  
    if 'pangolin' in row["text"]:
        return 'Pangolin'
    elif 'salamander' in row["text"]:
        return 'Chinese Giant Salamander'
    elif 'florican' in row["text"]:
        return 'Bengal Florican'
    return 'Chinese Crocodile Lizard'


media['species'] = media.apply(lambda row: categorise(row), axis=1)

In [44]:
# Transform date column into DateTime
media["date"] = pd.to_datetime(media["date"])
# Sort by oldest to newest
media = media.sort_values(by="date")

media.head()

Unnamed: 0,text,title,link,image,date,ner,species
0,"By Richard Black\n\nEnvironment Correspondent,...",Hunting threat to big amphibians,https://news.google.com/articles/CBMiNWh0dHA6L...,http://newsimg.bbc.co.uk/media/images/40816000...,2005-09-19,{'sent': 'By Richard Black Environment Corres...,Chinese Giant Salamander
0,Steven Brooks grew up in the Hudson Valley of ...,"New Yorker looks to buy a lizard, gets whole f...",https://news.google.com/articles/CBMiR2h0dHBzO...,https://www.al.com/resizer/VXCMJHahaq-eEg50p6m...,2010-09-07,{'sent': 'Steven Brooks grew up in the Hudson ...,Chinese Crocodile Lizard
0,Houston Zoo takes action when animals get stre...,Houston Zoo takes action when animals get stre...,https://news.google.com/articles/CBMib2h0dHBzO...,https://s.hdnux.com/photos/04/50/61/1215605/3/...,2011-08-01,{'sent': 'Houston Zoo takes action when animal...,Chinese Crocodile Lizard
0,Kolkata : Fifteen Indian bird species are part...,15 Indian bird species among globally endanger...,https://news.google.com/articles/CBMic2h0dHBzO...,https://images.livemint.com/rf/Image-621x414/L...,2014-04-19,"{'sent': 'Kolkata :', 'ents': {'PERSON': ['Kol...",Chinese Crocodile Lizard
0,Entire branches of the tree of life are in dan...,Threatened Species 'Red List' Warns 90 Percent...,https://news.google.com/articles/CBMieGh0dHBzO...,https://www.nationalgeographic.com/content/dam...,2014-06-13,"{'sent': 'The list, which is managed by the In...",Chinese Crocodile Lizard


In [48]:
# Save articles dataframe
media.to_csv("5edge_articles.csv")

In [26]:
cmedia = list_to_df(charismatic, df)

In [46]:
# Re-assign labels for the dataframe that better communicates what species are mentioned
def categorise(row):  
    if 'tiger' in row["text"]:
        return 'Tiger'
    elif 'elephant' in row["text"]:
        return 'Elephant'
    elif 'panda' in row["text"]:
        return 'Giant White Panda'
    elif 'gharial' in row["text"]:
        return 'Gharial'
    return 'Indus River Dolphin'


cmedia['species'] = cmedia.apply(lambda row: categorise(row), axis=1)

# Transform date column into DateTime
cmedia["date"] = pd.to_datetime(cmedia["date"])
# Sort by oldest to newest
cmedia = cmedia.sort_values(by="date")

cmedia.head()

Unnamed: 0,text,title,link,image,date,ner,species
0,A Gharial crocodile rests besides a pond at hi...,"Indian crocodiles ""video blog"" for survival",https://news.google.com/articles/CBMicWh0dHBzO...,https://s1.reutersmedia.net/resources/r/?m=02&...,2009-11-09,{'sent': 'A Gharial crocodile rests besides a ...,Gharial
0,What is the African elephant?\n\nAfrican eleph...,African elephant,https://news.google.com/articles/CBMiRmh0dHBzO...,https://www.nationalgeographic.com/content/dam...,2010-09-10,"{'sent': 'Currently, most still consider them ...",Elephant
0,"NARORA, India (Reuters Life!) - As the sun set...",Conservation and religion join to save Ganges ...,https://news.google.com/articles/CBMifWh0dHBzO...,https://s4.reutersmedia.net/resources_v2/image...,2010-10-28,"{'sent': 'NARORA, India (', 'ents': {'ORG': ['...",Indus River Dolphin
0,The African elephant is actually two different...,African Elephants Really Two Wildly Different ...,https://news.google.com/articles/CBMiYWh0dHBzO...,https://www.nationalgeographic.com/content/dam...,2010-12-22,{'sent': 'The African elephant is actually two...,Elephant
0,"This bizarre crocodilian from northern India, ...",Where the weird things are: The gharial - a cr...,https://news.google.com/articles/CBMieGh0dHBzO...,https://news.google.com/img/shortcut-icons/fav...,2011-01-29,{'sent': 'This bizarre crocodilian from northe...,Gharial


In [49]:
cmedia.to_csv("5charismatic_articles.csv")