## Installs.

In [5]:
# installs.
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 18.3MB/s eta 0:00:01[K     |████████▏                       | 20kB 1.3MB/s eta 0:00:01[K     |████████████▏                   | 30kB 1.5MB/s eta 0:00:01[K     |████████████████▎               | 40kB 1.8MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 1.5MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 1.6MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 1.8MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 1.7MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


## Imports.

In [0]:
# imports.
import pandas as pd
import numpy as np
import requests
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [0]:
# read in the cleaned original data.
df_tr = pd.read_csv('https://raw.githubusercontent.com/CVanchieri/DataSets/master/WorldBankDeforestation/WorldBank_1990_2018.csv')

## Train Data.

In [0]:
# Set train data.
train = df_tr.copy()
train = train.drop(columns=['Unnamed: 0'])
train.head()

Unnamed: 0,Country Code,Year,Agricultural land (sq. km),GDP per capita growth (annual %),Livestock production index (2004-2006 = 100),Urban population,Crop production index (2004-2006 = 100),Food production index (2004-2006 = 100),Ores and metals exports (% of merchandise exports),Electric power consumption (kWh per capita),Forest area (% of land area)
0,ABW,1990,20.0,2.09,67.49,31273.0,71.69,69.36,1.1,1237.52,2.33
1,AFG,1990,380400.0,1.32,70.69,2628554.0,66.64,68.12,2.75,1237.52,2.07
2,AGO,1990,574040.0,-6.66,70.11,4400964.0,29.25,37.91,6.22,53.17,48.91
3,ALB,1990,11210.0,-11.19,57.97,1197222.0,84.36,68.73,2.75,552.25,28.79
4,AND,1990,230.0,-0.14,67.49,51627.0,71.69,69.36,2.75,1237.52,34.04


### Train/Val Split.

In [0]:
# Set variables.
features = train.columns[:-1].tolist()
target = 'Forest area (% of land area)'

X = train.drop(columns=target)
y = train[target]

In [0]:
# Use train/val split on the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((4899, 10), (4899,), (1225, 10), (1225,), (1532, 10), (1532,))

### Random Forest Pipeline.

In [0]:
# random forest pipeline.
pipeline = make_pipeline(
    ce.OneHotEncoder(), 
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
)
pipeline.fit(X_train, y_train)

print ('Training Accuracy', pipeline.score(X_train, y_train))
print('Validation Accuracy', pipeline.score(X_val, y_val))
y_pred = pipeline.predict(X_val)

Training Accuracy 0.9986954216649627
Validation Accuracy 0.9929648523153405


## Test Data.

In [0]:
# read in the cleaned predictions data frame.
df_te = pd.read_csv('https://raw.githubusercontent.com/CVanchieri/DataSets/master/WorldBankDeforestation/WorldBank_2019_2120.csv')

In [0]:
# Set the test data.
test = df_te.copy()
test = test.drop(columns=['Unnamed: 0'])
test.head()

Unnamed: 0,Country Code,Year,Agricultural land (sq. km),GDP per capita growth (annual %),Livestock production index (2004-2006 = 100),Urban population,Crop production index (2004-2006 = 100),Food production index (2004-2006 = 100),Ores and metals exports (% of merchandise exports),Electric power consumption (kWh per capita),Forest area (% of land area)
0,ABW,2019,20.0,-0.75,123.79,48057.97,123.08,125.53,6.37,2712.14,2.33
1,AFG,2019,378959.53,3.31,108.26,9262840.16,148.75,130.08,2.05,2712.14,2.07
2,AGO,2019,591467.93,2.39,146.48,19384636.41,226.44,208.66,2.28,308.43,46.05
3,ALB,2019,11904.29,6.65,120.6,1709176.32,178.59,151.17,12.88,2608.4,28.14
4,AND,2019,184.99,0.81,123.79,75938.01,123.08,125.53,3.59,2712.14,34.04


### Random Forest Pipeline.

In [0]:
# set the variables.
features = test.columns[:-1].tolist()
target = 'Forest area (% of land area)'

X_test = test[features]
y_test = test[target]

pipeline.fit(X_test, y_test)
y_pred = pipeline.predict(X_test)

print ('Test Accuracy', pipeline.score(X_test, y_test))
y_pred

Test Accuracy 0.9998526451418099


array([  2.4116666 ,   2.2803321 ,  44.96266689, ...,   7.47180292,
        42.56087829, -49.36206047])

In [0]:
# Set the predictions on the data frame.
test['Forest area (% of land area)'] = pd.Series(y_pred)
# Merge the train and test data for one dataframe.
predictions = pd.concat([train, test])

In [0]:
# show the data frame shape.
print(predictions.shape)
# show the data frame with headers.
predictions.head()

(34584, 11)


Unnamed: 0,Country Code,Year,Agricultural land (sq. km),GDP per capita growth (annual %),Livestock production index (2004-2006 = 100),Urban population,Crop production index (2004-2006 = 100),Food production index (2004-2006 = 100),Ores and metals exports (% of merchandise exports),Electric power consumption (kWh per capita),Forest area (% of land area)
0,ABW,1990,20.0,2.09,67.49,31273.0,71.69,69.36,1.1,1237.52,2.33
1,AFG,1990,380400.0,1.32,70.69,2628554.0,66.64,68.12,2.75,1237.52,2.07
2,AGO,1990,574040.0,-6.66,70.11,4400964.0,29.25,37.91,6.22,53.17,48.91
3,ALB,1990,11210.0,-11.19,57.97,1197222.0,84.36,68.73,2.75,552.25,28.79
4,AND,1990,230.0,-0.14,67.49,51627.0,71.69,69.36,2.75,1237.52,34.04


In [0]:
# Function to add in some country names.
def label_race (row):
   if row['Country Code'] == 'USA' :
      return 'United States of America'
   if row['Country Code'] == 'CHA' :
      return 'China'
   if row['Country Code'] == 'CAN' :
      return 'Canada'
   if row['Country Code'] == 'AUS' :
      return 'Australia'
   if row['Country Code'] == 'ARG' :
      return 'Argentina'
   if row['Country Code'] == 'BRA':
      return 'Brazil'
   if row['Country Code'] == 'BEL':
      return 'Belgium'   
   if row['Country Code'] == 'CHL':
      return 'Chile'
   if row['Country Code'] == 'DEU' :
      return 'Germany'
   if row['Country Code'] == 'ZAF' :
      return 'South Africa'
   if row['Country Code']  == 'NZL':
      return 'New Zealnd'
   if row['Country Code'] == 'GBR':
      return 'United Kingdom'
   if row['Country Code'] == 'IND' :
      return 'India'
   if row['Country Code'] == 'KHM' :
      return 'Cambodia'
   if row['Country Code']  == 'THA':
      return 'Thailand'
   if row['Country Code'] == 'VNM':
      return 'Vietnam'
   if row['Country Code'] == 'HIC' :
      return 'High Income Countries'
   if row['Country Code']  == 'MIC':
      return 'Middle Income Countries'
   if row['Country Code'] == 'LIC':
      return 'Low Income Countries'
   return 'Other'

In [0]:
# make a copy of the dataframe.
final_c = predictions.copy()

In [0]:
# Apply the function to the datarame.
final_c.apply (lambda row: label_race(row), axis=1)
final_c['Country Name'] = final_c.apply (lambda row: label_race(row), axis=1)
final_c = final_c[['Country Name',
                                  'Country Code',
                                  'Year',
                                  'Agricultural land (sq. km)',
                                  'Electric power consumption (kWh per capita)',
                                  'GDP per capita growth (annual %)',
                                  'Livestock production index (2004-2006 = 100)',
                                  'Ores and metals exports (% of merchandise exports)',
                                  'Urban population',
                                  'Crop production index (2004-2006 = 100)',
                                  'Food production index (2004-2006 = 100)',
                                  'Forest area (% of land area)']]

In [0]:
# Clean the dataframe.
final_c = final_c[final_c['Country Name'] != 'Other']
final_c = final_c[['Country Name',
 'Country Code',
 'Year',
 'Agricultural land (sq. km)',
 'Electric power consumption (kWh per capita)',
 'GDP per capita growth (annual %)',
 'Livestock production index (2004-2006 = 100)',
 'Ores and metals exports (% of merchandise exports)',
 'Urban population',
 'Crop production index (2004-2006 = 100)',
 'Food production index (2004-2006 = 100)',
 'Forest area (% of land area)']]

In [0]:
print(final_c.shape)
final_c.tail()

(2358, 12)


Unnamed: 0,Country Name,Country Code,Year,Agricultural land (sq. km),Electric power consumption (kWh per capita),GDP per capita growth (annual %),Livestock production index (2004-2006 = 100),Ores and metals exports (% of merchandise exports),Urban population,Crop production index (2004-2006 = 100),Food production index (2004-2006 = 100),Forest area (% of land area)
26842,New Zealnd,NZL,2120,-135584.43,10434.64,2.23,300.9,-3.78,8517411.94,299.57,322.48,44.84
26895,Thailand,THA,2120,252837.49,9786.31,-9.02,310.28,4.64,107322891.44,375.28,334.88,44.16
26913,United States of America,USA,2120,3297611.47,16616.36,-0.82,226.75,8.22,563473525.03,265.25,283.85,38.27
26919,Vietnam,VNM,2120,340196.12,7256.84,3.86,670.84,-2.85,106616457.51,512.49,513.35,127.97
26925,South Africa,ZAF,2120,960131.69,4663.79,7.44,391.42,130.12,105234320.88,246.6,323.85,7.47


In [0]:
# Download predictions result as a csv.
from google.colab import files 
final_c.to_csv('Deforestation_Predictions.csv', index=False)
files.download('Deforestation_Predictions.csv')

In [0]:
# Download predictions result as a json.
final_c.to_json('Deforestation_Predictions.json', orient= 'records')
files.download('Deforestation_Predictions.json')

## Connecting Data.

In [0]:
# JSON file.
df = pd.read_json("https://raw.githubusercontent.com/Lambda-School-Labs/earth-dashboard-ds/feature/DeforestationRC2VisualizationNotebooks/Notebooks/Deforestation_Predictions.json")
print(df.shape)
df.head()

(2358, 12)


Unnamed: 0,Country Name,Country Code,Year,Agricultural land (sq. km),Electric power consumption (kWh per capita),GDP per capita growth (annual %),Livestock production index (2004-2006 = 100),Ores and metals exports (% of merchandise exports),Urban population,Crop production index (2004-2006 = 100),Food production index (2004-2006 = 100),Forest area (% of land area)
0,Argentina,ARG,1990,1275650.0,1303.98,-3.87,84.39,2.45,28373007.0,53.11,63.01,12.71
1,Australia,AUS,1990,4644810.0,8527.23,2.05,86.42,20.81,14579227.0,59.46,69.98,16.73
2,Belgium,BEL,1990,78460.0,6380.31,2.83,67.49,4.22,9606261.0,71.69,69.36,31.08
3,Brazil,BRA,1990,2416080.0,1460.76,-4.84,44.88,13.56,110146163.0,59.02,51.4,65.41
4,Canada,CAN,1990,677680.0,16167.37,-1.33,68.1,8.57,21206427.0,82.97,74.11,38.3


In [0]:
# CSV file.
df = pd.read_csv("https://raw.githubusercontent.com/Lambda-School-Labs/earth-dashboard-ds/feature/DeforestationRC2VisualizationNotebooks/Notebooks/Deforestation_Predictions.csv")
print(df.shape)
df.head()

(2358, 12)


Unnamed: 0,Country Name,Country Code,Year,Agricultural land (sq. km),Electric power consumption (kWh per capita),GDP per capita growth (annual %),Livestock production index (2004-2006 = 100),Ores and metals exports (% of merchandise exports),Urban population,Crop production index (2004-2006 = 100),Food production index (2004-2006 = 100),Forest area (% of land area)
0,Argentina,ARG,1990,1275650.0,1303.98,-3.87,84.39,2.45,28373007.0,53.11,63.01,12.71
1,Australia,AUS,1990,4644810.0,8527.23,2.05,86.42,20.81,14579227.0,59.46,69.98,16.73
2,Belgium,BEL,1990,78460.0,6380.31,2.83,67.49,4.22,9606261.0,71.69,69.36,31.08
3,Brazil,BRA,1990,2416080.0,1460.76,-4.84,44.88,13.56,110146163.0,59.02,51.4,65.41
4,Canada,CAN,1990,677680.0,16167.37,-1.33,68.1,8.57,21206427.0,82.97,74.11,38.3


In [7]:
# JSON file.
url = "https://raw.githubusercontent.com/Lambda-School-Labs/earth-dashboard-ds/feature/DeforestationRC2VisualizationNotebooks/Notebooks/Deforestation_Predictions.json"
response = requests.get(url)
# show the status code.
print(response.status_code)
df = response.json()

df

200


[{'Agricultural land (sq. km)': 1275650.0,
  'Country Code': 'ARG',
  'Country Name': 'Argentina',
  'Crop production index (2004-2006 = 100)': 53.11,
  'Electric power consumption (kWh per capita)': 1303.9778990247,
  'Food production index (2004-2006 = 100)': 63.01,
  'Forest area (% of land area)': 12.7135335022,
  'GDP per capita growth (annual %)': -3.8741031654,
  'Livestock production index (2004-2006 = 100)': 84.39,
  'Ores and metals exports (% of merchandise exports)': 2.4468496926,
  'Urban population': 28373007.0,
  'Year': 1990},
 {'Agricultural land (sq. km)': 4644810.0,
  'Country Code': 'AUS',
  'Country Name': 'Australia',
  'Crop production index (2004-2006 = 100)': 59.46,
  'Electric power consumption (kWh per capita)': 8527.2280853906,
  'Food production index (2004-2006 = 100)': 69.98,
  'Forest area (% of land area)': 16.7320984601,
  'GDP per capita growth (annual %)': 2.0490820195,
  'Livestock production index (2004-2006 = 100)': 86.42,
  'Ores and metals expor