In [1]:
import os
import random
import numpy as np
import pandas as pd
from osgeo import gdal, gdalconst
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Read in NLCD landcover, permanent forest loss, population density, and GFW data 

directory = 'D:\\788P\\PG'

nlcd04_file = os.path.join(directory, "nlcd\\NLCD04.tif")
nlcd08_file = os.path.join(directory, "nlcd\\NLCD08.tif")
nlcd13_file = os.path.join(directory, "nlcd\\NLCD13.tif")
nlcd16_file = os.path.join(directory, "nlcd\\NLCD16.tif")

loss06_file = os.path.join(directory, "loss\\Loss06.tif")
loss11_file = os.path.join(directory, "loss\\Loss11.tif")
loss16_file = os.path.join(directory, "loss\\Loss16.tif")

pop05_file = os.path.join(directory, "pop\\Pop05.tif")
pop10_file = os.path.join(directory, "pop\\Pop10.tif")
pop15_file = os.path.join(directory, "pop\\Pop15.tif")

gfw_loss_file = os.path.join(directory, "GFW_lossyear_2019.tif")

# Make sure the datasets are read in appropriately and the same extent

nlcd_ds = gdal.Open(nlcd04_file, gdalconst.GA_ReadOnly)
nlcd04 = gdal.Open(nlcd04_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(nlcd04.shape)
nlcd08 = gdal.Open(nlcd08_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(nlcd08.shape)
nlcd13 = gdal.Open(nlcd13_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(nlcd13.shape)
nlcd16 = gdal.Open(nlcd16_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(nlcd16.shape)

loss06 = gdal.Open(loss06_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(loss06.shape)
loss11 = gdal.Open(loss11_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(loss11.shape)
loss16 = gdal.Open(loss16_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(loss16.shape)

pop05 = gdal.Open(pop05_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(pop05.shape)
pop10 = gdal.Open(pop10_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(pop05.shape)
pop15 = gdal.Open(pop15_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(pop05.shape)

gfw_loss = gdal.Open(gfw_loss_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(gfw_loss.shape)

# Save the extent, projection, and geotransform information of the rasters
cols = nlcd_ds.RasterXSize
rows = nlcd_ds.RasterYSize
geot = nlcd_ds.GetGeoTransform()
proj = nlcd_ds.GetProjection()

(2221, 1236)
(2221, 1236)
(2221, 1236)
(2221, 1236)
(2221, 1236)
(2221, 1236)
(2221, 1236)
(2221, 1236)
(2221, 1236)
(2221, 1236)
(2221, 1236)


In [2]:
# Create a dataframe that stores the training data information: 
# permanent forest loss or not, landcover type, landcover types of the 9 neighbors
train = pd.DataFrame()
ForestLoss = []
LC = []
N1 = []
N2 = []
N3 = []
N4 = []
N6 = []
N7 = []
N8 = []
N9 = []
Pop = []

In [3]:
# Get the permanent loss pixels between 05-06
# Save their information
for x in range (0, rows):
    for y in range (0, cols):
        if loss06[x][y] == 1:
            ForestLoss.append(loss06[x][y])
            LC.append(nlcd04[x][y])
            N1.append(nlcd04[x-1][y-1])
            N2.append(nlcd04[x-1][y])
            N3.append(nlcd04[x-1][y+1])
            N4.append(nlcd04[x][y-1])
            N6.append(nlcd04[x][y+1])
            N7.append(nlcd04[x+1][y-1])
            N8.append(nlcd04[x+1][y])
            N9.append(nlcd04[x+1][y+1])
            Pop.append(pop05[x][y])

In [4]:
# Collect 20,000 non-forest-loss samples from nlcd 2004 landcover
# The samples are randomly selected
i = 0
while i<20000:
    x = random.randint(1, rows-1)
    y = random.randint(1, cols-1)
    if loss06[x][y] == 0:
        i = i + 1
        ForestLoss.append(loss06[x][y])
        LC.append(nlcd04[x][y])
        N1.append(nlcd04[x-1][y-1])
        N2.append(nlcd04[x-1][y])
        N3.append(nlcd04[x-1][y+1])
        N4.append(nlcd04[x][y-1])
        N6.append(nlcd04[x][y+1])
        N7.append(nlcd04[x+1][y-1])
        N8.append(nlcd04[x+1][y])
        N9.append(nlcd04[x+1][y+1])
        Pop.append(pop05[x][y])

In [5]:
# Get the permanent loss pixels between 09-11
# Save their information
for x in range (0, rows):
    for y in range (0, cols):
        if loss11[x][y] == 1:
            ForestLoss.append(loss11[x][y])
            LC.append(nlcd08[x][y])
            N1.append(nlcd08[x-1][y-1])
            N2.append(nlcd08[x-1][y])
            N3.append(nlcd08[x-1][y+1])
            N4.append(nlcd08[x][y-1])
            N6.append(nlcd08[x][y+1])
            N7.append(nlcd08[x+1][y-1])
            N8.append(nlcd08[x+1][y])
            N9.append(nlcd08[x+1][y+1])
            Pop.append(pop10[x][y])

In [6]:
# Collect 20,000 non-forest-loss samples from nlcd 2008 landcover
# The samples are randomly selected
while i<20000:
    x = random.randint(1, rows-1)
    y = random.randint(1, cols-1)
    if loss11[x][y] == 0:
        i = i + 1
        ForestLoss.append(loss11[x][y])
        LC.append(nlcd08[x][y])
        N1.append(nlcd08[x-1][y-1])
        N2.append(nlcd08[x-1][y])
        N3.append(nlcd08[x-1][y+1])
        N4.append(nlcd08[x][y-1])
        N6.append(nlcd08[x][y+1])
        N7.append(nlcd08[x+1][y-1])
        N8.append(nlcd08[x+1][y])
        N9.append(nlcd08[x+1][y+1])
        Pop.append(pop10[x][y])

In [7]:
# Get the permanent loss pixels between 14-16
# Save their information
for x in range (0, rows):
    for y in range (0, cols):
        if loss16[x][y] == 1:
            ForestLoss.append(loss16[x][y])
            LC.append(nlcd13[x][y])
            N1.append(nlcd13[x-1][y-1])
            N2.append(nlcd13[x-1][y])
            N3.append(nlcd13[x-1][y+1])
            N4.append(nlcd13[x][y-1])
            N6.append(nlcd13[x][y+1])
            N7.append(nlcd13[x+1][y-1])
            N8.append(nlcd13[x+1][y])
            N9.append(nlcd13[x+1][y+1])
            Pop.append(pop15[x][y])

In [8]:
# Collect 20,000 non-forest-loss samples from nlcd 2013 landcover
# The samples are randomly selected
while i<20000:
    x = random.randint(1, rows-1)
    y = random.randint(1, cols-1)
    if loss16[x][y] == 0:
        i = i + 1
        ForestLoss.append(loss16[x][y])
        LC.append(nlcd13[x][y])
        N1.append(nlcd13[x-1][y-1])
        N2.append(nlcd13[x-1][y])
        N3.append(nlcd13[x-1][y+1])
        N4.append(nlcd13[x][y-1])
        N6.append(nlcd13[x][y+1])
        N7.append(nlcd13[x+1][y-1])
        N8.append(nlcd13[x+1][y])
        N9.append(nlcd13[x+1][y+1])
        Pop.append(pop15[x][y])

In [9]:
train['ForestLoss'] = ForestLoss
train['LandCover'] = LC
train['N1'] = N1
train['N2'] = N2
train['N3'] = N3
train['N4'] = N4
train['N6'] = N6
train['N7'] = N7
train['N8'] = N8
train['N9'] = N9
train['Pop'] = Pop
train
# -9999 and 255 represent "no data"

Unnamed: 0,ForestLoss,LandCover,N1,N2,N3,N4,N6,N7,N8,N9,Pop
0,1.0,41.0,-9999.0,21.0,21.0,-9999.0,41.0,41.0,41.0,41.0,1031.512207
1,1.0,41.0,21.0,21.0,22.0,41.0,22.0,41.0,41.0,21.0,1031.512207
2,1.0,41.0,-9999.0,-9999.0,41.0,-9999.0,41.0,-9999.0,81.0,41.0,1031.512207
3,1.0,41.0,-9999.0,41.0,41.0,41.0,41.0,81.0,41.0,41.0,1031.512207
4,1.0,41.0,41.0,41.0,22.0,41.0,21.0,41.0,41.0,41.0,1031.512207
...,...,...,...,...,...,...,...,...,...,...,...
33816,1.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,43.0,21.495285
33817,1.0,42.0,71.0,42.0,41.0,71.0,41.0,71.0,42.0,42.0,21.495285
33818,1.0,42.0,71.0,42.0,41.0,71.0,42.0,71.0,42.0,42.0,21.495285
33819,1.0,42.0,71.0,42.0,42.0,71.0,42.0,71.0,41.0,43.0,21.495285


In [10]:
# mask -9999 and 255 with NaN
columns1 = ['LandCover', 'N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N8', 'N9']
columns2 = ['LandCover', 'N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N8', 'N9', 'Pop']
train[columns1] = train[columns1].mask(train[columns1] > 100, np.nan)
train[columns2] = train[columns2].mask(train[columns2] < 0, np.nan)
train

Unnamed: 0,ForestLoss,LandCover,N1,N2,N3,N4,N6,N7,N8,N9,Pop
0,1.0,41.0,,21.0,21.0,,41.0,41.0,41.0,41.0,1031.512207
1,1.0,41.0,21.0,21.0,22.0,41.0,22.0,41.0,41.0,21.0,1031.512207
2,1.0,41.0,,,41.0,,41.0,,81.0,41.0,1031.512207
3,1.0,41.0,,41.0,41.0,41.0,41.0,81.0,41.0,41.0,1031.512207
4,1.0,41.0,41.0,41.0,22.0,41.0,21.0,41.0,41.0,41.0,1031.512207
...,...,...,...,...,...,...,...,...,...,...,...
33816,1.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,43.0,21.495285
33817,1.0,42.0,71.0,42.0,41.0,71.0,41.0,71.0,42.0,42.0,21.495285
33818,1.0,42.0,71.0,42.0,41.0,71.0,42.0,71.0,42.0,42.0,21.495285
33819,1.0,42.0,71.0,42.0,42.0,71.0,42.0,71.0,41.0,43.0,21.495285


In [11]:
# Drop rows with NAN
train = train.dropna()
train

Unnamed: 0,ForestLoss,LandCover,N1,N2,N3,N4,N6,N7,N8,N9,Pop
1,1.0,41.0,21.0,21.0,22.0,41.0,22.0,41.0,41.0,21.0,1031.512207
4,1.0,41.0,41.0,41.0,22.0,41.0,21.0,41.0,41.0,41.0,1031.512207
5,1.0,41.0,41.0,41.0,41.0,81.0,41.0,21.0,21.0,21.0,1031.512207
6,1.0,41.0,41.0,41.0,21.0,41.0,41.0,21.0,21.0,41.0,1031.512207
9,1.0,41.0,41.0,22.0,21.0,41.0,21.0,41.0,21.0,22.0,763.727661
...,...,...,...,...,...,...,...,...,...,...,...
33816,1.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,43.0,21.495285
33817,1.0,42.0,71.0,42.0,41.0,71.0,41.0,71.0,42.0,42.0,21.495285
33818,1.0,42.0,71.0,42.0,41.0,71.0,42.0,71.0,42.0,42.0,21.495285
33819,1.0,42.0,71.0,42.0,42.0,71.0,42.0,71.0,41.0,43.0,21.495285


In [12]:
# Create a dataframe storing test data information:
# Landcover and landcover of neighbors
test = pd.DataFrame()
LandCover = []
neighbor1 = []
neighbor2 = []
neighbor3 = []
neighbor4 = []
neighbor6 = []
neighbor7 = []
neighbor8 = []
neighbor9 = []
PopDen = []
nrow = []
ncol = []

In [13]:
# Collect test data from nlcd 2016 landcover
for x in range (1, rows-1):
    for y in range (1, cols-1):
        LandCover.append(nlcd16[x][y])
        neighbor1.append(nlcd16[x-1][y-1])
        neighbor2.append(nlcd16[x-1][y])
        neighbor3.append(nlcd16[x-1][y+1])
        neighbor4.append(nlcd16[x][y-1])
        neighbor6.append(nlcd16[x][y+1])
        neighbor7.append(nlcd16[x+1][y-1])
        neighbor8.append(nlcd16[x+1][y])
        neighbor9.append(nlcd16[x+1][y+1])
        PopDen.append(pop15[x][y])
        nrow.append(x)
        ncol.append(y)

In [14]:
test['LandCover'] = LandCover
test['N1'] = neighbor1
test['N2'] = neighbor2
test['N3'] = neighbor3
test['N4'] = neighbor4
test['N6'] = neighbor6
test['N7'] = neighbor7
test['N8'] = neighbor8
test['N9'] = neighbor9
test['Pop'] = PopDen
test['nrow'] = nrow
test['ncol'] = ncol
test
# -9999 and 255 means "no data"

Unnamed: 0,LandCover,N1,N2,N3,N4,N6,N7,N8,N9,Pop,nrow,ncol
0,255,255,255,255,255,255,255,255,255,-9999.0,1,1
1,255,255,255,255,255,255,255,255,255,-9999.0,1,2
2,255,255,255,255,255,255,255,255,255,-9999.0,1,3
3,255,255,255,255,255,255,255,255,255,-9999.0,1,4
4,255,255,255,255,255,255,255,255,255,-9999.0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
2738241,255,255,255,255,255,255,255,255,255,-9999.0,2219,1230
2738242,255,255,255,255,255,255,255,255,255,-9999.0,2219,1231
2738243,255,255,255,255,255,255,255,255,255,-9999.0,2219,1232
2738244,255,255,255,255,255,255,255,255,255,-9999.0,2219,1233


In [15]:
# Mask -9999 and 255 with NaN
# Drop the rows with NaN

columns1 = ['LandCover', 'N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N8', 'N9']
columns2 = ['LandCover', 'N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N8', 'N9', 'Pop']
test[columns1] = test[columns1].mask(test[columns1] > 100, np.nan)
test[columns2] = test[columns2].mask(test[columns2] < 0, np.nan)
test = test.dropna()
'''
Equivalent to:
test['nrow'] = test['nrow'].astype(str)
test['ncol'] = test['ncol'].astype(str)
cols = test.select_dtypes(include=['number']).columns
test[cols] = test[cols].mask(test[cols] > 10)
test = test.dropna()
'''

test

Unnamed: 0,LandCover,N1,N2,N3,N4,N6,N7,N8,N9,Pop,nrow,ncol
1857,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,2,624
3090,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,3,623
3091,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,3,624
3092,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,3,625
3093,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,3,626
...,...,...,...,...,...,...,...,...,...,...,...,...
2693767,23.0,22.0,22.0,22.0,22.0,22.0,24.0,24.0,24.0,0.000000,2183,1180
2693768,22.0,22.0,22.0,22.0,23.0,22.0,24.0,24.0,22.0,0.000000,2183,1181
2693769,22.0,22.0,22.0,22.0,22.0,22.0,24.0,22.0,22.0,0.000000,2183,1182
2693770,22.0,22.0,22.0,95.0,22.0,95.0,22.0,22.0,90.0,0.000000,2183,1183


In [16]:
# Prepare the training data and testing data
x_train = train[['LandCover', 'N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N8', 'N9', 'Pop']]
y_train = train[['ForestLoss']]
x_test = test[['LandCover', 'N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N8', 'N9', 'Pop']]

# Slipt the training data into 70% in-sample training and 30% in-sample testing
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.3)

# Train the randome forest model with in-sample training data
rf=RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=1)
rf.fit(X_train,Y_train)
# Test the model with in-sample testing data, and output the accuracy
Y_pred=rf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))
# Good in-sample accuracy

  


Accuracy: 0.9103720977390246


In [17]:
# Project the real testing data with the model 
y_pred = rf.predict(x_test)
y_pred

array([1., 1., 1., ..., 0., 0., 0.])

In [19]:
# Save the projection result into the dataframe
test['ForestLoss'] = y_pred
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,LandCover,N1,N2,N3,N4,N6,N7,N8,N9,Pop,nrow,ncol,ForestLoss
1857,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,2,624,1.0
3090,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,3,623,1.0
3091,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,3,624,1.0
3092,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,3,625,1.0
3093,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,219.086731,3,626,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2693767,23.0,22.0,22.0,22.0,22.0,22.0,24.0,24.0,24.0,0.000000,2183,1180,0.0
2693768,22.0,22.0,22.0,22.0,23.0,22.0,24.0,24.0,22.0,0.000000,2183,1181,0.0
2693769,22.0,22.0,22.0,22.0,22.0,22.0,24.0,22.0,22.0,0.000000,2183,1182,0.0
2693770,22.0,22.0,22.0,95.0,22.0,95.0,22.0,22.0,90.0,0.000000,2183,1183,0.0


In [21]:
# Output the projection result into a new raster
out_dir = os.path.join(directory, "rf_result\\projected_loss2019_7_100.tif")
print(out_dir)

driver = gdal.GetDriverByName("GTiff")
new_ds = driver.Create(out_dir, cols, rows, 1, gdal.GDT_Float32)
new_ds.SetGeoTransform(geot)
new_ds.SetProjection(proj)

outband = new_ds.GetRasterBand(1)
outarray = new_ds.ReadAsArray()

# Write in the new raster with the projection result
for i in range(len(test)):
    x = int(test.iloc[i]['nrow'])
    y = int(test.iloc[i]['ncol'])
    value = int(test.iloc[i]['ForestLoss'])
    outarray[x][y] = value

outband.WriteArray(outarray)
outband.FlushCache()
del new_ds

D:\788P\PG\rf_result\projected_loss2019_7_100.tif


In [22]:
# Validate the projection result by GFW data by 
# counting how many pixels are agreed and disagreed by the projection result and GFW

projected_loss_file = os.path.join(directory, "rf_result\\projected_loss2019_7_100.tif")
projected_loss = gdal.Open(projected_loss_file, gdalconst.GA_ReadOnly).ReadAsArray()
print(projected_loss.shape)

overlap = 0
gfw_only = 0
projected_only = 0

for x in range (0, rows):
    for y in range (0, cols):
        if projected_loss[x][y] == 1 and gfw_loss[x][y] >= 17: # forest loss in both
            overlap = overlap + 1
        if projected_loss[x][y] == 0 and gfw_loss[x][y] >= 17 and gfw_loss[x][y] <= 19: # forest loss only in GFW
            gfw_only = gfw_only + 1
        if projected_loss[x][y] == 1 and gfw_loss[x][y] < 17 and gfw_loss[x][y] > 0: # forest loss only in projection result
            projected_only = projected_only + 1

            
print(overlap)
print(projected_only)
print(gfw_only)

(2221, 1236)
1755
6571
5862
