<a href="https://colab.research.google.com/github/sarrahrose04/PovertyMapping/blob/main/CNNModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Mount Google Drive so the CSV, imagery archive and saved models are reachable from Colab
from google.colab import drive
#force_remount=True re-mounts even if a previous session left the drive attached
drive.mount('/content/gdrive', force_remount=True)

In [None]:
#(Re)load the autoreload extension and reload imported modules before each cell runs
%reload_ext autoreload
%autoreload 2
#Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
#paste in csv file for binned luminosity
import pandas as pd
train_dataset = ""
#The holdout CSV path is derived from the training path by swapping the "train90" tag for "test10"
test_dataset = train_dataset.replace("train90","test10")

#If the tag is missing, replace() returns the same path and the holdout evaluation
#would silently run on the training data, so fail loudly instead
if test_dataset == train_dataset:
    raise ValueError(
        "train_dataset must contain 'train90' so the matching 'test10' file can be derived; "
        f"got {train_dataset!r}"
    )

df = pd.read_csv(train_dataset)

In [None]:
#Set id = rownumber as index of df
#Guarded so the cell can be re-run: once 'id' is the index it is no longer a column
#and a second set_index('id') would raise KeyError
if df.index.name != 'id':
    df = df.set_index('id')
df.head()

In [None]:
import os
import shutil
#Create the local 'data' folder; exist_ok=True makes re-running this cell safe
os.makedirs('data', exist_ok=True)

In [None]:
tar_file = "" #paste path of tar.gz file
#Strip both extensions (".tar.gz") to get the archive's base name, used as the imagery folder name
imagery_folder = os.path.basename(os.path.splitext(os.path.splitext(tar_file)[0])[0])
imagery_path = os.path.join('data', imagery_folder)

#Extract the archive into the local 'data' folder
shutil.unpack_archive(tar_file, 'data')

#Count no. of daytime imagery files extracted
import glob
#glob.glob takes a single pattern string, so join the folder and the wildcard first
jpg_count = str(len(glob.glob(os.path.join(imagery_path, "*.jpg"))))
print("Number of daytime imagery: " + jpg_count)

CNN Training

In [None]:
import fastai
from fastai import *
from fastai.vision import *
from fastai.metrics import error_rate
from fastai.callbacks import *

In [None]:
#Display the installed fastai version; this notebook was written against 1.0.61
fastai.__version__ #1.0.61

In [None]:
import re #for string manipulation

root_col = '/content/' #stores root directory path for daytime sat img
val_pct = 0.2 #percentage of dataset to be used for validation
label_col = 'bin_GMM' #CHECK names of column containing binned luminosity in dataset (from prev csv)
filename_col = 'filename' #CHECK names of column containing imagery filenames in dataset

#extract country_code, year, daytime satellite img source & img file resolution
#expected tag inside the archive name: AAA_9999_AA_999
tag_match = re.search(r"[A-Z]{3}_[0-9]{4}_[A-Z]{2}_[0-9]{3}", tar_file)
if tag_match is None:
    #without this check the failure is an opaque "'NoneType' object has no attribute 'group'"
    raise ValueError(
        "tar_file name must contain a tag shaped like AAA_9999_AA_999 "
        f"(country_year_source_resolution); got {tar_file!r}"
    )
country, year, day_sat, img_res = tag_match.group().split("_")

#assemble learner and CNN model filenames
learner_filename = "_".join(["CNN_LRNR_RES34",country,year,day_sat,str(img_res)]) + ".pkl"
modelWt_filename = "_".join(["CNN_TCNN_RES34",country,year,day_sat,str(img_res)])

print(learner_filename)
print(modelWt_filename)




In [None]:
#by default FastAI uses horizontal augmentation, we add some more
aug_tfms = [contrast(scale = (0.9, 1.11), p=0.9)
            ,dihedral()
            ,symmetric_warp(magnitude = (-0.2,0.2))
            ]
tfms = get_transforms(flip_vert = True,
                      max_lighting = 0.1,
                      xtra_tfms = aug_tfms,
                      )
#Define ImageDataBunch

data = ImageDataBunch.from_df(df = df, #using df to define training dataset
                              path = root_col, #root directory
                              folder = imagery_path,
                              valid_pct = val_pct, #20% of data used in validation
                              fn_col = filename_col, #filename column in dataset
                              label_col = label_col, #classes column in dataset
                              ds_tfms = tfms, #use transformations defined above
                              size = int(img_res) #image size
                              ).normalize(imagenet_stats) #use the normalization that was used to train pretrained model

In [None]:
#Display 5 rows of samples from the ImageDataBunch to sanity-check images and labels
data.show_batch(rows=5, figsize=(20,20))

In [None]:
#Create a CNN learner object with pre-trained model, training & validation datasets, metrics & loss function as arguments
#reference fastai docs
#ResNet34 backbone (matches the RES34 tag in the saved filenames) with error_rate as the tracked metric
learn = cnn_learner(data, models.resnet34, metrics=error_rate)

#saves weights of best training cycle in the batch into a .pth file
callbacks = [SaveModelCallback(learn, monitor = 'error_rate', mode='min', name=modelWt_filename),
             #displays a graph of training & validation dataset loss during training
             ShowGraph(learn),
             #stops the training batch after 3 consecutive training cycles did not improve the model
             EarlyStoppingCallback(learn, min_delta=0.0001, patience=3)
             ]
learn.callbacks = callbacks #functions executed when "event" occurs in training process



In [None]:
#First training stage: 14 epochs with the one-cycle schedule
learn.fit_one_cycle(14,wd=0.1) #weight decay: model regularisation technique which penalises parameters to prevent overfitting


In [None]:
#Run the learning-rate finder with only the last 2 layer groups trainable
learn.freeze_to(-2) #Unfreeze last 2 layer groups of model
learn.lr_find() #training with a cyclical lr eliminates need to experimentally find best values & schedule for global learning rates; vary between reasonable boundaries
learn.recorder.plot(suggestion=True) 

#Take note of range of learning rate before loss starts to rise

In [None]:
#Unfreeze the last 2 layer groups
learn.freeze_to(-2)
#Train for 6 more epochs & specify LR based on previous graph
#slice(lo, hi) spreads learning rates from 1e-6 (earliest trainable group) up to 1e-3 (last group)
learn.fit_one_cycle(6, max_lr=slice(1e-6, 1e-3), wd=0.1)

In [None]:
#Define interpretation methods for classification models
#The interpretation object is used below for the confusion matrix & plots of the highest-loss images
interp = ClassificationInterpretation.from_learner(learn)

#Extract all losses sorted from highest to lowest, with the index of each corresponding image
losses,idxs = interp.top_losses()

#Sanity check: validation dataset, losses and image indices should all have the same length
len(data.valid_ds) == len(losses)==len(idxs)


In [None]:
#Plot the 50 satellite images with the highest losses
#NOTE(review): these appear to be validation-set losses (the earlier check compares against len(data.valid_ds)) - confirm
#Take note of inconsistencies between input data & output data
interp.plot_top_losses(50, figsize=(35,35))

In [None]:
#List the file path of each of the 50 images with the largest loss values
losses, idxs = interp.top_losses(50)
worst_files = data.valid_ds.x.items[idxs]
for img_path in worst_files:
    print(img_path)

In [None]:
#Plot a confusion matrix (actual vs predicted class counts) from the interpretation object
interp.plot_confusion_matrix(figsize=(3,3), dpi=100)

In [None]:
#Present the list of largest non-diagonal entries of the confusion matrix (actual, predicted & no. of occurrences)
#min_val=2 keeps only class pairs that were confused at least twice
interp.most_confused(min_val=2)

In [None]:
#Function for dropping images from dataframe
#ImageDataBunch contains labels and image file path, remove imgs using filenames as subset parameters for dataframe
def drop_image(loss_index, frame=None, image_paths=None, fn_col=None):
    """Drop the rows of a dataframe that belong to the selected validation images.

    Args:
        loss_index: iterable of positions into ``image_paths`` (e.g. entries of
            ``idxs`` returned by ``interp.top_losses``).
        frame: dataframe to modify IN PLACE. Defaults to the notebook's ``df``.
        image_paths: indexable collection of image file paths. Defaults to
            ``data.valid_ds.x.items``.
        fn_col: name of the filename column. Defaults to the notebook's
            ``filename_col`` so the function stays in sync with the config cell.

    Returns:
        None. The matching rows and the dropped filenames are printed.
    """
    # Resolve defaults at call time so the notebook call drop_image(selected_index) keeps working
    if frame is None:
        frame = df
    if image_paths is None:
        image_paths = data.valid_ds.x.items
    if fn_col is None:
        fn_col = filename_col

    # The dataframe stores bare filenames, so strip the directory part of each path
    filename_list = [os.path.basename(image_paths[i]) for i in loss_index]

    # Build the membership mask once and reuse it
    mask = frame[fn_col].isin(filename_list)
    rows_to_drop = frame.loc[mask]

    #view data to be dropped
    print(rows_to_drop)

    #get filename & row index (captured before the in-place drop)
    df_filenames = rows_to_drop[fn_col]
    frame.drop(rows_to_drop.index, inplace=True)

    print("Image filenames dropped from dataframe:")
    for f in df_filenames:
        print(f)

In [None]:
#Print the indices of the images with the 50 highest losses (idxs comes from interp.top_losses(50))
#Based on image plot of 50 top losses, select the "anomalous" images to be removed (optional)
print("Row index of top 50 losses: ")
print(idxs)

In [None]:
#Fill in the indices (taken from the printed idxs) of the images to remove; an empty list is passed as-is
selected_index = []
drop_image(selected_index)

After removing the “anomalous” data, repeat the earlier steps: regenerate the ImageDataBunch, create the learner, and
train for 14 epochs on the cleaned dataset.

In [None]:
#Run the learning-rate finder with the last 3 layer groups trainable
learn.freeze_to(-3)
learn.lr_find()
learn.recorder.plot(suggestion=True) #Find best LR

In [None]:
#Train the last 3 layer groups for 6 epochs
learn.freeze_to(-3)
#get LR from above; wd is an argument of fit_one_cycle, not of slice()
learn.fit_one_cycle(6, max_lr=slice(1e-6, 1e-4), wd=0.1)

In [None]:
#Unfreeze the model and run the learning-rate finder again
learn.unfreeze()
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [None]:
#Fine-tune the unfrozen model for 3 epochs with very small learning rates
learn.unfreeze()
learn.fit_one_cycle(3, max_lr=slice(1e-8, 5e-6), wd=0.1)

In [None]:
#Define interpretation methods for classification models (rebuilt from the current learner)
interp = ClassificationInterpretation.from_learner(learn)
#All losses sorted from highest to lowest, with the index of each corresponding image
losses,idxs = interp.top_losses()
#Sanity check: validation dataset, losses and image indices should all have the same length
len(data.valid_ds)==len(losses)==len(idxs)

In [None]:
#View the 50 highest-loss images again, showing the model's prediction, actual value, loss, and probability
interp.plot_top_losses(50, figsize=[35,35])

In [None]:
#Generate confusion matrix (actual vs predicted classes) to validate the training process
interp.plot_confusion_matrix(figsize=(5,5), dpi=100)

In [None]:
#Save learner object & model weights in Gdrive
learn.export(file=learner_filename) #train and export learner
learn.save(modelWt_filename)

#define folders
save_path = "" #paste
os.makedirs(save_path, exist_ok=True)

#Copy the exported learner (.pkl) and the weights (.pth) to the destination folder
shutil.copy(os.path.join("/content/",learner_filename), save_path)
shutil.copy(os.path.join("/content/models/", modelWt_filename+'.pth'), save_path)

Test trained CNN model using the 10% test dataset

FastAI does not provide direct methods for holdout testing and evaluation. 
As a workaround, load the holdout test set as fastai's validation dataset and run the same standard validation used during CNN training. 

In [None]:
#memory garbage collection; clear virtual memory
import gc #imported explicitly rather than relying on the fastai star imports to expose it

#Drop the reference to the trained learner so its memory can be reclaimed
learn=None
gc.collect()


In [None]:
#Prepare ImageDataBunch for test dataset & load trained CNN and learner objects

bs_val = 64 #batch size for inference; lower it if GPU memory runs out

#Create Databunch
df = pd.read_csv(test_dataset) #load test dataset with holdout images and labels
df_val = df[['bin_GMM','filename']]

#create Imagelist with folder of all images & dataset of filenames and corresponding classes of our test set
img_list = ImageList.from_df(df=df_val, path='/content/data', cols="filename", folder=imagery_folder, suffix = "")
img_list_split = img_list.split_none() #all data on train set
list_label = img_list_split.label_from_df(0) #column 0 of df_val is 'bin_GMM'
list_label.valid = list_label.train #trick where load training dataset as validation dataset
print(list_label) #check what is inside train, validation and test set at moment

#transformations: no augmentation for evaluation, only resize to the training resolution
list_label.transform(tfms=None,size=int(img_res))
data = list_label.databunch(bs=bs_val)
data.normalize(imagenet_stats)

#learner object must be used for inference purposes
#(a freshly built cnn_learner is not needed: load_learner restores the exported learner)
learn = load_learner('/content/', file=learner_filename)
learn.load(modelWt_filename) #load weights of model
learn.data.valid_dl = data.valid_dl # override with inference data with transforms and other..
learn.loss_func = torch.nn.CrossEntropyLoss()
print(learn.metrics) #check which metrics set up (a bare expression mid-cell would display nothing)

interp = ClassificationInterpretation.from_learner(learn, ds_type=DatasetType.Valid) #perform interpretation for validation
interp.plot_confusion_matrix() #matrix representing prediction on holdout test set



In [None]:
#Plot 25 images with high losses & overlay a heatmap to indicate areas CNN considers important for actual nightlight class
interp.plot_top_losses(25, figsize=(25,25), heatmap=True)

In [None]:
#Evaluate overall accuracy of the model 
#Reset the transforms variable and keep a second name for the holdout databunch
tfms = None
data_test = data 

def evaluate_model_from_interp(interp, data):
  """Print predictions vs. labels for an interpretation object and its overall accuracy.

  Args:
    interp: object exposing ``y_true`` (ground-truth class indices) and ``preds``
      (one row of raw class scores per sample), e.g. a ClassificationInterpretation.
    data: object exposing ``classes`` (class names) and ``valid_ds.y`` (labels of
      the evaluated samples), e.g. the databunch the interpretation was built on.

  Returns:
    None. The summary, the first 10 predictions and the accuracy are printed.
  """
  #perform evaluation of model to take a look at predictions vs labels & compute accuracy
  total = len(interp.y_true)
  print(f"Interp has {total} ground truth labels: {interp.y_true}")
  print(f"Interp yielded {len(interp.preds)} raw predictions. First 2 raw predictions are: {interp.preds[:2]}")
  print(f"The problem had {len(data.classes)} classes: {data.classes}")
  print( " ")
  print("Pred --> GroundTruth --> PredLabel --> GroundTruthLabel")

  ok_pred = 0

  for idx, raw_p in enumerate(interp.preds):
    #convert to plain ints so printing and comparison are the same for numpy arrays and tensors
    pred = int(raw_p.argmax())
    truth = int(interp.y_true[idx])
    if idx < 10: #display first 10 predictions and corresponding real labels
      print(f'{pred} --> {truth} = {data.classes[pred]} -> {data.valid_ds.y[idx]}')
    if pred == truth: #count correct predictions
      ok_pred += 1

  #calculate accuracy by correct predictions divided by total predictions;
  #report nan instead of dividing by zero when there is nothing to evaluate
  acc = ok_pred / total if total else float("nan")
  print(f"Overall accuracy of the model: {acc:0.5f}")

#Run the evaluation on the holdout interpretation and its databunch
evaluate_model_from_interp(interp, data)