# Identifying deforestation using CNNs

This notebook uses a variety of deep learning techniques, including transfer learning, to classify satellite images of the Amazon rainforest.

## Preparing the data

We are given a folder with 40,000+ satellite images and a corresponding CSV file that lists, for each image name, its space-separated tags.

In [66]:
import numpy as np
from sklearn.datasets import load_files
from keras.utils import np_utils
from glob import glob
import pandas as pd
import re

def load_data_files(path):
    """Return the paths of all files found under the sub-folders of ``path``.

    Parameters
    ----------
    path : str
        Container directory whose sub-folders hold the files
        (the layout expected by ``sklearn.datasets.load_files``).

    Returns
    -------
    numpy.ndarray
        1-D array of file paths, in the order ``load_files`` reports them.
    """
    # Only the paths are needed here. Without load_content=False, load_files
    # would read the raw bytes of every file into memory before we discard them.
    data = load_files(path, load_content=False)
    return np.array(data['filenames'])

def add_numerical(df):
    """Add an ``id`` column holding the digits found in each file's name.

    The id is taken from the filename only (directory components and the
    file extension are ignored), so digits elsewhere in the path such as
    ``../data2/`` or ``.mp4`` cannot leak into it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``filepath`` column of path strings.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place, with an ``id`` column of
        digit strings (empty string when the filename contains no digits).
    """
    ids = []
    for path in df['filepath']:
        # Accept both '/' and '\\' separators so the result does not depend
        # on the platform the file listing was produced on.
        filename = re.split(r"[\\/]", path)[-1]
        # Drop the extension before stripping non-digits.
        stem = filename.rsplit(".", 1)[0]
        ids.append(re.sub("[^0-9]", "", stem))

    # A plain list is assigned positionally, matching the row order above.
    df['id'] = ids
    return df
    
# Build the image index: one row per training file, keyed by the numeric id
# embedded in its filename.
total_files = load_data_files('../data/train')
df = pd.DataFrame({'filepath': total_files})
df = add_numerical(df)
# Ids come back as digit strings; anything unparsable becomes NaN instead of raising.
df['id'] = pd.to_numeric(df['id'], errors='coerce')
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
df = df.sort_values('id', ascending=True).set_index('id')

csv_data = pd.read_csv('../data/labels.csv')

# Attach labels by image name rather than by index position, so the result
# does not depend on the CSV rows happening to be ordered by image id.
# The image name is the file's basename without its extension.
df['image_name'] = df['filepath'].str.extract(r'([^/\\]+)\.[^./\\]+$', expand=False)
df['tags'] = df['image_name'].map(csv_data.set_index('image_name')['tags'])

print(df.tail())


                                      filepath   image_name  \
id                                                            
40474  ../data/train/train-jpg/train_40474.jpg  train_40474   
40475  ../data/train/train-jpg/train_40475.jpg  train_40475   
40476  ../data/train/train-jpg/train_40476.jpg  train_40476   
40477  ../data/train/train-jpg/train_40477.jpg  train_40477   
40478  ../data/train/train-jpg/train_40478.jpg  train_40478   

                                                tags  
id                                                    
40474                                  clear primary  
40475                                         cloudy  
40476                      agriculture clear primary  
40477                 agriculture clear primary road  
40478  agriculture cultivation partly_cloudy primary  


