<a href="https://colab.research.google.com/github/riyaa14/ML_regression_classification/blob/main/Land_Cover_Classification2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp '/content/drive/MyDrive/Land_Cover_Data/dataset.zip' '/content/'
!unzip -q '/content/dataset.zip' -d '/content/dataset'
import os

dataset_path = '/content/dataset/2750'
train_path = '/content/train'
test_path = '/content/test'

LABELS = os.listdir(dataset_path)
print(LABELS)

['Residential', 'SeaLake', 'Pasture', 'River', 'AnnualCrop', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Forest', 'PermanentCrop']


In [3]:
# creating test and train directories
for path in (train_path, test_path):
    if not os.path.exists(path):
        os.mkdir(path)

# creating subdirectories (labels) in train and test directories
for l in LABELS:

    if not os.path.exists(os.path.join(train_path, l)):
        os.mkdir(os.path.join(train_path, l))

    if not os.path.exists(os.path.join(test_path, l)):
        os.mkdir(os.path.join(test_path, l))

In [21]:
img_paths = {}

for l in LABELS:
  IMG = os.listdir(dataset_path + '/' + l)
  for i in IMG:
    img_paths.update({os.path.join(dataset_path, l, i): l})

import pandas as pd
import shutil
from tqdm import tqdm
import re
from sklearn.model_selection import StratifiedShuffleSplit

X = pd.Series(list(img_paths.keys()))
y = pd.get_dummies(pd.Series(img_paths.values())) # one hot encoding of labels

# splitting the data
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=69)
for train_idx, test_idx in split.split(X, y):

    train_paths = X[train_idx]
    test_paths = X[test_idx]
    train_paths = list(train_paths)
    test_paths = list(test_paths)


    # defining a new path for each image depending on training or testing.
    # original file path eg: /content/dataset/2750/AnnualCrop/AnnualCrop_1063.jpg
    # after modification eg: /content/train/AnnualCrop/AnnualCrop_1063.jpg
    new_train_paths = [re.sub('dataset/2750', 'train', i) for i in train_paths]
    new_test_paths = [re.sub('dataset/2750', 'test', i) for i in test_paths]

    train_path_map = list((zip(train_paths, new_train_paths)))
    test_path_map = list((zip(test_paths, new_test_paths)))

    # moving files in test and train folders
    print("moving training files..")
    for i in tqdm(train_path_map):
        if not os.path.exists(i[1]):
            if not os.path.exists(re.sub('train', 'test', i[1])):
                shutil.copy(i[0], i[1])

    print("moving testing files..")
    for i in tqdm(test_path_map):
        if not os.path.exists(i[1]):
            if not os.path.exists(re.sub('train', 'test', i[1])):
                shutil.copy(i[0], i[1])

moving training files..


100%|██████████| 20196/20196 [00:03<00:00, 5515.31it/s]


moving testing files..


100%|██████████| 6732/6732 [00:01<00:00, 3792.65it/s]


In [25]:
# processing image data using keras ImageDataGenerator
from keras.preprocessing.image import ImageDataGenerator

rf_gen = ImageDataGenerator(rescale=1./255)

rf_train_gen = rf_gen.flow_from_directory(
    directory='/content/train',
    target_size=(64, 64),
    batch_size=20196,
    class_mode='categorical',
    color_mode='rgb',
    shuffle=False,
    seed=7
)

rf_test_gen = rf_gen.flow_from_directory(
    directory='/content/test',
    target_size=(64, 64),
    batch_size=6732,
    class_mode='categorical',
    color_mode='rgb',
    shuffle=False,
    seed=7
)

train = rf_train_gen.next()
test = rf_test_gen.next()

X_train = train[0].reshape(20196, 12288) # reshape is for onversion from 4D to 2D array
X_test = test[0].reshape(6732, 12288)

y_train = train[1]
y_test = test[1]

X_train

Found 20196 images belonging to 10 classes.
Found 6732 images belonging to 10 classes.


array([[0.32941177, 0.39607847, 0.45882356, ..., 0.5921569 , 0.49803925,
        0.54901963],
       [0.5529412 , 0.43529415, 0.4431373 , ..., 0.48235297, 0.45098042,
        0.43921572],
       [0.70980394, 0.5529412 , 0.5137255 , ..., 0.18431373, 0.27450982,
        0.29803923],
       ...,
       [0.12156864, 0.21568629, 0.30980393, ..., 0.11764707, 0.21176472,
        0.32156864],
       [0.21176472, 0.2784314 , 0.31764707, ..., 0.21176472, 0.2901961 ,
        0.3254902 ],
       [0.4039216 , 0.3921569 , 0.427451  , ..., 0.37647063, 0.37647063,
        0.4156863 ]], dtype=float32)

In [44]:
from sklearn import ensemble
from sklearn.metrics import accuracy_score
import numpy as np

rf_clf = ensemble.RandomForestClassifier(n_estimators=30, n_jobs=-1, random_state=42)

# Randomly sampling indices to create a mini-batch
ids = np.random.choice(len(X_train), size=20196, replace=False)

# Creating a mini-batch
X_mini_batch = X_train[ids]
y_mini_batch = rf_train_gen.classes[ids]

# Performing training on the mini-batch
rf_clf.fit(X_mini_batch, y_mini_batch)

y_pred = rf_clf.predict(X_test)
acc = accuracy_score(y_pred, rf_test_gen.classes)
print("Accuracy Score: {0:.4}".format(acc))

Accuracy Score: 0.6344


## Hyperparameter Tuning


In [43]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [25, 30],
    'max_depth': [None, 20, 10, 15]
}

rf_clf = ensemble.RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

for epoches in range(5):
    ids = np.random.choice(len(X_train), size=1000, replace=False)
    X_mini_batch = X_train[ids]
    y_mini_batch = rf_train_gen.classes[ids]

    grid_search.fit(X_mini_batch, y_mini_batch)
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

# Prediction
best_rf_clf = grid_search.best_estimator_
y_pred = best_rf_clf.predict(X_test)
acc = accuracy_score(y_pred, rf_test_gen.classes)
print("Accuracy Score: {0:.4}".format(acc))

Best Hyperparameters: {'max_depth': 10, 'n_estimators': 30}
Best Hyperparameters: {'max_depth': 10, 'n_estimators': 30}
Best Hyperparameters: {'max_depth': 10, 'n_estimators': 30}
Best Hyperparameters: {'max_depth': 15, 'n_estimators': 30}
Best Hyperparameters: {'max_depth': 15, 'n_estimators': 30}
Accuracy Score: 0.5107
