In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_predict


In [None]:
# Load the labelled sample table (band values plus class label columns) from CSV.
training_csv_path = "./csv/rgb/2020/jan/poligon-500-result.csv"
df = pd.read_csv(training_csv_path)

In [None]:
# Show the table exactly as it was read from the CSV (rendered by the notebook).
df

In [None]:
# Remove the "Label_id" column; "Label_txt" is the column used as the target
# further down.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been dropped.
df = df.drop(columns="Label_id", errors="ignore")
df

In [None]:
# Blank out every cell equal to 0, discard each row that now holds a missing
# value, and renumber the remaining rows from 0.
# NOTE(review): zeros presumably mark no-data samples -- confirm.
df = df.where(df != 0).dropna().reset_index(drop=True)

In [None]:
# Show the table after the zero-containing rows were removed and the index reset.
df

In [None]:
# Summary statistics (pandas `describe`) of the filtered table.
df.describe()

In [None]:
# Number of samples per class label.
label_column = df['Label_txt']
label_column.value_counts()

In [None]:
# Overlay one semi-transparent, 20-bin histogram per band on the same axes.
# Keys are column names, values are legend labels; dict order is drawing order.
band_legend_labels = {
    'R': 'Red',
    'G': 'Green',
    'B': 'Blue',
    'Vre1': 'Vre1',
    'Vre2': 'Vre2',
    'Vre3': 'Vre3',
    'Nir': 'Nir',
    'Swir1': 'Swir1',
    'Swir2': 'Swir2',
    'NNir': 'NNir',
}
for band_column, legend_label in band_legend_labels.items():
    plt.hist(df[band_column], bins=20, alpha=0.7, label=legend_label)

plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Box plot of the table's columns on an 8x6-inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, ax=ax)
plt.show()

In [None]:
# Feature matrix: the ten band columns, in the order the model is trained on.
feature_columns = ['R', 'G', 'B', 'Vre1', 'Vre2', 'Vre3', 'Nir', 'Swir1', 'Swir2', 'NNir']
X = df[feature_columns]

# Target: the textual class label of each sample.
y = df['Label_txt']

In [None]:
# Hold out 20% of the samples; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# --- Disabled experiment: randomized hyper-parameter search (kept for reference) ---
# n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# min_samples_split = [2, 5, 10]
# min_samples_leaf = [1, 2, 4]
# bootstrap = [True, False]
#
# random_grid = {
#     'n_estimators': n_estimators,
#     'max_features': max_features,
#     'max_depth': max_depth,
#     'min_samples_split': min_samples_split,
#     'min_samples_leaf': min_samples_leaf,
#     'bootstrap': bootstrap
# }
#
# rf = RandomForestClassifier()
#
# best_model = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, random_state=42, n_jobs = -1)

# Random forest with library-default hyper-parameters (the candidate values
# below are intentionally left commented out).
# random_state is fixed -- same seed as the train/test split -- so that
# re-running the notebook rebuilds the same forest and the reported metrics
# and the saved model are reproducible.
best_model = RandomForestClassifier(
    # n_estimators=50,
    # max_depth=5,
    # max_leaf_nodes=4,
    # max_features='log2',
    # min_samples_split=2,
    # min_samples_leaf=3,
    random_state=42,
)
# Fit on the training portion only; this fitted object is what later cells use.
best_model.fit(X_train, y_train)

In [None]:
# Predictions for every sample obtained through 10-fold cross-validation.
# NOTE(review): this runs on the full X / y; the X_test / y_test hold-out from
# the split is not used by any visible cell -- confirm that is intended.
predicted_labels = cross_val_predict(estimator=best_model, X=X, y=y, cv=10)

In [None]:
# Class strings and their display names, index-aligned. The class strings are
# the same ones the raster-classification cell compares model output against.
class_names = ['building', 'forest', 'vegetation_type_1', 'vegetation_type_2', 'water']
cm_label = ['Building', 'Forest', 'Vegetation Type 1', 'Vegetation Type 2', 'Water']

# Pass the row/column order explicitly so the tick labels below are guaranteed
# to line up with the matrix, instead of relying on the implicit sorted order
# of whatever labels happen to be present in y.
cm = confusion_matrix(y, predicted_labels, labels=class_names)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=cm_label, yticklabels=cm_label)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Text report comparing the true labels with the cross-validated predictions.
report = classification_report(y_true=y, y_pred=predicted_labels)
print("Classification Report:", report, sep="\n")

In [None]:
from sklearn.metrics import cohen_kappa_score

# Cohen's kappa between the true labels and the cross-validated predictions.
kappa = cohen_kappa_score(y1=y, y2=predicted_labels)
print("Kappa Cohen :", kappa)

In [None]:
import joblib

# Persist the fitted classifier; ".joblib" is appended to the base path.
model_output_file = './model/model8'
model_output_path = model_output_file + '.joblib'
joblib.dump(best_model, model_output_path)

In [None]:
import rasterio
import joblib
import pandas as pd
import numpy as np

# Column name given to each raster band, in band order (band 1 -> 'R', ...,
# band 10 -> 'NNir'). Same names and order as the training feature matrix.
# NOTE(review): the previous per-band variables were named b2, b3, b4, ...,
# which hints that band 1 may be a blue band although its column is called
# 'R'. Harmless if the training CSV used the same ordering -- confirm.
FEATURE_COLUMNS = ['R', 'G', 'B', 'Vre1', 'Vre2', 'Vre3', 'Nir', 'Swir1', 'Swir2', 'NNir']

# Integer code written to the output raster for each predicted class label.
CLASS_CODES = {
    'water': 0,
    'vegetation_type_1': 1,
    'vegetation_type_2': 2,
    'forest': 3,
    'building': 4,
}

output_tif_path = './output/lulc/2_jan_2020.tif'

loaded_model = joblib.load('./model/model8.joblib')

with rasterio.open('./qgis/2020/jan/clipped.tif') as src:
    metadata = src.meta.copy()
    # Read the ten bands in one call -> array shaped (band, row, col).
    bands = src.read([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

band_count, height, width = bands.shape

# One row per pixel (row-major order), one column per band.
# NOTE(review): every pixel is classified, including any all-zero pixels,
# whereas rows containing zeros were removed from the training table.
pixels = pd.DataFrame(bands.reshape(band_count, -1).T, columns=FEATURE_COLUMNS)

# A single batched predict() replaces one DataFrame + predict() per pixel.
# NOTE: this holds the whole pixel table in memory; for very large rasters,
# predict in row blocks instead.
predicted = np.asarray(loaded_model.predict(pixels))

# Translate labels to codes. -1 marks "not mapped yet"; dtype=int is the
# platform default integer, i.e. the same dtype the earlier list-of-ints
# conversion produced, so the output file's dtype is unchanged.
class_raster = np.full(predicted.shape, -1, dtype=int)
for class_name, class_code in CLASS_CODES.items():
    class_raster[predicted == class_name] = class_code

# Previously an unrecognised label was skipped silently, yielding rows of
# unequal length; fail loudly with the offending labels instead.
unmapped = class_raster < 0
if unmapped.any():
    raise ValueError(
        f"Model predicted labels with no raster code: {sorted(set(predicted[unmapped]))}"
    )

new_array = class_raster.reshape(height, width)

# Output is a single-band raster that otherwise reuses the source metadata.
metadata.update({
    'count': 1,
    'dtype': new_array.dtype.name
})

# Create the new GeoTIFF file and write the new array
with rasterio.open(output_tif_path, 'w', **metadata) as dst:
    dst.write(new_array, 1)

print(f"Export {output_tif_path} Finish")