In [None]:
from PIL import Image, ImageOps
from tensorflow.keras.preprocessing.image import img_to_array, load_img, array_to_img
import os
import glob
import numpy as np
import pickle
import math
import requests
from PIL import Image

def make_dataset(data_dir= "C:/Users/wicm/raw2/", size=28):
    """Build an image dataset from a folder-per-class directory and pickle it.

    Every immediate sub-directory of ``data_dir`` is treated as one class whose
    label is the folder name. Each ``*.png`` inside it is loaded as a
    ``size`` x ``size`` grayscale image, inverted, and converted to an array.

    The result is written to ``thainumber_{size}.pkl`` in the current working
    directory as ``{"X": array of images, "Y": array of folder-name labels}``.

    Args:
        data_dir: Root directory holding one sub-folder per class. A trailing
            path separator is optional.
        size: Edge length, in pixels, that every image is resized to.
    """
    X = []
    Y = []

    # Sorted so the sample order is reproducible; os.listdir/glob return
    # entries in arbitrary, platform-dependent order.
    for folder in sorted(os.listdir(data_dir)):
        class_dir = os.path.join(data_dir, folder)
        if not os.path.isdir(class_dir):
            continue
        for file in sorted(glob.glob(os.path.join(class_dir, "*.png"))):
            # color_mode replaces the deprecated `grayscale=True` keyword.
            img = load_img(file, color_mode="grayscale", target_size=(size, size))
            img = ImageOps.invert(img)
            X.append(img_to_array(img))
            Y.append(folder)

    data = {"X": np.asarray(X), "Y": np.asarray(Y)}
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open("thainumber_{}.pkl".format(size), "wb") as fh:
        pickle.dump(data, fh, protocol=2)

In [None]:
def load_dataset(size = 28):
    """Load the pickled dataset ``thainumber_{size}.pkl`` from the working directory.

    Args:
        size: Image edge length used in the pickle's file name.

    Returns:
        Tuple ``(X, Y)`` taken from the pickle's ``'X'`` and ``'Y'`` entries.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # only load pickles that were produced locally.
    with open("thainumber_{}.pkl".format(size), "rb") as fh:
        data = pickle.load(fh)
    return data['X'], data['Y']

def prepare_input(file, size=28):
    """Load a single image file with the same preprocessing as ``make_dataset``.

    The image is read as grayscale, resized to ``size`` x ``size``, inverted,
    and converted to an array.

    Args:
        file: Path of the image to load.
        size: Edge length in pixels; defaults to 28, the value previously
            hard-coded here.

    Returns:
        The array produced by ``img_to_array`` for the inverted image.
    """
    # color_mode replaces the deprecated `grayscale=True` keyword.
    img = load_img(file, color_mode="grayscale", target_size=(size, size))
    img = ImageOps.invert(img)
    return img_to_array(img)

def img_cloud_dataset(size = 28):
    """Tile every image of the pickled dataset into one grayscale contact sheet.

    Images are laid out left to right, 50 per row, each in a ``size`` x ``size``
    cell, and the sheet is saved as ``cloud_dataset_{size}.png``.

    Args:
        size: Image edge length; also selects which pickle ``load_dataset`` reads.
    """
    per_row = 50
    X, _ = load_dataset(size)
    count = X.shape[0]

    sheet = Image.new('L', (size * per_row, size * math.ceil(count / per_row)))
    for idx in range(count):
        # Row/column of this tile in the grid.
        row, col = divmod(idx, per_row)
        sheet.paste(array_to_img(X[idx]), (col * size, row * size))
    sheet.save("cloud_dataset_{}.png".format(size))

In [None]:
# Build the pickle from the raw image folders, using make_dataset's default data_dir and size.
make_dataset()

In [None]:
# Read the images (X) and their labels (Y) back from the pickle written by make_dataset().
X,Y = load_dataset()

In [None]:
# Inspect the image array returned by load_dataset().
X

In [None]:
# Inspect the label array returned by load_dataset().
Y

In [None]:
# Preview the first image of the dataset on a fixed 0-255 gray scale.
import matplotlib.pyplot as plt
plt.imshow(X[0], cmap='gray', vmin=0, vmax=255)

In [None]:
# Flatten every image into a single row so X becomes a 2-D (samples x features) table.
reshaped_X = X.reshape(len(X), -1)
reshaped_X.shape

In [None]:
# Convert the arrays to DataFrames: Ydf holds the labels, Xdf the flattened images.

import pandas as pd
Ydf = pd.DataFrame(Y)
Xdf = pd.DataFrame(reshaped_X)

In [None]:
# Inspect the feature DataFrame built from reshaped_X.
Xdf

In [None]:
# Inspect the label DataFrame built from Y.
Ydf

In [None]:
# PyCaret: set up a classification experiment on the flattened pixel features.
# Explicit imports instead of `import *`, so it is clear which names come from
# PyCaret and nothing else in the notebook namespace is silently overwritten.
from pycaret.classification import compare_models, setup

clf = setup(reshaped_X, target = Y, train_size = 0.8,
            numeric_imputation = 'median',
            categorical_imputation = 'mode')

In [None]:
# Compare models with 5-fold CV and keep the five best by accuracy.
top5_model = compare_models(sort = 'Accuracy', fold = 5, n_select = 5)

In [None]:
# Scree plot: individual and cumulative explained-variance ratio per component.

from sklearn.decomposition import PCA

# Leaving n_components unset keeps every available component,
# min(n_samples, n_features). Asking for Xdf.shape[1] explicitly raises when
# there are fewer images than pixels.
pca = PCA()
pca.fit(Xdf)  # only the fitted variance ratios are needed, not the projection
explain_ratio = pca.explained_variance_ratio_
explain_ratio_cum = np.cumsum(explain_ratio)
component_idx = range(1, explain_ratio.shape[0] + 1)  # 1-based component numbers

plt.figure(figsize = (15, 100))
ax = plt.axes()
ax.set_facecolor('#dfe3e6')
plt.grid(color = 'w')
plt.xlabel('Number of components')
plt.ylabel('variance explained')
plt.title('Scree plot')

plt.plot(component_idx, explain_ratio, c = 'royalblue', marker = 'o', linewidth = 2.5, label = 'Individual')
plt.plot(component_idx, explain_ratio_cum, c = 'firebrick', marker = 'o', linestyle = '--', label = 'Cumulative')

# Label every marker of both curves with its percentage.
for k, ex_ratio, ex_ratio_cum in zip(component_idx, explain_ratio, explain_ratio_cum):
    ex_ratio_label = f'{ex_ratio * 100:.2f}%'
    plt.annotate(ex_ratio_label, (k, ex_ratio), textcoords = 'offset points',
               xytext = (5, 5), ha = 'center')
    ex_ratio_cum_label = f'{ex_ratio_cum * 100:.2f}%'
    plt.annotate(ex_ratio_cum_label, (k, ex_ratio_cum), textcoords = 'offset points',
               xytext = (5, 5), ha = 'center')

plt.show()

In [None]:
# Standardization: z-score every feature column of Xdf.
X_mean = Xdf.mean()
X_std = Xdf.std()
# Columns with zero standard deviation produce NaN (0/0); those become 0.
Z = Xdf.sub(X_mean).div(X_std).fillna(0)
Z

In [None]:
# Covariance matrix of the standardized features (columns of Z).
c = Z.cov()
c

In [None]:
# Eigenvalues & eigenvectors of the covariance matrix.
# A covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# (in ascending order), whereas the general-purpose eig can return
# complex-typed results for such matrices due to round-off.
eiganvalues, eiganvectors = np.linalg.eigh(c)
print('Eigan Values:\n', eiganvalues)
print('Eigan Values Shape:', eiganvalues.shape)
print('Eigan Vectors Shape:', eiganvectors.shape)

In [None]:
# Explained variance
# Reorder the eigenpairs from largest to smallest eigenvalue.
idx = np.argsort(eiganvalues)[::-1]
eiganvalues, eiganvectors = eiganvalues[idx], eiganvectors[:, idx]

# Cumulative share of the total variance covered by the leading components.
explained_var = eiganvalues.cumsum() / eiganvalues.sum()
explained_var

In [None]:
# Find n for 73% explained variance:
# argmax gives the first index where the cumulative ratio reaches 0.73; +1 turns it into a count.
n_components = np.argmax(explained_var >= 0.73) +1
n_components

In [None]:
# DataFrame back to an image array (kept here for reference only; not used yet)
Xarray = Xdf.to_numpy()
Xinverse = Xarray.reshape((-1, 28, 28))
plt.imshow(Xinverse[0], cmap='gray', vmin=0, vmax=255)

In [None]:
# Apply PCA to the standardized features Z and keep the projected data in X_pca.
# NOTE(review): 123 is presumably the n_components value found above for 73%
# explained variance — confirm, and consider passing that variable instead.
# NOTE(review): the PCA is fitted on all of Z, before any train/test split.

pca = PCA(n_components = 123)
pca.fit(Z)
X_pca = pca.transform(Z)
X_pca
#pca_df = pd.DataFrame(X_pca)
#pca_df

In [None]:
# Re-run the PyCaret setup on the PCA-projected features; this rebinds `clf`.
clf = setup(X_pca, target = Y)

In [None]:
# Compare models again (5-fold CV, top five by accuracy); this rebinds `top5_model`.
top5_model = compare_models(sort = 'Accuracy', fold = 5, n_select = 5)

In [None]:
# Random Forest

import matplotlib.pyplot as plt
import seaborn as sns
from tune_sklearn import TuneSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import datasets
from scipy.stats import randint

In [None]:
# Train/test split: hold out 20% of the PCA features with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, Y, test_size=0.2, random_state=42)

In [None]:
# Inspect the training features.
X_train

In [None]:
# Inspect the held-out features.
X_test

In [None]:
# Inspect the training labels.
y_train

In [None]:
# Inspect the held-out labels.
y_test

In [None]:
# Baseline random forest on the PCA features; this rebinds `clf`.
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Random Forest GridSearch: exhaustive 5-fold search over the grid below.

from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

# Report on the tuned model: predict with the refit best estimator. Reusing
# `y_pred` from the baseline cell would only re-print the untuned forest's report.
y_pred_grid = grid_search.predict(X_test)
print(classification_report(y_test, y_pred_grid))

In [None]:
# Random Forest TuneSearch: 20 trials over the same hyper-parameter space.

# Seeded like the other forests in this notebook so trial scores are repeatable.
model = RandomForestClassifier(random_state=42)
param_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf = TuneSearchCV(model, param_rf, n_trials=20, scoring="accuracy")
rf.fit(X_train, y_train)
print(rf.cv_results_)

In [None]:
# Accuracy of the tuned forest on the held-out split, counted by hand.
pred = rf.predict(X_test)
# Element-wise match between predictions and true labels, summed to a plain int.
correct = int(np.sum(np.asarray(pred) == np.asarray(y_test)))
print("Accuracy:", correct / len(pred))

In [None]:
# Per-class precision/recall/F1 of the TuneSearchCV model on the held-out split.
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

In [None]:
# PCA Random forest

from sklearn.decomposition import PCA
from sklearn import metrics

def n_component_analysis(n,X_train, y_train, X_val, y_val):
    """Fit PCA plus a random forest on the training split and score the validation split.

    Args:
        n: Value passed straight to ``PCA(n_components=n)``.
        X_train: Training features; the PCA is fitted on these only.
        y_train: Training labels.
        X_val: Validation features, projected with the PCA fitted on ``X_train``.
        y_val: Validation labels.

    Returns:
        Validation accuracy as computed by ``metrics.accuracy_score``.
    """
    projector = PCA(n_components=n)
    print("PCA begin with n_components: {}".format(n))
    projector.fit(X_train)

    # Project both splits with the same fitted transform.
    train_proj = projector.transform(X_train)
    val_proj = projector.transform(X_val)

    print('Random Forest')
    forest = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, min_samples_leaf=1)
    forest.fit(train_proj, y_train)
    score = metrics.accuracy_score(y_val, forest.predict(val_proj))

    print("accuracy: {}".format(score))
    return score

# Hold out 20% of the standardized (pre-PCA) features with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(Z, Y, test_size=0.20, random_state=42)

# 15 evenly spaced values from 0.70 to 0.90, each passed on as PCA's n_components.
n_s = np.linspace(0.70, 0.90, num=15)
accuracy = [n_component_analysis(n, train_x, train_y, test_x, test_y) for n in n_s]

plt.figure()
plt.plot(n_s, np.array(accuracy), label='Accuracy vs % variance explained')
plt.legend()
plt.show()