In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
import os
import pandas as pd
import numpy as np

In [22]:
# Folder containing the CSV files
folder_path = "Letters/data"

# Initialize empty list for X (list of 2D arrays) and Y (list of first letters)
X = []
Y = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        # Read the CSV file using pandas
        file_path = os.path.join(folder_path, filename)
        data = pd.read_csv(file_path, header=None)  # header=None if the CSV doesn't have column headers

        # Convert the pandas DataFrame to a numpy array and append to X
        X.append(data.to_numpy()[:, 0:-1])

        # Get the first letter of the filename and append to Y
        Y.append(data.to_numpy()[0, -1])  # Assuming the first letter is the label

# PREPROCESS

# Find the minimum number of rows
min_rows = min(el.shape[0] for el in X)

# Truncate each array to have only `min_rows` rows
truncated_data = [el[:min_rows, :] for el in X]
X = truncated_data

# Convert X to a numpy array
X = np.array(X).astype(float)

# Define different scalers for different columns
scaler_first_column = MinMaxScaler(feature_range=(0, 1))  # Scaling first column to [0, 1]
scaler_other_column = MinMaxScaler(feature_range=(-1, 1))  # Scaling first column to [0, 1]


for i in range(X.shape[0]):
    X[i, :, 0:1] = scaler_first_column.fit_transform(X[i, :, 0:1])
    X[i, :, 1:] = scaler_other_column.fit_transform(X[i, :, 1:])

X = X.reshape(X.shape[0], -1)
Y = np.array(Y).astype(str)

# Print the results
print("X (List of 2D numpy arrays):")
print(X.shape)
    
print("Y (List of first letters):")
print(Y.shape)

X (List of 2D numpy arrays):
(80, 304)
Y (List of first letters):
(80,)


In [23]:
# Split data into train/test
labels = ['c', 'o', 'l', 'u', 'm', 'b', 'i', 'a']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=24, shuffle=True, stratify=Y)

In [None]:
# Hypertraining (SKIP THIS IF IT TAKES TOO LONG)
labels = ['c', 'o', 'l', 'u', 'm', 'b', 'i', 'a']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=24, shuffle=True, stratify=Y)

param_grid = { "criterion" : ["gini", "entropy"], 
	      "min_samples_leaf" : [1, 5, 10], 
	      "min_samples_split" : [2, 4, 10, 12], 
	      "n_estimators": [400, 700, 1000]
}
from sklearn.model_selection import GridSearchCV, cross_val_score
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1, n_jobs=-1)
clf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)
clf.fit(x_train, y_train)
clf.best_params_

In [35]:
# Random Forest
random_forest = RandomForestClassifier(criterion = "gini", 
                                       n_estimators=1000,
				       min_samples_leaf=1,
				       min_samples_split=4,
                                       oob_score=True, 
                                       random_state=1, 
                                       n_jobs=-1)

random_forest.fit(x_train, y_train)
y_prediction = random_forest.predict(x_test)

random_forest.score(x_test, y_test)

print("oob score:", round(random_forest.oob_score_, 4)*100, "%")

predictions = cross_val_predict(random_forest, x_test, y_test, cv=3)
confusion_matrix(y_test, predictions, labels=labels)


oob score: 87.5 %


array([[2, 1, 0, 0, 0, 0, 0, 0],
       [0, 3, 0, 0, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 2],
       [0, 1, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 1, 0, 2, 0],
       [0, 0, 0, 0, 0, 1, 0, 2]])

In [37]:
# Save model
# Here you can replace pickle with joblib or cloudpickle
from pickle import dump
with open("model_v0.pkl", "wb") as f:
    dump(random_forest, f, protocol=5)