In [8]:
import pickle
import numpy as np

def custom_train_test_split(data, labels, test_size=0.2, random_state=None):
    if random_state:
        np.random.seed(random_state)

    # Shuffle indices
    indices = np.arange(len(data))
    np.random.shuffle(indices)

    # Calculate the number of samples for the test set
    test_samples = int(len(data) * test_size)

    # Split indices into training and testing sets
    test_indices, train_indices = indices[:test_samples], indices[test_samples:]

    # Create training and testing sets
    x_train, x_test = data[train_indices], data[test_indices]
    y_train, y_test = labels[train_indices], labels[test_indices]

    return x_train, x_test, y_train, y_test

# Load data
data_dict = pickle.load(open('./data.pickle', 'rb'))
data = np.asarray(data_dict['data'])
labels = np.asarray(data_dict['labels'])

# Convert labels to integers
label_mapping = {label: idx for idx, label in enumerate(np.unique(labels))}
int_labels = np.array([label_mapping[label] for label in labels])

# Split the data
x_train, x_test, y_train, y_test = custom_train_test_split(data, int_labels, test_size=0.2, random_state=42)

# Your model training code goes here

# For demonstration, printing the shapes of the resulting sets
print("Training set shapes - X: {}, Y: {}".format(x_train.shape, y_train.shape))
print("Testing set shapes - X: {}, Y: {}".format(x_test.shape, y_test.shape))


Training set shapes - X: (800, 42), Y: (800,)
Testing set shapes - X: (200, 42), Y: (200,)


In [10]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Load data
data_dict = pickle.load(open('./data.pickle', 'rb'))
data = np.asarray(data_dict['data'])
labels = np.asarray(data_dict['labels'])

# Convert labels to integers
label_mapping = {label: idx for idx, label in enumerate(np.unique(labels))}
int_labels = np.array([label_mapping[label] for label in labels])

# Use StratifiedKFold for splitting
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in stratified_kfold.split(data, int_labels):
    x_train, x_test = data[train_index], data[test_index]
    y_train, y_test = int_labels[train_index], int_labels[test_index]

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

# Print training accuracy
train_predictions = model.predict(x_train)
train_accuracy = accuracy_score(train_predictions, y_train)
print('Training Accuracy: {}%'.format(train_accuracy * 100))

# Make predictions on the test set
y_predict = model.predict(x_test)

# Evaluate the model
test_accuracy = accuracy_score(y_predict, y_test)
print('Testing Accuracy: {}%'.format(test_accuracy * 100))

# Save the model
with open('model.p', 'wb') as f:
    pickle.dump({'model': model}, f)


Training Accuracy: 98.5%
Testing Accuracy: 98.5%
