# In [12]:
import pickle
import numpy as np
from sklearn import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the pickled dataset. The context manager closes the file handle as soon
# as unpickling finishes instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code while deserializing, so
# './data.pickle' must come from a trusted source.
with open('./data.pickle', 'rb') as data_file:
    data_dict = pickle.load(data_file)

# 'data' is a collection of per-sample sequences (their lengths may differ);
# 'labels' is passed alongside it to train_test_split as the targets.
data = data_dict['data']
labels = data_dict['labels']

# Length of the longest sequence; every sample is brought up to this length.
max_sequence_length = max(map(len, data))

# Append zeros (np.pad's default constant mode) to the end of each shorter
# sequence, then stack the equal-length rows into a single array.
padded_rows = []
for seq in data:
    missing = max_sequence_length - len(seq)
    padded_rows.append(np.pad(seq, (0, missing)))
data = np.array(padded_rows)

# Modify the feature extraction and preprocessing steps to match the model's requirements
# Ensure 'data' contains the correct number and order of features

# Hold out 20% of the samples for evaluation. Stratifying on the labels keeps
# the class proportions the same in both splits. No random_state is set here or
# on the classifier, so the split and the reported score vary from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, shuffle=True, stratify=labels
)

model = RandomForestClassifier()

model.fit(x_train, y_train)

y_predict = model.predict(x_test)

# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
score = accuracy_score(y_test, y_predict)

print(f'{score * 100}% of samples were classified correctly!')

# Save the model together with the padding length used during preprocessing.
# Inputs must be padded to this same length before calling predict, otherwise
# their feature count will not match what the forest was trained on.
# The 'model' key is unchanged, so existing readers keep working.
with open('model.p', 'wb') as f:
    pickle.dump({'model': model, 'max_sequence_length': max_sequence_length}, f)


# Output: 97.52066115702479% of samples were classified correctly!
