In [None]:
from google.colab import files
import zipfile
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib

# Folder that receives the extracted archive contents
data_dir = 'data'

# Upload the archive via google.colab and take the first uploaded entry's name
uploaded = files.upload()
archive_name = list(uploaded)[0]

# Unpack the archive into the data folder
with zipfile.ZipFile(archive_name, 'r') as zip_ref:
    zip_ref.extractall(path=data_dir)

def load_lattice_data(lattice_dir, label):
    """Load one KDE matrix per lattice sub-folder of ``lattice_dir``.

    Every sub-directory of ``lattice_dir`` is treated as one lattice sample.
    Each ``.csv`` file inside it contributes one column of the sample matrix:
    the first CSV column is read (first line used as header), the first data
    row is dropped and the remaining values are converted to float.

    Directory entries are visited in sorted name order because
    ``os.listdir`` returns them in arbitrary order; without sorting, the
    column layout of each matrix (and the order of the samples) would differ
    between runs and machines. Entries that are not ``.csv`` files are
    ignored.

    Args:
        lattice_dir: Directory containing one sub-directory per lattice.
        label: Class label attached to every sample found in ``lattice_dir``.

    Returns:
        A ``(lattice_data, lattice_labels)`` tuple of two equally long lists:
        the 2-D matrices (one column per CSV file) and the matching labels.

    Raises:
        ValueError: If a lattice sub-directory contains no CSV files, or if
            its files yield columns of different lengths
            (raised by ``np.column_stack``).
    """
    lattice_data = []
    lattice_labels = []
    for lattice_num in sorted(os.listdir(lattice_dir)):
        lattice_path = os.path.join(lattice_dir, lattice_num)
        if not os.path.isdir(lattice_path):
            continue

        kde_data = []
        for filename in sorted(os.listdir(lattice_path)):
            filepath = os.path.join(lattice_path, filename)
            # Skip stray entries (OS metadata files, nested folders, ...).
            if not (os.path.isfile(filepath) and filename.endswith('.csv')):
                continue
            # header=0 already consumes the header line; [1:] additionally
            # drops the first data row.
            # NOTE(review): confirm the extra skip is intended - it is kept
            # unchanged so the feature length stays the same as before.
            kde_values = pd.read_csv(filepath, header=0).iloc[:, 0].values[1:]
            kde_data.append(kde_values.astype(float))

        if not kde_data:
            raise ValueError(f"No CSV files found in lattice folder {lattice_path}")

        lattice_data.append(np.column_stack(kde_data))
        lattice_labels.append(label)
    return lattice_data, lattice_labels

# Load the samples of each lattice type (sub-folder of data_dir -> class label)
nacl_dir = os.path.join(data_dir, '7x7x7')
un2_dir = os.path.join(data_dir, 'UN2')
u2n3_dir = os.path.join(data_dir, 'U2N3')

lattice_data_7x7x7, labels_7x7x7 = load_lattice_data(nacl_dir, 'NaCl')
lattice_data_UN2, labels_UN2 = load_lattice_data(un2_dir, 'UN2')
lattice_data_U2N3, labels_U2N3 = load_lattice_data(u2n3_dir, 'U2N3')

# Combine all classes: X is a list of matrices, y the matching 1-D label array
X = [*lattice_data_7x7x7, *lattice_data_UN2, *lattice_data_U2N3]
y = np.concatenate([
    np.asarray(labels_7x7x7),
    np.asarray(labels_UN2),
    np.asarray(labels_U2N3),
])

def pad_matrix(matrix, max_columns):
    """Return ``matrix`` zero-padded on the right to ``max_columns`` columns.

    Rows are left untouched; only trailing columns are appended.
    """
    missing_columns = max_columns - matrix.shape[1]
    row_padding = (0, 0)
    column_padding = (0, missing_columns)
    return np.pad(matrix, (row_padding, column_padding), mode='constant')

# Largest number of ions (matrix columns) found in a single lattice
column_counts = [lattice.shape[1] for lattice in X]
max_ions = max(column_counts)

# Pad every matrix to the same number of columns
X_padded = [pad_matrix(lattice, max_ions) for lattice in X]

# Flatten the matrices
def matrix_to_vector(matrix):
    """Return a 1-D copy of ``matrix`` with its elements in row-major order."""
    flat = matrix.flatten(order='C')
    return flat

# Flatten every padded matrix and stack the vectors into one feature array
X_flattened = np.array([matrix_to_vector(padded) for padded in X_padded])

# Split into training and test sets (20% held out, stratified by label)
X_train, X_test, y_train, y_test = train_test_split(
    X_flattened,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Create and train the baseline random forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set and report the accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Hyperparameter search over forest size and tree depth (5-fold CV)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
)
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

print(f"–õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã: {grid_search.best_params_}")

# Evaluate the estimator found by the grid search
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Best Accuracy: {accuracy_best:.2f}")

# Other models for comparison: RBF-kernel SVM
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.2f}")

# Other models for comparison: 3-nearest-neighbours
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {accuracy_knn:.2f}")

# Save the grid-search model to disk
joblib.dump(best_model, 'kde_classifier.pkl')


Saving Archive.zip to Archive.zip
Accuracy: 1.00
Лучшие параметры: {'max_depth': None, 'n_estimators': 50}
Best Accuracy: 1.00
SVM Accuracy: 1.00
KNN Accuracy: 1.00


['kde_classifier.pkl']

In [None]:
import os
import pandas as pd
import numpy as np
import joblib

# Path to the already-unpacked data folder
new_data_dir = '/content/new_data'  # or another path where your data is located

def load_lattice_data(lattice_dir):
    """Build the KDE matrix of a single lattice from the CSV files in a folder.

    Each regular ``.csv`` file in ``lattice_dir`` contributes one column: the
    first CSV column is read (first line used as header), the first data row
    is dropped and the rest is converted to float. Progress and read errors
    are printed.

    Files are processed in sorted name order because ``os.listdir`` returns
    names in arbitrary order; sorting makes the column layout of the matrix
    reproducible.

    Args:
        lattice_dir: Folder holding the per-ion CSV files of one lattice.

    Returns:
        A 2-D array with one column per successfully read CSV file.

    Raises:
        ValueError: If no CSV file could be read, or if the files yield
            columns of different lengths (raised by ``np.column_stack``).
    """
    kde_data = []
    for ion_file in sorted(os.listdir(lattice_dir)):
        filepath = os.path.join(lattice_dir, ion_file)
        if not (os.path.isfile(filepath) and ion_file.endswith('.csv')):
            continue
        try:
            # Read the CSV file with a header and skip the first data row
            kde_values = pd.read_csv(filepath, header=0).iloc[:, 0].values[1:].astype(float)
        except Exception as e:
            # Deliberate best-effort: report the unreadable file and go on.
            # NOTE: a skipped file removes one column from the matrix.
            print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ —á—Ç–µ–Ω–∏–∏ —Ñ–∞–π–ª–∞ {filepath}: {e}")
            continue
        kde_data.append(kde_values)
        print(f"–£—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω —Ñ–∞–π–ª: {ion_file}, —Ä–∞–∑–º–µ—Ä: {len(kde_values)}")

    if not kde_data:
        raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω–æ –Ω–∏ –æ–¥–Ω–æ–≥–æ –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ–≥–æ —Ñ–∞–π–ª–∞ –≤ –ø–∞–ø–∫–µ {lattice_dir}")

    kde_matrix = np.column_stack(kde_data)
    print(f"–°–æ–∑–¥–∞–Ω–∞ –º–∞—Ç—Ä–∏—Ü–∞ —Ä–∞–∑–º–µ—Ä–æ–º: {kde_matrix.shape}")
    return kde_matrix

# Path to the specific lattice folder
lattice_dir = os.path.join(new_data_dir, '21')

# Check that the folder exists
if not os.path.exists(lattice_dir):
    print(f"–ü–∞–ø–∫–∞ {lattice_dir} –Ω–µ —Å—É—â–µ—Å—Ç–≤—É–µ—Ç!")
    # List the base folder only when it exists itself; otherwise os.listdir
    # would raise FileNotFoundError in the middle of this diagnostic output.
    if os.path.isdir(new_data_dir):
        print("–î–æ—Å—Ç—É–ø–Ω—ã–µ –ø–∞–ø–∫–∏ –≤ new_data:")
        for item in os.listdir(new_data_dir):
            item_path = os.path.join(new_data_dir, item)
            if os.path.isdir(item_path):
                print(f"  üìÅ {item}")
                # Show sub-folders
                for subitem in os.listdir(item_path):
                    subitem_path = os.path.join(item_path, subitem)
                    if os.path.isdir(subitem_path):
                        print(f"    üìÇ {subitem}")
else:
    print(f"–ü—É—Ç—å –∫ –ø–∞–ø–∫–µ —Å –¥–∞–Ω–Ω—ã–º–∏: {lattice_dir}")
    print(f"–°–ø–∏—Å–æ–∫ —Ñ–∞–π–ª–æ–≤ –≤ –ø–∞–ø–∫–µ: {os.listdir(lattice_dir)[:10]}...")  # Show the first 10 files

    # Load the data of the new lattice
    try:
        kde_matrix = load_lattice_data(lattice_dir)
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∑–∞–≥—Ä—É–∑–∫–µ –¥–∞–Ω–Ω—ã—Ö: {e}")
        raise

    # Load the saved model.
    # NOTE: joblib.load unpickles the file - only load model files you trust.
    model = joblib.load('kde_classifier.pkl')

    def pad_matrix(matrix, max_columns):
        """Return ``matrix`` zero-padded on the right to ``max_columns`` columns."""
        padded_matrix = np.pad(matrix, ((0, 0), (0, max_columns - matrix.shape[1])), mode='constant')
        return padded_matrix

    # Number of columns (ions) every training sample was padded to.
    # Each training vector is a flattened (rows x max_ions) matrix, so the
    # value is recovered from the feature count the fitted model recorded.
    # 343 is kept as the fallback for models without `n_features_in_` or
    # when the row count does not divide the feature count.
    n_rows = kde_matrix.shape[0]
    n_features = getattr(model, 'n_features_in_', None)
    if n_features is not None and n_rows > 0 and n_features % n_rows == 0:
        max_ions = n_features // n_rows
    else:
        max_ions = 343

    # Pad the matrix to the training width
    kde_matrix_padded = pad_matrix(kde_matrix, max_ions)
    print(f"–ú–∞—Ç—Ä–∏—Ü–∞ –ø–æ—Å–ª–µ –¥–æ–ø–æ–ª–Ω–µ–Ω–∏—è: {kde_matrix_padded.shape}")

    # Flatten the matrix
    def matrix_to_vector(matrix):
        """Return a flattened 1-D copy of ``matrix``."""
        return matrix.flatten()

    kde_vector = matrix_to_vector(kde_matrix_padded)
    kde_vector = kde_vector.reshape(1, -1)  # Convert to a 2-D array with a single row
    print(f"–í–µ–∫—Ç–æ—Ä –¥–ª—è –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏: {kde_vector.shape}")

    # Predict the lattice type
    predicted_label = model.predict(kde_vector)
    prediction_proba = model.predict_proba(kde_vector)

    print(f"–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–π —Ç–∏–ø —Ä–µ—à–µ—Ç–∫–∏: {predicted_label[0]}")
    print(f"–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏:")

    # One line per class with its predicted probability
    for class_name, prob in zip(model.classes_, prediction_proba[0]):
        print(f"  {class_name}: {prob:.4f} ({prob*100:.2f}%)")

Путь к папке с данными: /content/new_data/21
Список файлов в папке: ['kde_array_777_Cl_0.84807903_0.15876380_0.65043946.csv', 'kde_array_777_Na_0.47827458_0.00000000_0.33950642.csv', 'kde_array_777_Na_0.69308471_0.98859405_0.15734638.csv', 'kde_array_777_Na_0.49747574_0.83338360_0.50582287.csv', 'kde_array_777_Cl_0.78789024_0.33203705_0.82587566.csv', 'kde_array_777_Na_0.16351575_0.84692486_0.49599104.csv', 'kde_array_777_Na_0.33624272_0.47048582_0.31558941.csv', 'kde_array_777_Cl_0.83805043_0.88304947_1.02850603.csv', 'kde_array_777_Na_0.17387392_0.85082347_0.16254473.csv', 'kde_array_777_Cl_0.32857575_0.34173389_0.00000000.csv']...
Успешно загружен файл: kde_array_777_Cl_0.84807903_0.15876380_0.65043946.csv, размер: 999
Успешно загружен файл: kde_array_777_Na_0.47827458_0.00000000_0.33950642.csv, размер: 999
Успешно загружен файл: kde_array_777_Na_0.69308471_0.98859405_0.15734638.