# Генерация датасета

In [23]:
import random
import pandas as pd


def generate_dataset(samples, features):
    """Build a random column-oriented dataset describing pairs of objects.

    For each of ``features`` generated feature names, two columns are
    created (one per object, prefixed ``obj1_`` / ``obj2_``), each holding
    ``samples`` random values of the feature's type. A final ``collision``
    column holds ``samples`` random binary labels.

    Args:
        samples: Number of rows (values per column).
        features: Number of features generated for each of the two objects.

    Returns:
        A dict mapping column name to a list of ``samples`` values.
    """
    first_prefix = "obj1_"
    second_prefix = "obj2_"
    names = feature_name_generator()
    kinds = feature_type_generator()
    columns = {}

    for _ in range(features):
        name = next(names)
        kind = next(kinds)
        first_column = []
        second_column = []
        columns[f"{first_prefix}{name}"] = first_column
        columns[f"{second_prefix}{name}"] = second_column
        # Values are drawn alternately for the two objects, row by row.
        for _ in range(samples):
            first_column.append(get_random_value_by_type(kind))
            second_column.append(get_random_value_by_type(kind))

    columns["collision"] = [
        get_random_value_by_type("binary") for _ in range(samples)
    ]
    return columns


def feature_type_generator():
    """Yield feature type names endlessly, cycling through a fixed order.

    The order is "binary", "ordinal", "numerical", then it repeats.
    """
    types = ("binary", "ordinal", "numerical")
    position = 0
    while True:
        yield types[position]
        # Wrap around to the first type after the last one.
        position = (position + 1) % len(types)

def get_random_value_by_type(type):
    """Return one random value for the given feature type.

    Args:
        type: Feature type name. (The parameter name shadows the builtin
            ``type``; it is kept so existing callers keep working.)

    Returns:
        - "binary": 0 or 1
        - "nominal": one of "A", "B", "C", "D"
        - "ordinal": an integer from 1 to 10 inclusive
        - "numerical": a float drawn by ``random.uniform(0, 1)``
        - any other value: ``None``
    """
    samplers = (
        ("binary", lambda: random.choice([0, 1])),
        ("nominal", lambda: random.choice(["A", "B", "C", "D"])),
        ("ordinal", lambda: random.randint(1, 10)),
        ("numerical", lambda: random.uniform(0, 1)),
    )
    for label, draw in samplers:
        if type == label:
            return draw()
    # Unrecognised type names produce no value.
    return None

def feature_name_generator():
    """Yield an endless sequence of names: feature_1, feature_2, ..."""
    number = 1
    while True:
        yield f"feature_{number}"
        number += 1

    

In [24]:
# Dataset sizes (rows) and feature counts (per object) to generate.
count_of_samples = [30, 100, 500, 1000]
count_of_features = [4, 8, 10]

# Write one file per (samples, features) combination.
# NOTE(review): the ".cvs" extension looks like a typo for ".csv", but the
# loading code reads the same name, so it is kept unchanged here.
for i in count_of_samples:
    for j in count_of_features:
        generated = generate_dataset(i, j)
        table = pd.DataFrame(generated)
        table.to_csv(f"out_{i}_{j}.cvs", index=False)

# Выбор алгоритма

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Four classical classification methods, keyed by display name.
models = {}
models["RandomForest"] = RandomForestClassifier()
models["LogisticRegression"] = LogisticRegression(max_iter=1000)
models["SVM"] = SVC()
models["KNN"] = KNeighborsClassifier()

# The three fastest models (LogReg, DecisionTree, XGBoost).
fast_models = {}
fast_models["LogisticRegression"] = LogisticRegression(max_iter=1000)
fast_models["DecisionTree"] = DecisionTreeClassifier()
fast_models["XGBoost"] = XGBClassifier()


In [26]:
# Load the dataset generated with 1000 samples and 10 features per object.
# NOTE(review): ".cvs" matches the name used when the files were written; it
# looks like a typo for ".csv" — rename in both places together if at all.
df = pd.read_csv('out_1000_10.cvs')

# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,obj1_feature_1,obj2_feature_1,obj1_feature_2,obj2_feature_2,obj1_feature_3,obj2_feature_3,obj1_feature_4,obj2_feature_4,obj1_feature_5,obj2_feature_5,...,obj2_feature_6,obj1_feature_7,obj2_feature_7,obj1_feature_8,obj2_feature_8,obj1_feature_9,obj2_feature_9,obj1_feature_10,obj2_feature_10,collision
0,1,1,10,3,0.251787,0.37389,0,0,5,5,...,0.662943,1,0,10,2,0.896348,0.532328,0,0,1
1,1,0,6,6,0.62339,0.31756,1,0,2,5,...,0.086622,1,1,1,1,0.03006,0.821529,1,1,0
2,0,0,2,9,0.91665,0.891822,0,0,3,8,...,0.34174,0,1,7,5,0.022003,0.860016,1,1,0
3,0,1,4,5,0.525199,0.036235,0,0,10,2,...,0.662518,1,0,4,4,0.510248,0.840521,1,0,1
4,0,1,2,8,0.560464,0.624799,1,1,7,9,...,0.997012,1,0,3,7,0.111026,0.429835,0,1,0


In [34]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Separate the target label from the feature columns.
target_column = "collision"
y = df[target_column]
X = df.drop(columns=target_column)

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Обучающая выборка: {X_train.shape[0]} строк")
print(f"Тестовая выборка: {X_test.shape[0]} строк")

# Accuracy on the test split, keyed by model name.
results = {}
separator = "-" * 50

for name, model in models.items():
    print(f"Обучение модели: {name}")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f"Точность {name}: {accuracy:.2f}")
    print(separator)

# Print the summary of results.
print("Итоговая точность моделей:")
for name, acc in results.items():
    print(f"{name}: {acc:.2f}")



Обучающая выборка: 800 строк
Тестовая выборка: 200 строк
Обучение модели: RandomForest
Точность RandomForest: 0.55
--------------------------------------------------
Обучение модели: LogisticRegression
Точность LogisticRegression: 0.52
--------------------------------------------------
Обучение модели: SVM
Точность SVM: 0.51
--------------------------------------------------
Обучение модели: KNN
Точность KNN: 0.49
--------------------------------------------------
Итоговая точность моделей:
RandomForest: 0.55
LogisticRegression: 0.52
SVM: 0.51
KNN: 0.49


In [35]:
from timeit import Timer
from functools import partial

def forTime():
    """Run one prediction pass over the test split.

    Reads the module-level names ``model`` and ``X_test`` at call time, so it
    measures whichever model the surrounding loop has currently bound.

    Returns:
        The predictions returned by ``model.predict(X_test)``.
    """
    return model.predict(X_test)


# Time 1000 prediction passes for each already-fitted model.
for name, model in models.items():
    timer = Timer(partial(forTime))
    timed_run = timer.timeit(number=1000)
    print(model, " Время: ", timed_run)
    

RandomForestClassifier()  Время:  7.8113569999695756
LogisticRegression(max_iter=1000)  Время:  0.6868933999794535
SVC()  Время:  13.357506399974227
KNeighborsClassifier()  Время:  9.611154599988367
