In [1]:
import numpy as np
import time
import csv
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

def load_data(embeddings_file, labels_file):
    """Read feature vectors and integer labels from two text files.

    Args:
        embeddings_file: Path to a tab-separated file. On every line the
            first field is skipped (presumably a sample identifier - confirm
            against the data) and the remaining fields are parsed as floats.
            Lines that contain only one field after stripping whitespace
            (including blank lines) are ignored.
        labels_file: Path to a CSV file. The first column of every non-empty
            row is parsed as an int; other columns are ignored.

    Returns:
        A ``(embeddings, labels)`` tuple of NumPy arrays built from the
        parsed rows, in file order.

    Raises:
        ValueError: If a feature field or a label cannot be converted to a
            number.
    """
    with open(embeddings_file, 'r') as emb_handle:
        split_lines = (raw.strip().split('\t') for raw in emb_handle)
        vectors = [
            [float(value) for value in fields[1:]]
            for fields in split_lines
            if len(fields) > 1
        ]

    with open(labels_file, 'r') as lbl_handle:
        # csv.reader yields [] for blank lines; those are skipped.
        targets = [int(record[0]) for record in csv.reader(lbl_handle) if record]

    return np.array(vectors), np.array(targets)

def calculate_metrics(y_true, y_pred):
    """Compute macro-averaged classification metrics and a confusion matrix.

    The label set is the sorted union of the labels found in ``y_true`` and
    ``y_pred``, so a prediction of a class that never occurs in ``y_true``
    is counted (as a false positive) instead of raising ``KeyError``.

    Args:
        y_true: 1-D sequence/array of ground-truth labels.
        y_pred: 1-D sequence/array of predicted labels, same length as
            ``y_true``.

    Returns:
        dict with keys:
            'precision': unweighted mean of per-class precision.
            'recall': unweighted mean of per-class recall.
            'f1_score': harmonic mean of the macro precision and macro
                recall above (note: this is NOT the mean of per-class F1
                scores).
            'accuracy': fraction of samples on the confusion-matrix diagonal.
            'confusion_matrix': integer array of shape
                (n_classes, n_classes); rows are true classes, columns are
                predicted classes, both in sorted label order.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )
    if y_true.shape[0] == 0:
        raise ValueError("y_true and y_pred must not be empty")

    # Union of labels: a class seen only in the predictions still needs a
    # row/column in the confusion matrix.
    classes = np.unique(np.concatenate([y_true, y_pred]))
    n_classes = len(classes)

    # `classes` is sorted and contains every label, so searchsorted returns
    # the exact index of each label.
    true_idx = np.searchsorted(classes, y_true)
    pred_idx = np.searchsorted(classes, y_pred)

    cm = np.zeros((n_classes, n_classes), dtype=int)
    # np.add.at accumulates correctly when the same (row, col) pair repeats.
    np.add.at(cm, (true_idx, pred_idx), 1)

    tp = np.diag(cm)
    predicted_totals = cm.sum(axis=0)  # tp + fp per class
    actual_totals = cm.sum(axis=1)     # tp + fn per class

    # Classes with an empty denominator contribute 0 to the macro average.
    per_class_precision = np.divide(
        tp, predicted_totals, out=np.zeros(n_classes), where=predicted_totals > 0
    )
    per_class_recall = np.divide(
        tp, actual_totals, out=np.zeros(n_classes), where=actual_totals > 0
    )

    precision = per_class_precision.sum() / n_classes
    recall = per_class_recall.sum() / n_classes
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = np.trace(cm) / np.sum(cm)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
        'confusion_matrix': cm
    }

In [2]:
# Read the train and test splits (features + integer labels) from disk.
X_train, y_train = load_data('train_data.tsv', 'train.csv')
X_test, y_test = load_data('test_data.tsv', 'test.csv')

# Standardise features. The scaler is fitted on the training split only and
# the same fitted statistics are then applied to both splits, so nothing is
# estimated from the test data.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [3]:
# Experiment: how the solver iteration cap (max_iter) affects test-set quality
# and training time of an RBF-kernel SVC.
max_iter_values = [250, 500, 1000, 2000, 4000, 8000]
kernel = 'rbf'

# One dict per max_iter value, in the order the values are tried.
results = []

for max_iter in max_iter_values:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    model = SVC(kernel=kernel, max_iter=max_iter, random_state=42)
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time  # covers fit() only

    # Evaluation is done on the test split and is not included in the timing.
    y_pred = model.predict(X_test)
    metrics = calculate_metrics(y_test, y_pred)

    result = {
        'max_iter': max_iter,
        'training_time': training_time,
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1_score': metrics['f1_score'],
        'accuracy': metrics['accuracy']
    }
    results.append(result)

    print(f"max_iter={max_iter:4d}: "
            f"Accuracy={metrics['accuracy']:.4f}, "
            f"F1={metrics['f1_score']:.4f}, "
            f"Precision={metrics['precision']:.4f}, "
            f"Recall={metrics['recall']:.4f}, "
            f"Time={training_time:.2f}с")

max_iter= 250: Accuracy=0.5455, F1=0.5638, Precision=0.5834, Recall=0.5455, Time=14.30с
max_iter= 500: Accuracy=0.6270, F1=0.6528, Precision=0.6807, Recall=0.6270, Time=27.14с
max_iter=1000: Accuracy=0.6795, F1=0.6936, Precision=0.7082, Recall=0.6795, Time=59.01с
max_iter=2000: Accuracy=0.7559, F1=0.7599, Precision=0.7638, Recall=0.7559, Time=108.47с
max_iter=4000: Accuracy=0.8471, F1=0.8497, Precision=0.8522, Recall=0.8471, Time=187.43с
max_iter=8000: Accuracy=0.8893, F1=0.8894, Precision=0.8895, Recall=0.8893, Time=291.68с


In [4]:
import random

def drop_dimensions(X, drop_ratio, random_seed=42):
    """Return a copy of ``X`` with a random subset of its columns removed.

    The columns to drop are chosen with a private ``random.Random`` instance
    seeded with ``random_seed``, so the selection is reproducible and the
    module-level ``random`` generator is left untouched. Calling the function
    on two arrays with the same number of columns, the same ``drop_ratio``
    and the same seed removes the same columns from both.

    Args:
        X: 2-D NumPy array of shape (n_samples, n_dimensions).
        drop_ratio: Fraction of columns to remove; the number removed is
            ``int(n_dimensions * drop_ratio)`` (truncated toward zero).
        random_seed: Seed for the column selection.

    Returns:
        Array of the kept columns in their original order. If every column
        would be dropped, an all-zero array of shape (n_samples, 1) is
        returned instead of an empty one.

    Raises:
        ValueError: If ``drop_ratio`` is negative enough that the number of
            columns to drop is below zero (raised by ``random.sample``).
    """
    n_dim = X.shape[1]
    n_drop = int(n_dim * drop_ratio)

    if n_drop >= n_dim:
        return np.zeros((X.shape[0], 1))

    # A local generator yields the same sample as seeding the global one,
    # without resetting random state for the rest of the notebook.
    rng = random.Random(random_seed)
    dropped = set(rng.sample(range(n_dim), n_drop))  # set -> O(1) membership
    keep_indices = [i for i in range(n_dim) if i not in dropped]

    return X[:, keep_indices]

# Iteration cap used for every SVC fit in the dimension-dropping experiment.
# NOTE(review): the max_iter sweep output shows accuracy still rising at 8000,
# so 1000 looks like a speed/quality compromise rather than an optimum - confirm.
optimal_max_iter = 1000

# Fractions of feature columns to discard, doubling each step:
# 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64.
drop_ratios = [0.01 * 2 ** step for step in range(7)]

In [6]:
# Experiment: how removing a random fraction of feature columns affects
# test-set quality and training time. One dict per drop ratio.
results = []

for drop_ratio in drop_ratios:
    # The same seed is used for both splits so that identical columns are
    # removed from the training and the test features.
    X_train_reduced = drop_dimensions(X_train, drop_ratio, random_seed=43)
    X_test_reduced = drop_dimensions(X_test, drop_ratio, random_seed=43)

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    model = SVC(kernel='rbf', max_iter=optimal_max_iter, random_state=42)
    model.fit(X_train_reduced, y_train)
    training_time = time.perf_counter() - start_time  # covers fit() only

    y_pred = model.predict(X_test_reduced)
    metrics = calculate_metrics(y_test, y_pred)

    result = {
        'drop_ratio': drop_ratio,
        'dimensions': X_train_reduced.shape[1],  # columns left after dropping
        'training_time': training_time,
        'accuracy': metrics['accuracy'],
        'f1_score': metrics['f1_score'],
        'precision': metrics['precision'],
        'recall': metrics['recall']
    }
    results.append(result)

    print(f"Отброшено {drop_ratio*100:3.0f}% размерностей: "
            f"Accuracy={metrics['accuracy']:.4f}, "
            f"F1={metrics['f1_score']:.4f}, "
            f"Precision={metrics['precision']:.4f}, "
            f"Recall={metrics['recall']:.4f}, "
            f"Time={training_time:.2f}с")

Отброшено   1% размерностей: Accuracy=0.6859, F1=0.6893, Precision=0.6927, Recall=0.6859, Time=54.77с
Отброшено   2% размерностей: Accuracy=0.7093, F1=0.7096, Precision=0.7099, Recall=0.7093, Time=53.30с
Отброшено   4% размерностей: Accuracy=0.7076, F1=0.7113, Precision=0.7150, Recall=0.7076, Time=51.70с
Отброшено   8% размерностей: Accuracy=0.7014, F1=0.7076, Precision=0.7139, Recall=0.7014, Time=59.19с
Отброшено  16% размерностей: Accuracy=0.6672, F1=0.6732, Precision=0.6792, Recall=0.6672, Time=49.43с
Отброшено  32% размерностей: Accuracy=0.6776, F1=0.6829, Precision=0.6883, Recall=0.6776, Time=45.09с
Отброшено  64% размерностей: Accuracy=0.5804, F1=0.5918, Precision=0.6036, Recall=0.5804, Time=33.14с
