# Установка библиотек

In [20]:
#!pip install numpy scipy matplotlib scikit-learn pandas

# Подключение библиотек

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, recall_score
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from category_encoders import CountEncoder
import gower

# Собственная реализация kNN классификатора

In [22]:
class KNNClassifier:
    def __init__(self, k=1, distance=None):
        self.k = k
        if distance is None:
            self.distance = lambda x, y: np.sqrt(np.sum((x - y) ** 2, axis=1))
        else:
            self.distance = distance
    
    def fit(self, X, y):
        self.X = np.array(X, dtype=np.float64)
        self.y = np.array(y)
        return self
    
    def predict_one_point(self, x):
        x = np.array(x, dtype=np.float64)
        dists = self.distance(self.X, x)
        k_nearest_indexes = np.argsort(dists)[:self.k]
        k_nearest_answers = self.y[k_nearest_indexes]
        unique_answers, counts = np.unique(k_nearest_answers, return_counts=True)
        return unique_answers[np.argmax(counts)]
    
    def predict(self, X):
        X = np.array(X, dtype=np.float64)
        return np.array([self.predict_one_point(x) for x in X])


# Собственная реализация kNN регрессии

In [None]:
class KNNRegressor:
    def __init__(self, k=1, metric='euclidean', distance=None):
        self.k = k
        self.metric = metric
        self.distance = distance 

    def fit(self, X, y):
        # Приведение к float64 — критично для Gower и стабильности
        if self.metric == 'gower':
            self.X = np.asarray(X, dtype=np.float64)
        else:
            self.X = np.asarray(X, dtype=np.float64)  # используем float64 всегда для регрессии
        self.y = np.asarray(y, dtype=np.float64)
        self.n_features_ = self.X.shape[1] if self.X.ndim > 1 else 1
        return self

    def _euclidean_dist(self, X_train, x):
        x = np.asarray(x, dtype=np.float64).ravel()
        # X_train: (n, d), x: (d,) → broadcasting
        diff = X_train - x
        return np.linalg.norm(diff, axis=1)  # надёжнее и чище, чем ручной sqrt(sum(...))

    def _gower_dist(self, X_train, x):
        x = np.asarray(x, dtype=np.float64).ravel()
        x_2d = x.reshape(1, -1)
        # Убедимся, что X_train — float64 (на случай, если fit не сработал как надо)
        X_train = np.asarray(X_train, dtype=np.float64)
        dist_matrix = gower.gower_matrix(X_train, x_2d)
        return dist_matrix[:, 0]

    def predict_one_point(self, x):
        x = np.asarray(x)
        if x.ndim > 1:
            x = x.ravel()
        elif x.ndim == 0:
            x = x.reshape(1)
        
        if self.distance is not None:
            dists = self.distance(self.X, x)
        elif self.metric == 'euclidean':
            dists = self._euclidean_dist(self.X, x)
        elif self.metric == 'gower':
            dists = self._gower_dist(self.X, x)
        else:
            raise ValueError(f"Неизвестная метрика: {self.metric}. Допустимые: 'euclidean', 'gower'.")

        k_nearest_indexes = np.argsort(dists)[:self.k]
        k_nearest_answers = self.y[k_nearest_indexes]
        return np.mean(k_nearest_answers)

    def predict(self, X):
        X = np.asarray(X)
        # Корректное приведение к 2D: (n,) → (n, 1), (n, d) → (n, d)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        elif X.ndim == 0:
            X = X.reshape(1, 1)
        # Убедимся, что ширина совпадает с обученной моделью
        if X.shape[1] != self.X.shape[1]:
            raise ValueError(f"Ожидалось {self.X.shape[1]} признаков, получено {X.shape[1]}")
        return np.array([self.predict_one_point(x) for x in X])

# Проверка классификации (Базовый вариант)

In [None]:
df = pd.read_csv('classification.csv')

classification_X = df.drop('Bankrupt?', axis=1)
classification_y = df['Bankrupt?']

classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(
    classification_X, classification_y, test_size=0.2, random_state=42, stratify=classification_y
)

k = 5
sk_clf = KNeighborsClassifier(n_neighbors=k)
sk_clf.fit(classification_X_train, classification_y_train)
classification_y_pred_sk = sk_clf.predict(classification_X_test)

my_clf = KNNClassifier(k=k)
my_clf.fit(classification_X_train, classification_y_train)
classification_y_pred_my = my_clf.predict(classification_X_test)

print(f"Sklearn accuracy={accuracy_score(classification_y_test, classification_y_pred_sk):.4f}, f1={f1_score(classification_y_test, classification_y_pred_sk, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(classification_y_test, classification_y_pred_my):.4f}, f1={f1_score(classification_y_test, classification_y_pred_my, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_my, pos_label=1)}")

Sklearn accuracy=0.9663, f1=0.9511, recall=0.0
Custom  accuracy=0.9663, f1=0.9511, recall=0.0


# Проверка регрессии (Базовый вариант)

In [None]:
df = pd.read_csv('regression.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

regression_X = df.drop(columns=['salary_in_usd', 'salary', 'salary_currency'], axis=1)
regression_y = df['salary_in_usd'].to_numpy()

regression_X = pd.get_dummies(regression_X, drop_first=True).to_numpy(dtype=np.float32)

regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(
    regression_X, regression_y, test_size=0.2, random_state=42
)

sk_reg = KNeighborsRegressor(n_neighbors=k)
sk_reg.fit(regression_X_train, regression_y_train)
regression_y_pred_sk = sk_reg.predict(regression_X_test)

my_reg = KNNRegressor(k=k)
my_reg.fit(regression_X_train, regression_y_train)
regression_y_pred_my = my_reg.predict(regression_X_test)

print(f"SkLearn RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_sk)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_sk):.4f}")
print(f"Custom  RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_my)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_my):.4f}")

SkLearn RMSE=48040.7303, R2=0.3978
Custom  RMSE=45113.8160, R2=0.4690


# Улучшенный вариант с предобработкой данных

## Классификация

### Скейлинг

In [None]:
scaler = StandardScaler()
classification_X_train_scaled = scaler.fit_transform(classification_X_train)
classification_X_test_scaled = scaler.transform(classification_X_test)

### PCA

In [None]:
pca = PCA(n_components=0.95, svd_solver='full')
classification_X_train_pca = pca.fit_transform(classification_X_train_scaled)
classification_X_test_pca = pca.transform(classification_X_test_scaled)

### sklearn

In [None]:
k = 5
sk_clf = KNeighborsClassifier(n_neighbors=k)
sk_clf.fit(classification_X_train_pca, classification_y_train)
classification_y_pred_sk = sk_clf.predict(classification_X_test_pca)

### Оценка качества sklearn

In [None]:
print(f"Sklearn accuracy={accuracy_score(classification_y_test, classification_y_pred_sk):.4f}")
print(f"f1={f1_score(classification_y_test, classification_y_pred_sk, average='weighted'):.4f}")
print(f"recall={recall_score(classification_y_test, classification_y_pred_sk, pos_label=1)}")

Sklearn accuracy=0.9670
f1=0.9582
recall=0.13636363636363635


### Собственная реализация

In [None]:
k = 5
my_clf = KNNClassifier(k=k)
my_clf.fit(classification_X_train_pca, classification_y_train)
classification_y_pred_my = my_clf.predict(classification_X_test_pca)

### Оценка качества собственная реализация

In [None]:
print(f"Custom accuracy={accuracy_score(classification_y_test, classification_y_pred_my):.4f}")
print(f"f1={f1_score(classification_y_test, classification_y_pred_my, average='weighted'):.4f}")
print(f"recall={recall_score(classification_y_test, classification_y_pred_my, pos_label=1)}")

Custom accuracy=0.9670
f1=0.9582
recall=0.13636363636363635


## Регрессия

In [None]:
df = pd.read_csv('regression.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

regression_X = df.drop(columns=['salary_in_usd', 'salary', 'salary_currency'], axis=1)
regression_y = df['salary_in_usd']

### Разделим признаки по типам

In [33]:
num_cols = ['work_year', 'remote_ratio']
cat_cols = ['employment_type', 'job_title', 
            'employee_residence', 'company_location', 'company_size', 'experience_level']

### Подготовка препроцессора

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', CountEncoder(), cat_cols),
    ],
    sparse_threshold=0 
)
regression_X_preprocessed = preprocessor.fit_transform(regression_X)

### Разделение на выборки

In [None]:
regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(
    regression_X_preprocessed, regression_y, test_size=0.2, random_state=42
)

### sklearn

In [None]:
k = 5
sk_r = KNeighborsRegressor(n_neighbors=k)
sk_r.fit(regression_X_train, regression_y_train)
regression_y_pred_sk = sk_r.predict(regression_X_test)

### Оценка результата

In [None]:
print(f"Sklearn RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_sk)):.4f}")
print(f"R2={r2_score(regression_y_test, regression_y_pred_sk):.4f}")

Sklearn RMSE=45863.0356
R2=0.4512


### Собственная реализация

In [None]:
# === Подготовка данных для Gower-регрессии (повторяем загрузку, если нужно) ===
df = pd.read_csv('regression.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

regression_X = df.drop(columns=['salary_in_usd', 'salary', 'salary_currency'], axis=1)
regression_y = df['salary_in_usd']

num_cols = ['work_year', 'remote_ratio']
cat_cols = ['employment_type', 'job_title', 
            'employee_residence', 'company_location', 'company_size', 'experience_level']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', CountEncoder(), cat_cols),
    ],
    remainder='passthrough',
    sparse_threshold=0 
)
regression_X_preprocessed = preprocessor.fit_transform(regression_X)

regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(
    regression_X_preprocessed, regression_y, test_size=0.2, random_state=42
)

# === КЛЮЧЕВОЕ ИСПРАВЛЕНИЕ: приведение к float64 для Gower ===
regression_X_train = regression_X_train.astype(np.float64)
regression_X_test = regression_X_test.astype(np.float64)

# === Обучение и предсказание ===
k = 5
my_r = KNNRegressor(k=k, metric='gower')
my_r.fit(regression_X_train, regression_y_train)
regression_y_pred_my = my_r.predict(regression_X_test)

print(f"Custom RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_my)):.4f}")
print(f"R2={r2_score(regression_y_test, regression_y_pred_my):.4f}")

Custom RMSE=46821.2283
R2=0.4280


### Оценка результата

In [None]:
print(f"Custom RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_my)):.4f}")
print(f"R2={r2_score(regression_y_test, regression_y_pred_my):.4f}")

Custom RMSE=46821.2283
R2=0.4280
