In [61]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

UsageError: Line magic function `%` not found.


In [62]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [63]:
X = pd.read_csv('X_train.csv').astype('float64')
y = np.array(pd.read_csv('Y_train.csv')).ravel()

In [64]:
def data_clearing(X):
    bad_columns = X.columns[X.isna().sum() > 10000]
    X = X.drop(columns=bad_columns, axis=0)  # в некоторых колонках слишком много nan, удалим эти колонки
    X = X.drop(X.columns[X.nunique() == 1], axis=1)  # удалим, где только 1 значение
    return X


def correlation(data):
    """Строим матрицу корреляций"""
    plt.figure(figsize=(15, 6))
    sns.heatmap(data.corr(), annot=True)
    plt.show()


def delete_corr_columns(X, value_of_corr=0.9):
    corr_matrix = X.corr().abs()  # получаем таблицу корреляции и модуля корреляции
    upper_tri = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  # создаем маску для верхнего треугольника таблицы
    to_drop = [column for column in upper_tri.columns if any(upper_tri[
                                                                 column] >= value_of_corr)]  # находим колонки, которые имеют корреляцию больше или равную value_of_corr
    X = X.drop(to_drop, axis=1)  # удаляем колонки из DataFrame
    return X


X = data_clearing(X)
#correlation(X)
X = delete_corr_columns(X)
#correlation(X)

In [65]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, shuffle=True)

In [66]:
def standartization_data(X_train, X_valid):
    scaler = StandardScaler()
    X_train_norm = scaler.fit_transform(X_train)
    X_valid_norm = scaler.transform(X_valid)
    return X_train_norm, X_valid_norm


def min_max_scaler(X_train, X_valid):
    scaler = MinMaxScaler()
    X_train_min_max = scaler.fit_transform(X_train)
    X_test_min_max = scaler.transform(X_valid)
    return X_train_min_max, X_test_min_max


X_train_norm, X_valid_norm = standartization_data(X_train, X_valid)
X_train_min_max, X_valid_min_max = min_max_scaler(X_train, X_valid)

In [67]:
def metrics(y_valid, y_pred):
    mse = mean_squared_error(y_valid, y_pred)
    mae = mean_absolute_error(y_valid, y_pred)
    r2 = r2_score(y_true=y_valid, y_pred=y_pred)

    # Выводим значения метрик
    print("MSE:", mse)
    print("MAE:", mae)
    print("R2 score:", r2)

In [68]:
def plot_residuals(y_valid, y_pred):
    residuals = y_valid - y_pred

    # Строим график остатков
    plt.scatter(y_pred, residuals)
    plt.title("Residual plot")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.axhline(y=0, color='r', linestyle='-')
    plt.show()


def plot_true_vs_pred(y_valid, y_pred):
    plt.scatter(y_valid, y_pred)
    plt.title("True vs predicted values")
    plt.xlabel("True values")
    plt.ylabel("Predicted values")
    plt.plot([y_valid.min(), y_valid.max()], [y_valid.min(), y_valid.max()], 'r--')
    plt.show()

In [69]:
#LinearRegression
def Linear_Regression(X_train, X_valid, y_train, y_valid):
    linear_regressor = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=-1)
    linear_regressor.fit(X_train, y_train)

    y_pred = linear_regressor.predict(X_valid)

    metrics(y_valid=y_valid, y_pred=y_pred)


Linear_Regression(X_train, X_valid, y_train, y_valid)
Linear_Regression(X_train_norm, X_valid_norm, y_train, y_valid)
Linear_Regression(X_train_min_max, X_valid_min_max, y_train, y_valid)

MSE: 0.003629757185708279
MAE: 0.036961861958909825
R2 score: 0.4146579576709387
MSE: 0.003629757185708281
MAE: 0.036961861958910006
R2 score: 0.4146579576709384
MSE: 0.003629757185708282
MAE: 0.03696186195891003
R2 score: 0.41465795767093816


In [70]:
#Tree
def Tree(X_train, X_valid, y_train, y_valid):
    parameters = {'max_depth': range(3, 15, 1), 'min_samples_split': range(2, 9, 2), 'min_samples_leaf': range(1, 8, 2)}
    tree = DecisionTreeRegressor()
    tree_grid = GridSearchCV(tree, param_grid=parameters, cv=3, n_jobs=-1)
    tree_grid.fit(X_train, y_train)

    y_pred = tree_grid.best_estimator_.predict(X_valid)

    metrics(y_valid=y_valid, y_pred=y_pred)

print('Initial')
Tree(X_train, X_valid, y_train, y_valid)
print('Normired')
Tree(X_train_norm, X_valid_norm, y_train, y_valid)
print('MinMaxScaler')
Tree(X_train_min_max, X_valid_min_max, y_train, y_valid)

Initial
MSE: 0.00022930256444625305
MAE: 0.010305776273156791
R2 score: 0.963022201068232
Normired
MSE: 0.00022869910601888861
MAE: 0.010301936122499999
R2 score: 0.9631195160042627
MinMaxScaler
MSE: 0.00022609746896562008
MAE: 0.010534004631613761
R2 score: 0.9635390612984094


In [71]:
#RandomForest
def RandomForest(X_train, X_valid, y_train, y_valid):
    parameters = {'max_depth': range(2, 15, 2), 'n_estimators': range(10, 100, 10), 'min_samples_split': range(2, 9, 2),
                  'min_samples_leaf': range(1, 8, 2)}
    forest = RandomForestRegressor()
    forest_grid = RandomizedSearchCV(forest, param_distributions=parameters, cv=3, n_jobs=-1)
    forest_grid.fit(X_train, y_train)

    y_pred = forest_grid.best_estimator_.predict(X_valid)

    metrics(y_valid=y_valid, y_pred=y_pred)


print('Initial')
RandomForest(X_train, X_valid, y_train, y_valid)
print('Normired')
RandomForest(X_train_norm, X_valid_norm, y_train, y_valid)
print('MinMaxScaler')
RandomForest(X_train_min_max, X_valid_min_max, y_train, y_valid)

Initial
MSE: 0.00018313524050947173
MAE: 0.009692133458152957
R2 score: 0.9704672378294856
Normired
MSE: 0.00017662609649488986
MAE: 0.00959571864208091
R2 score: 0.971516915660915
MinMaxScaler
MSE: 0.000180513024752151
MAE: 0.00964036281366034
R2 score: 0.9708901016873935


In [72]:
#GradientBoosting
def Gradient_Boost(X_train, X_valid, y_train, y_valid):
    parameters = {'n_estimators': range(80, 140, 10),
                  'max_depth': range(4, 14, 2),
                  'criterion': ['friedman_mse', 'squared_error'],
                  'learning_rate': np.arange(0.09, 0.12, 0.005),
                  'min_samples_leaf': range(2, 14, 2),
                  'min_samples_split': range(2, 14, 2),
                  'random_state': [0]}
    grad_boost = GradientBoostingRegressor()
    grad_boost_grid = RandomizedSearchCV(grad_boost, param_distributions=parameters, cv=3, n_jobs=-1)
    grad_boost_grid.fit(X_train, y_train)

    y_pred = grad_boost_grid.best_estimator_.predict(X_valid)

    metrics(y_valid=y_valid, y_pred=y_pred)

print('Initial')
Gradient_Boost(X_train, X_valid, y_train, y_valid)
print('Normired')
Gradient_Boost(X_train_norm, X_valid_norm, y_train, y_valid)
print('MinMaxScaler')
Gradient_Boost(X_train_min_max, X_valid_min_max, y_train, y_valid)

Initial
MSE: 0.0001460088524250124
MAE: 0.008746320301586124
R2 score: 0.9764543148469854
Normired
MSE: 0.0001515734888792323
MAE: 0.008870382044893807
R2 score: 0.975556950229937
MinMaxScaler
MSE: 0.0001527941663386823
MAE: 0.008887858645514807
R2 score: 0.97536010129471


In [73]:
import torch
import torch.nn as nn


class SimpleNet(nn.Module):
    def __init__(self, n_hidden_neurons):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(21, n_hidden_neurons)
        #self.act1 = nn.Sigmoid()
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(n_hidden_neurons, n_hidden_neurons // 2)
        self.act2 = nn.ReLU()
        self.fc3 = nn.Linear(n_hidden_neurons // 2, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)
        x = self.act2(x)
        x = self.fc3(x)
        return x


model = SimpleNet(300)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [74]:
X_train_tensor = torch.tensor(X_train_norm, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape((8690, 1))
X_valid_tensor = torch.tensor(X_valid_norm, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid, dtype=torch.float32).reshape((2173, 1))

In [75]:
num_epochs = 10000

for epoch in range(num_epochs):
    optimizer.zero_grad()

    y_pred_tensor = model.forward(X_train_tensor)
    train_loss_val = criterion(y_pred_tensor, y_train_tensor)

    y_valid_pred_tensor = model.forward(X_valid_tensor)
    valid_loss_val = criterion(y_valid_pred_tensor, y_valid_tensor)

    train_loss_val.backward()

    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss_val:.4f}, Valid Loss: {valid_loss_val:.4f}')

Epoch 1/10000, Training Loss: 42.1746, Valid Loss: 42.2241
Epoch 11/10000, Training Loss: 2.1602, Valid Loss: 2.1584
Epoch 21/10000, Training Loss: 1.4798, Valid Loss: 1.4794
Epoch 31/10000, Training Loss: 0.4600, Valid Loss: 0.4614
Epoch 41/10000, Training Loss: 0.1548, Valid Loss: 0.1416
Epoch 51/10000, Training Loss: 0.1123, Valid Loss: 0.1018
Epoch 61/10000, Training Loss: 0.0892, Valid Loss: 0.0821
Epoch 71/10000, Training Loss: 0.0731, Valid Loss: 0.0685
Epoch 81/10000, Training Loss: 0.0625, Valid Loss: 0.0601
Epoch 91/10000, Training Loss: 0.0537, Valid Loss: 0.0518
Epoch 101/10000, Training Loss: 0.0461, Valid Loss: 0.0448
Epoch 111/10000, Training Loss: 0.0407, Valid Loss: 0.0397
Epoch 121/10000, Training Loss: 0.0361, Valid Loss: 0.0355
Epoch 131/10000, Training Loss: 0.0322, Valid Loss: 0.0320
Epoch 141/10000, Training Loss: 0.0289, Valid Loss: 0.0289
Epoch 151/10000, Training Loss: 0.0260, Valid Loss: 0.0262
Epoch 161/10000, Training Loss: 0.0236, Valid Loss: 0.0239
Epoch 

KeyboardInterrupt: 

In [None]:
def mae(pred, target):
    abs = torch.abs(pred - target)
    return abs.mean()


def r_squared(y_pred, y_true):
    return r2_score(y_true.detach().numpy(), y_pred.detach().numpy())

In [None]:
print(criterion(model.forward(X_valid_tensor), y_valid_tensor))
print(mae(model.forward(X_valid_tensor), y_valid_tensor))
print(r_squared(model.forward(X_valid_tensor), y_valid_tensor))