In [1]:
import pandas as pd

# Load the training table; the path is relative to the notebook's working directory.
data = pd.read_csv("data/Train.csv")
# Show the first rows as a quick sanity check of the loaded columns.
data.head()

Unnamed: 0,ID,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,...,Harv_method,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre,Yield
0,ID_GTFAC7PEVWQ9,Nalanda,Noorsarai,45,40,TractorPlough FourWheelTracRotavator,2022-07-20,5,Manual_PuddledRandom,2022-06-27,...,machine,2022-11-16,,2022-11-16,machine,30,40,plowed_in_soil,0.3125,600
1,ID_TK40ARLSPOKS,Nalanda,Rajgir,26,26,WetTillagePuddling TractorPlough FourWheelTrac...,2022-07-18,5,Manual_PuddledRandom,2022-06-20,...,hand,2022-11-25,3.0,2022-12-24,machine,24,10,plowed_in_soil,0.3125,600
2,ID_1FJY2CRIMLZZ,Gaya,Gurua,10,10,TractorPlough FourWheelTracRotavator,2022-06-30,6,Manual_PuddledRandom,2022-06-20,...,hand,2022-12-12,480.0,2023-01-11,machine,30,10,plowed_in_soil,0.148148,225
3,ID_I3IPXS4DB7NE,Gaya,Gurua,15,15,TractorPlough FourWheelTracRotavator,2022-06-16,6,Manual_PuddledRandom,2022-06-17,...,hand,2022-12-02,240.0,2022-12-29,hand,26,10,plowed_in_soil,0.222222,468
4,ID_4T8YQWXWHB4A,Nalanda,Noorsarai,60,60,TractorPlough WetTillagePuddling,2022-07-19,4,Manual_PuddledRandom,2022-06-21,...,machine,2022-11-30,,2022-12-02,machine,24,40,plowed_in_soil,0.46875,550


In [2]:
# Separate the regression target ('Yield') from the remaining feature columns.
X = data.drop(columns=['Yield'])
y = data['Yield']

### Разделим данные на train/val/test (70/20/10), причём в двух версиях: с урезанным набором признаков и с полным

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set, then take 2/9 of the remaining 90%
# as validation (2/9 * 0.9 = 0.2), which gives a 70/20/10 train/val/test split.
train_data, test_data, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
train_data, val_data, y_train, y_val = train_test_split(train_data, y_train, test_size=2./9, random_state=42)

train_indices = train_data.index
val_indices = val_data.index
test_indices = test_data.index

# Reduced feature set: numeric columns with no missing values in the TRAINING split.
# The column list is chosen once, on train only, and reused for val/test so that all
# three frames are guaranteed to have identical columns in identical order.
reduced_columns = train_data.select_dtypes(include=['number']).dropna(axis=1).columns
train_reduced = train_data[reduced_columns]
val_reduced = val_data[reduced_columns]
test_reduced = test_data[reduced_columns]

# A column that is complete in train may still have gaps in val/test; impute those
# with the training means so no statistics leak from the held-out splits.
train_means = train_reduced.mean()
train_reduced = train_reduced.fillna(train_means)
val_reduced = val_reduced.fillna(train_means)
test_reduced = test_reduced.fillna(train_means)

# Full-feature versions of the splits (no column filtering).
train = train_data
val = val_data
test = test_data

# Make sure the reduced and full versions refer to the same rows ...
assert train_reduced.index.equals(train.index)
assert val_reduced.index.equals(val.index)
assert test_reduced.index.equals(test.index)

# ... and that every reduced split exposes exactly the same features.
assert list(val_reduced.columns) == list(train_reduced.columns)
assert list(test_reduced.columns) == list(train_reduced.columns)


### Пишем кастомную модель линейной регрессии с L1 и L2-регуляризацией

In [4]:
import numpy as np

class LinearRegressionElasticNet:
    """Linear regression with a combined L1/L2 penalty, trained by batch gradient descent.

    ``fit`` standardises the features and the target (zero mean, unit population
    standard deviation) and optimises in that standardised space. The update
    uses the gradient of the mean squared error plus
    ``lam * (l1_ratio * sign(w) + (1 - l1_ratio) * w)``; the bias is not penalised.
    ``predict`` applies the stored feature scaling and maps the result back to
    the original target scale.

    Parameters
    ----------
    epsilon : float
        Training stops early once the Euclidean norm of a weight update is
        smaller than this value.
    max_steps : int
        Maximum number of gradient-descent iterations.
    w0 : array-like of shape (n_features,) or None
        Initial weights in standardised space. ``None`` means zeros.
    b0 : float or None
        Initial bias in standardised space. ``None`` means ``0.0``.
    alpha : float
        Gradient-descent step size (learning rate).
    l1_ratio : float
        Weight of the L1 term in the penalty; ``1 - l1_ratio`` weights the L2 term.
    lam : float
        Overall regularisation strength.
    """

    def __init__(self, epsilon=1e-6, max_steps=10000, w0=None, b0=None, alpha=1e-5, l1_ratio=0.5, lam=1e-2):
        self.epsilon = epsilon
        self.max_steps = max_steps
        self.w0 = w0
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.lam = lam
        self.w = None
        self.b0 = b0
        self.b = None

    def fit(self, X, y):
        """Fit the model on ``X`` (n_samples, n_features) and ``y`` (n_samples,).

        Returns ``self``. Raises ``ValueError`` if ``w0`` does not match the number
        of features or if a NaN appears in the weight gradient.
        """
        X = self._scale_features(X)
        y = self._scale_target(y)

        n_samples, n_features = X.shape

        # Start from local copies of the initial values. The constructor arguments
        # w0/b0 are never overwritten, so the same instance can be re-fitted on data
        # with a different number of features.
        if self.w0 is None:
            w = np.zeros(n_features)
        else:
            w = np.array(self.w0, dtype=float)
            if w.shape != (n_features,):
                raise ValueError(
                    f"w0 has shape {w.shape}, expected ({n_features},)"
                )
        b = 0.0 if self.b0 is None else float(self.b0)

        for _ in range(self.max_steps):
            residuals = np.dot(X, w) + b - y

            grad_loss_w = (2 / n_samples) * np.dot(X.T, residuals)
            grad_reg_w = self.lam * (self.l1_ratio * np.sign(w) + (1 - self.l1_ratio) * w)
            # Clip to keep a diverging run from overflowing; NaNs pass through
            # np.clip unchanged and are reported explicitly below.
            gradient_w = np.clip(grad_loss_w + grad_reg_w, -1e5, 1e5)
            if np.isnan(gradient_w).any():
                raise ValueError("NaN detected in gradients. Try reducing the learning rate.")

            gradient_b = (2 / n_samples) * np.sum(residuals)
            w_new = w - self.alpha * gradient_w
            b_new = b - self.alpha * gradient_b

            # Convergence is judged on the weight update only (the bias is ignored).
            converged = np.linalg.norm(w_new - w) < self.epsilon
            w, b = w_new, b_new
            if converged:
                break

        self.w = w
        self.b = b

        # Public aliases of the scaling statistics used by predict() and get_b().
        self.X_mean = self.X_mean_
        self.X_std = self.X_std_
        self.y_mean = self.y_mean_
        self.y_std = self.y_std_

        return self

    def predict(self, X):
        """Return predictions on the original target scale as a NumPy array.

        Raises ``RuntimeError`` (a subclass of ``Exception``) if called before ``fit``.
        """
        if self.w is None or self.b is None:
            raise RuntimeError('Model has not been trained yet')

        # Positional arrays: columns must be in the same order as during fit.
        X = np.asarray(X, dtype=float)
        X_scaled = (X - self.X_mean) / self.X_std
        y_pred_scaled = np.dot(X_scaled, self.w) + self.b
        return y_pred_scaled * self.y_std + self.y_mean

    def get_b(self):
        """Return the intercept expressed in the original (unscaled) feature/target units."""
        return self.b * self.y_std + self.y_mean - np.dot(self.X_mean / self.X_std, self.w * self.y_std)

    def _scale_features(self, X):
        """Standardise ``X`` column-wise and remember the mean/std used.

        Columns with zero standard deviation are divided by 1 instead, so constant
        features become all-zero rather than NaN.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        self.X_mean_ = X.mean(axis=0)
        self.X_std_ = X.std(axis=0)
        self.X_std_[self.X_std_ == 0] = 1.0
        return (X - self.X_mean_) / self.X_std_

    def _scale_target(self, y):
        """Standardise ``y`` and remember the mean/std used (std of 0 is replaced by 1)."""
        # Flatten so a column-shaped target cannot broadcast against the predictions.
        y = np.asarray(y, dtype=float).ravel()
        self.y_mean_ = np.mean(y)
        self.y_std_ = np.std(y)
        if self.y_std_ == 0:
            self.y_std_ = 1.0
        return (y - self.y_mean_) / self.y_std_


In [5]:
# Baseline run of the custom model on the reduced (numeric, complete) feature set.
custom_model = LinearRegressionElasticNet(
    max_steps=10000,  # upper bound on gradient-descent iterations
    alpha=1e-3,      # gradient-descent step size
    l1_ratio=0.005,  # weight of the L1 term; the penalty is almost purely L2 here
    lam=1e-2        # overall regularisation strength
)

# fit() returns the model itself, so the cell output is the model's repr.
custom_model.fit(train_reduced, y_train)


<__main__.LinearRegressionElasticNet at 0x2ae7411d250>

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Predictions of the baseline custom model on the held-out test split.
y_pred = custom_model.predict(test_reduced)

def regression_report(y_true, y_pred):
    """Build a two-column DataFrame ('Metric', 'Value') of regression metrics.

    Metrics, in order: MAE, MSE, RMSE, R² and MAPE (in percent).

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.

    Returns
    -------
    pandas.DataFrame
        One row per metric.

    Notes
    -----
    MAPE divides by ``y_true``; any zero target makes it ``inf`` or ``nan``.
    """
    # Compute MSE once and derive RMSE from it. This avoids the `squared=False`
    # argument, which newer scikit-learn releases deprecate and then remove.
    # The two are equivalent for a single-output target.
    mse = mean_squared_error(y_true, y_pred)
    report = {
        'Metric': ['MAE', 'MSE', 'RMSE', 'R²', 'MAPE'],
        'Value': [
            mean_absolute_error(y_true, y_pred),
            mse,
            np.sqrt(mse),
            r2_score(y_true, y_pred),
            np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        ]
    }
    return pd.DataFrame(report)

# Test-set metrics for the baseline custom model; the bare name displays the table.
report_df = regression_report(y_test, y_pred)
report_df



Unnamed: 0,Metric,Value
0,MAE,182.988153
1,MSE,657641.85919
2,RMSE,810.951206
3,R²,0.274668
4,MAPE,186.49768


In [None]:
import optuna
from sklearn.metrics import mean_absolute_error

def objective_custom(trial):
    """Optuna objective: validation MSE of the custom model for one sampled configuration.

    Samples the step size (``alpha``), the L1 share of the penalty (``l1_ratio``)
    and the iteration budget (``steps``), fits on the reduced training split and
    scores on the reduced validation split. The value returned is a mean squared
    error, to be minimised.
    """
    sampled = {
        'alpha': trial.suggest_float('alpha', 1e-6, 1e-2),
        'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 0.01),
        'max_steps': trial.suggest_int('steps', 1000, 11000),
    }

    candidate = LinearRegressionElasticNet(**sampled)
    candidate.fit(train_reduced, y_train)
    val_mse = mean_squared_error(y_val, candidate.predict(val_reduced))
    return val_mse

# Seed the sampler so that the hyperparameter search gives the same trials on
# every Restart & Run All.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_custom, n_trials=100)  # number of trials to run

print("Best parameters:", study.best_params)
print("Best MSE:", study.best_value)

best_alpha = study.best_params['alpha']
best_l1_ratio = study.best_params['l1_ratio']
best_steps = study.best_params['steps']

# Refit with the best configuration on the training split. `lam` is not part of
# the search and stays at the class default.
best_model_custom = LinearRegressionElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio, max_steps=best_steps)
best_model_custom.fit(train_reduced, y_train)

# Final, one-off evaluation on the held-out test split.
y_test_pred = best_model_custom.predict(test_reduced)
report_df1 = regression_report(y_test, y_test_pred)
report_df1

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-11-14 19:39:22,118] A new study created in memory with name: no-name-5578f5ff-28a0-4880-8838-e62904b7f912
[I 2024-11-14 19:39:23,141] Trial 0 finished with value: 165207.2651342111 and parameters: {'alpha': 0.009295481840878033, 'l1_ratio': 0.1572165653737818, 'steps': 6826}. Best is trial 0 with value: 165207.2651342111.
[I 2024-11-14 19:39:24,136] Trial 1 finished with value: 165179.0774447846 and parameters: {'alpha': 0.004332597821931924, 'l1_ratio': 0.1863326864556223, 'steps': 4472}. Best is trial 1 with value: 165179.0774447846.
[I 2024-11-14 19:39:25,016] Trial 2 finished with value: 165212.89251923922 and parameters: {'alpha': 0.0045977383985998, 'l1_ratio': 0.07584480554714963, 'steps': 3910}. Best is trial 1 with value: 165179.0774447846.
[I 2024-11-14 19:39:26,176] Trial 3 finished with value: 165183.9760687923 and parameters: {'alpha': 0.004661381166586781, 'l1_ratio': 0.1846933171525582, 'steps': 7897}. Best is tri

Best parameters: {'alpha': 0.002462542542936483, 'l1_ratio': 0.06291668597213018, 'steps': 1009}
Best MSE: 165052.03489889082




Unnamed: 0,Metric,Value
0,MAE,184.06911
1,MSE,659686.823998
2,RMSE,812.211071
3,R²,0.272413
4,MAPE,191.121426
