In [2]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import RFE

In [4]:
# Load the interim dataset; the `price` column is used as the regression target below.
df = pd.read_csv(filepath_or_buffer='../../data/interim/data_metro.csv')

In [6]:
# Baseline: linear regression on all standardized features, scored by MAPE.
y = df['price']
X = df.drop('price', axis=1)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training part only, then apply the same statistics
# to the held-out part.
scaler_std = StandardScaler()
X_train = scaler_std.fit(X_train).transform(X_train)
X_test = scaler_std.transform(X_test)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'Baseline MAPE: {mape}')

Baseline MAPE: 0.3246299930186311


In [8]:
# Forward sequential feature selection: pick 30 features, scoring candidates
# with 5-fold cross-validated negative MAPE on the training part.
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select=30,
    direction='forward',
    scoring='neg_mean_absolute_percentage_error',
    cv=5,
).fit(X_train, y_train)

# Re-attach the original feature names to the scaled train/test data.
X_test = pd.DataFrame(X_test, columns=X.columns)
X_train = pd.DataFrame(X_train, columns=X.columns)

# Mask over the feature columns marking the features kept by the selector.
selected_features = sfs.get_support()
kept_positions = np.flatnonzero(selected_features)

# Build train/test frames restricted to the selected features.
X_train_selected = X_train.iloc[:, kept_positions]
X_test_selected = X_test.iloc[:, kept_positions]

# Train a fresh linear model on the selected features only.
selected_model = LinearRegression().fit(X_train_selected, y_train)
y_pred_selected = selected_model.predict(X_test_selected)

# Evaluate the reduced model on the held-out part.
selected_mape = mean_absolute_percentage_error(y_test, y_pred_selected)
print(f'MAPE на выбранных признаках: {selected_mape}')

MAPE на выбранных признаках: 0.2683246618468463


In [42]:
# Start the reduced dataset from all feature columns (everything except the target).
df_small = df.drop(columns=['price'])

In [43]:
# Keep only the feature columns chosen by the selector (`selected_features` is
# a mask over the columns of `df` without `price`).
# The frame is rebuilt from `df` instead of from `df_small` itself so this cell
# can be re-run safely: a self-referential `df_small = df_small.loc[...]` fails
# on a second run because the mask no longer matches the column count.
df_small = df.drop(columns=['price']).loc[:, selected_features]

In [44]:
# Append the target column back so the reduced dataset is self-contained.
df_small = df_small.assign(price=df['price'])

In [45]:
# Persist the reduced dataset.
# NOTE(review): index=True is the pandas default, spelled out here; the row
# index is written as an unnamed first column — confirm downstream readers
# expect it.
df_small.to_csv(path_or_buf='../../data/processed/data_small.csv', index=True)

In [48]:
# Rebuild the full-feature split and refit the baseline linear model.
# `scaler_std` is not created here; it must already exist from an earlier cell.
X, y = df.drop('price', axis=1), df['price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Refit the existing scaler on the training part, then apply it to the held-out part.
X_train = scaler_std.fit(X_train).transform(X_train)
X_test = scaler_std.transform(X_test)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'Baseline MAPE: {mape}')

Baseline MAPE: 0.3246299930186311


In [47]:
# Write `df` to the processed data directory.
# NOTE(review): index=True is the pandas default, spelled out here; the row
# index is written as an unnamed first column — confirm downstream readers
# expect it.
df.to_csv(path_or_buf='../../data/processed/data.csv', index=True)