# Лабораторная работа №2. Линейная регрессия
Выполнил Ширяев Н. А., группа М8О-308Б-22

## Загрузка данных

In [60]:
import numpy as np
import pandas as pd
import os


# Load the train/test splits. The raw CSVs use "-" as a missing-value marker;
# passing it as `na_values` makes pandas treat it as NaN while parsing, so the
# affected columns are inferred as numeric instead of staying string-typed
# (which is what a post-hoc `.replace("-", np.nan)` would leave behind).
train = pd.read_csv("data/train.csv", na_values="-")
test = pd.read_csv("data/test.csv", na_values="-")

# Name of the regression target column (present only in the train split).
target_col = "source_attractiveness"

print("Train columns:", train.columns)
print("Test columns:", test.columns)

Train columns: Index(['Unnamed: 0', 'category', 'clicks', 'likes', 'buys', '4xx_errors',
       '5xx_errors', 'complaints_count', 'average_dwelltime',
       'source_attractiveness', 'date_of_registration'],
      dtype='object')
Test columns: Index(['ID', 'category', 'clicks', 'likes', 'buys', '4xx_errors', '5xx_errors',
       'complaints_count', 'average_dwelltime', 'date_of_registration'],
      dtype='object')


## Анализ данных

In [61]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots


# Report the number of missing values per column in both splits
print("Train missing values:\n", train.isna().sum())
print("Test missing values:\n", test.isna().sum())

# Histogram of the target variable
target_hist = go.Histogram(x=train[target_col], nbinsx=50, name='Target Distribution')
fig = go.Figure()
fig.add_trace(target_hist)
fig.update_layout(
    title='Распределение целевой переменной',
    xaxis_title=target_col,
    yaxis_title='Count',
)
fig.show()

# One histogram per numeric feature on a 2x4 subplot grid, filled row by row
numeric_features = ['clicks', 'likes', 'buys', '4xx_errors', '5xx_errors', 'complaints_count', 'average_dwelltime']
grid_cols = 4
fig = make_subplots(rows=2, cols=grid_cols, subplot_titles=numeric_features)
for position, feature in enumerate(numeric_features):
    # divmod yields zero-based (row, col); plotly subplot coordinates are one-based
    grid_row, grid_col = divmod(position, grid_cols)
    fig.add_trace(
        go.Histogram(x=train[feature], name=feature, nbinsx=50),
        row=grid_row + 1,
        col=grid_col + 1,
    )
# Kept as the final expression so the notebook renders the returned figure
fig.update_layout(title_text="Распределения числовых признаков")

Train missing values:
 Unnamed: 0                 0
category                   0
clicks                   273
likes                    376
buys                       0
4xx_errors                 0
5xx_errors                 0
complaints_count         884
average_dwelltime          0
source_attractiveness      0
date_of_registration       0
dtype: int64
Test missing values:
 ID                        0
category                  0
clicks                   70
likes                    87
buys                      0
4xx_errors                0
5xx_errors                0
complaints_count        220
average_dwelltime         0
date_of_registration      0
dtype: int64


## Подготовка и очистка данных

In [62]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


# Remove columns that are not used as model features.
train.drop(columns=['date_of_registration', 'Unnamed: 0'], inplace=True)
test.drop(columns=['date_of_registration'], inplace=True)

# Keep the real test identifiers instead of discarding them: moving 'ID' into
# the index removes it from the feature columns while leaving it reachable as
# `test.index`, which is what the submission is built from. Dropping the column
# would make `test.index` a plain positional 0..n-1 range.
test.set_index('ID', inplace=True)

# Split the training frame into the feature matrix and the target vector.
X = train.drop(columns=[target_col])
y = train[target_col]

# Every feature column that is not explicitly categorical is treated as numeric.
categorical_features = ['category']
numeric_features = [c for c in X.columns if c not in categorical_features]

# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Hold out 20% of the training data for validation (fixed seed for reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

## Обучение и тестирование моделей

In [63]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Random forest regressor chained after the `preprocessor` transformer
forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
model = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", forest)])

# Hold-out evaluation: fit on the training split, score on the validation split
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)
print("Validation MSE:", mse)

# The goal is a validation MSE below 0.02. The refit on the full training data
# runs unconditionally (the threshold is not checked in code), after which the
# test set is scored.
model.fit(X, y)
test_preds = model.predict(test)

Validation MSE: 0.013824563315977994


## Submit

In [64]:
# Build the submission table: the "ID" column is taken from the test frame's
# index and "prediction" from the model output; the CSV is written without
# the DataFrame's own index.
submission = pd.DataFrame({"ID": test.index}).assign(prediction=test_preds)
submission.to_csv("submission.csv", index=False)