In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

In [2]:
train_df = pd.read_csv('content/train.csv')
test_df = pd.read_csv('content/test.csv')

## 1. Описание признаков

In [23]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360336 entries, 0 to 360335
Data columns (total 17 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          360336 non-null  int64  
 1   store_sales(in millions)    360336 non-null  float64
 2   unit_sales(in millions)     360336 non-null  float64
 3   total_children              360336 non-null  float64
 4   num_children_at_home        360336 non-null  float64
 5   avg_cars_at home(approx).1  360336 non-null  float64
 6   gross_weight                360336 non-null  float64
 7   recyclable_package          360336 non-null  float64
 8   low_fat                     360336 non-null  float64
 9   units_per_case              360336 non-null  float64
 10  store_sqft                  360336 non-null  float64
 11  coffee_bar                  360336 non-null  float64
 12  video_store                 360336 non-null  float64
 13  salad_bar     

In [24]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240224 entries, 0 to 240223
Data columns (total 16 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          240224 non-null  int64  
 1   store_sales(in millions)    240224 non-null  float64
 2   unit_sales(in millions)     240224 non-null  float64
 3   total_children              240224 non-null  float64
 4   num_children_at_home        240224 non-null  float64
 5   avg_cars_at home(approx).1  240224 non-null  float64
 6   gross_weight                240224 non-null  float64
 7   recyclable_package          240224 non-null  float64
 8   low_fat                     240224 non-null  float64
 9   units_per_case              240224 non-null  float64
 10  store_sqft                  240224 non-null  float64
 11  coffee_bar                  240224 non-null  float64
 12  video_store                 240224 non-null  float64
 13  salad_bar     

*Таблица 1 - Описание признаков исходного датасета*

|Признак|Описание|Тип данных|Категориальный|
|-------|--------|----------|--------------|
|`id`|Уникальный идентификатор для каждой записи|int64|Нет|
|`store_sales(in millions)`|Продажи в магазине в миллионах долларов| float64|Нет|
|`unit_sales(in millions)`|Объем продаж в миллионах единиц в магазине| float64|Нет|
|`total_children`|Общее количество детей|float64| Нет|
|`num_children_at_home`|Количество детей, проживающих дома, по данным, предоставленным клиентами|float64|Нет|
|`avg_cars_at home(approx)`|Среднее количество автомобилей|float64|Нет|
|`gross_weight`|Общий вес товара|float64|Нет|
|`recyclable_package`|Указывает, является ли упаковка товара перерабатываемой (0 или 1)|float64|Да|
|`low_fat`|Указывает, является ли продукт низкокалорийным (0 или 1)| float64|Да|
|`units_per_case`|Количество единиц в упаковке, доступных на полках магазина|float64|Нет|
|`store_sqft`|Площадь магазина в квадратных футах|float64|Нет|
|`coffee_bar`|Указывает, есть ли в магазине кофейня (0 или 1)|float64|Да|
|`video_store`|Указывает, есть ли в магазине видеомагазин или игровая зона (0 или 1)|float64|Да|
|`salad_bar`|Указывает, есть ли в магазине салат-бар (0 или 1)| float64| Да|
|`prepared_food`|Указывает, доступна ли в магазине готовая еда (0 или 1)| float64|Да|
|`florist`|Указывает, есть ли в магазине цветочный отдел (0 или 1)| float64|Да|
|`cost`|Стоимость привлечения клиента в долларах|float64|Нет|

**cost - целевая переменная (target)**

In [25]:
train_df.head(10)

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
0,0,8.61,3.0,2.0,2.0,2.0,10.3,1.0,0.0,32.0,36509.0,0.0,0.0,0.0,0.0,0.0,62.09
1,1,5.0,2.0,4.0,0.0,3.0,6.66,1.0,0.0,1.0,28206.0,1.0,0.0,0.0,0.0,0.0,121.8
2,2,14.08,4.0,0.0,0.0,3.0,21.3,1.0,0.0,26.0,21215.0,1.0,0.0,0.0,0.0,0.0,83.51
3,3,4.02,3.0,5.0,0.0,0.0,14.8,0.0,1.0,36.0,21215.0,1.0,0.0,0.0,0.0,0.0,66.78
4,4,2.13,3.0,5.0,0.0,3.0,17.0,1.0,1.0,20.0,27694.0,1.0,1.0,1.0,1.0,1.0,111.51
5,5,9.08,4.0,5.0,5.0,3.0,7.26,0.0,1.0,5.0,33858.0,1.0,0.0,1.0,1.0,1.0,142.58
6,6,4.8,2.0,1.0,0.0,2.0,9.58,0.0,0.0,6.0,27694.0,1.0,1.0,1.0,1.0,1.0,69.47
7,7,4.29,3.0,2.0,0.0,2.0,16.9,1.0,0.0,2.0,23688.0,1.0,1.0,1.0,1.0,1.0,68.84
8,8,8.55,3.0,5.0,0.0,2.0,13.8,1.0,0.0,6.0,38382.0,0.0,0.0,0.0,0.0,0.0,87.07
9,9,3.08,4.0,1.0,0.0,3.0,15.7,1.0,1.0,9.0,27694.0,1.0,1.0,1.0,1.0,1.0,80.29


In [26]:
train_df.describe()

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
count,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0
mean,180167.5,6.337376,3.043881,2.456482,0.68939,2.203813,13.822071,0.568086,0.327797,18.972706,28180.333442,0.564839,0.277394,0.504807,0.504832,0.503191,99.614729
std,104020.187637,3.30798,0.784676,1.488992,1.214732,1.084293,4.614792,0.495343,0.469411,10.212912,5968.874074,0.495779,0.447713,0.499978,0.499977,0.499991,29.939435
min,0.0,0.51,1.0,0.0,0.0,0.0,6.0,0.0,0.0,1.0,20319.0,0.0,0.0,0.0,0.0,0.0,50.79
25%,90083.75,3.72,3.0,1.0,0.0,1.0,9.71,0.0,0.0,10.0,23593.0,0.0,0.0,0.0,0.0,0.0,70.32
50%,180167.5,5.78,3.0,2.0,0.0,2.0,13.6,1.0,0.0,20.0,27694.0,1.0,0.0,1.0,1.0,1.0,98.81
75%,270251.25,8.4,4.0,4.0,1.0,3.0,17.7,1.0,1.0,28.0,33858.0,1.0,1.0,1.0,1.0,1.0,126.62
max,360335.0,22.92,6.0,5.0,5.0,4.0,21.9,1.0,1.0,36.0,39696.0,1.0,1.0,1.0,1.0,1.0,149.75


In [5]:
print(train_df.shape)
print(test_df.shape)

(360336, 17)
(240224, 16)


In [81]:
missing_values = train_df.isna().sum()

missing_values

id                            0
store_sales(in millions)      0
unit_sales(in millions)       0
total_children                0
num_children_at_home          0
avg_cars_at home(approx).1    0
gross_weight                  0
recyclable_package            0
low_fat                       0
units_per_case                0
store_sqft                    0
coffee_bar                    0
video_store                   0
salad_bar                     0
prepared_food                 0
florist                       0
cost                          0
dtype: int64

In [3]:
category_columns = ['recyclable_package', 'low_fat', 'coffee_bar', 'video_store', 'salad_bar', 'prepared_food', 'florist']
for column in category_columns:
    train_df[column] = train_df[column].astype(int)
    test_df[column] = test_df[column].astype(int)

train_df = pd.get_dummies(train_df, columns=category_columns, drop_first=True)
test_df = pd.get_dummies(test_df, columns=category_columns, drop_first=True)

train_df.head()

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,units_per_case,store_sqft,cost,recyclable_package_1,low_fat_1,coffee_bar_1,video_store_1,salad_bar_1,prepared_food_1,florist_1
0,0,8.61,3.0,2.0,2.0,2.0,10.3,32.0,36509.0,62.09,True,False,False,False,False,False,False
1,1,5.0,2.0,4.0,0.0,3.0,6.66,1.0,28206.0,121.8,True,False,True,False,False,False,False
2,2,14.08,4.0,0.0,0.0,3.0,21.3,26.0,21215.0,83.51,True,False,True,False,False,False,False
3,3,4.02,3.0,5.0,0.0,0.0,14.8,36.0,21215.0,66.78,False,True,True,False,False,False,False
4,4,2.13,3.0,5.0,0.0,3.0,17.0,20.0,27694.0,111.51,True,True,True,True,True,True,True


In [6]:
scaler = StandardScaler()
train_df_scaled = train_df.copy()
test_df_scaled = test_df.copy()

numeric_features = ['store_sales(in millions)', 'unit_sales(in millions)', 'total_children', 'num_children_at_home', 
                    'avg_cars_at home(approx).1', 'gross_weight', 'units_per_case', 'store_sqft']

train_df_scaled[numeric_features] = scaler.fit_transform(train_df[numeric_features])
test_df_scaled[numeric_features] = scaler.transform(test_df[numeric_features])

X = train_df_scaled.drop(columns=['id', 'cost'])
y = train_df_scaled['cost']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
train_df_scaled.head()

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,units_per_case,store_sqft,cost,recyclable_package_1,low_fat_1,coffee_bar_1,video_store_1,salad_bar_1,prepared_food_1,florist_1
0,0,0.687014,-0.055923,-0.306572,1.078931,-0.187968,-0.763214,1.275573,1.395352,62.09,True,False,False,False,False,False,False
1,1,-0.404289,-1.330336,1.036621,-0.567525,0.734293,-1.551983,-1.759805,0.0043,121.8,True,False,True,False,False,False,False
2,2,2.340593,1.21849,-1.649764,-0.567525,0.734293,1.620428,0.68808,-1.166944,83.51,True,False,True,False,False,False,False
3,3,-0.700542,-0.055923,1.708217,-0.567525,-2.032491,0.211912,1.667234,-1.166944,66.78,False,True,True,False,False,False,False
4,4,-1.271889,-0.055923,1.708217,-0.567525,0.734293,0.688641,0.100588,-0.081478,111.51,True,True,True,True,True,True,True


In [8]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print(mean_squared_log_error(y_val, y_pred))

0.10072959125172938


In [13]:
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_val)
print(mean_squared_log_error(y_val, y_pred_rf))

0.09591046974280769
