In [7]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [8]:
train['store_id'] = pd.factorize(train['store_sqft'])[0] + 1

# *product_id = нумерация уникальных gross_weight&"_"&recyclable_package&low_fat&units_per_case
train['product_id'] = pd.factorize(
    train['gross_weight'].astype(str) + "_" +
    train['recyclable_package'].astype(str) + "_" +
    train['low_fat'].astype(str) + "_" +
    train['units_per_case'].astype(str))[0] + 1

# *campain_id = нумерация уникальных cost
#train['campain_id'] = pd.factorize(train['cost'])[0] + 1

# *client_profile_id = нумерация уникальных *store_id&"_"&total_children&num_children_at_home&avg_cars_at home(approx),1
train['client_profile_id'] = pd.factorize(
    train['store_id'].astype(str) + "_" +
    train['total_children'].astype(str) + "_" +
    train['num_children_at_home'].astype(str) + "_" +
    train['avg_cars_at home(approx).1'].astype(str))[0] + 1

# *total_sales_in_mln = max для store_sales(in millions) с группировкой по *product_id
total_sales_in_mln = train.groupby('product_id')['store_sales(in millions)'].transform('max')
train['total_sales_in_mln'] = total_sales_in_mln

# *total_sales_in_units = max для unit_sales(in millions) с группировкой по *product_id
total_sales_in_units = train.groupby('product_id')['unit_sales(in millions)'].transform('max')
train['total_sales_in_units'] = total_sales_in_units

In [10]:
test['store_id'] = pd.factorize(test['store_sqft'])[0] + 1

# *product_id = нумерация уникальных gross_weight&"_"&recyclable_package&low_fat&units_per_case
test['product_id'] = pd.factorize(
    test['gross_weight'].astype(str) + "_" +
    test['recyclable_package'].astype(str) + "_" +
    test['low_fat'].astype(str) + "_" +
    test['units_per_case'].astype(str))[0] + 1

# *campain_id = нумерация уникальных cost
#train['campain_id'] = pd.factorize(train['cost'])[0] + 1

# *client_profile_id = нумерация уникальных *store_id&"_"&total_children&num_children_at_home&avg_cars_at home(approx),1
test['client_profile_id'] = pd.factorize(
    test['store_id'].astype(str) + "_" +
    test['total_children'].astype(str) + "_" +
    test['num_children_at_home'].astype(str) + "_" +
    test['avg_cars_at home(approx).1'].astype(str))[0] + 1

# *total_sales_in_mln = max для store_sales(in millions) с группировкой по *product_id
total_sales_in_mln = test.groupby('product_id')['store_sales(in millions)'].transform('max')
test['total_sales_in_mln'] = total_sales_in_mln

# *total_sales_in_units = max для unit_sales(in millions) с группировкой по *product_id
total_sales_in_units = test.groupby('product_id')['unit_sales(in millions)'].transform('max')
test['total_sales_in_units'] = total_sales_in_units

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360336 entries, 0 to 360335
Data columns (total 22 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          360336 non-null  int64  
 1   store_sales(in millions)    360336 non-null  float64
 2   unit_sales(in millions)     360336 non-null  float64
 3   total_children              360336 non-null  float64
 4   num_children_at_home        360336 non-null  float64
 5   avg_cars_at home(approx).1  360336 non-null  float64
 6   gross_weight                360336 non-null  float64
 7   recyclable_package          360336 non-null  float64
 8   low_fat                     360336 non-null  float64
 9   units_per_case              360336 non-null  float64
 10  store_sqft                  360336 non-null  float64
 11  coffee_bar                  360336 non-null  float64
 12  video_store                 360336 non-null  float64
 13  salad_bar     

In [None]:
scaler = StandardScaler()

X = train.drop(['cost', 'id'], axis=1)
y_train = train['cost']

X_train = scaler.fit_transform(X)
X_test = scaler.transform(test.drop(['id'], axis=1))

model = RandomForestRegressor(
    random_state=42,
    max_depth=15,
    min_samples_leaf=4,
    min_samples_split=15,
    n_estimators=200,
    bootstrap=True,
)

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

r2_train = r2_score(y_train, y_train_pred)
rmse_train = mean_squared_error(y_train, y_train_pred)

print(f"R^2 train: {r2_train:.4f}")
print(f"RMSE train: {rmse_train:.4f}")

R^2 train: 0.2325
RMSE train: 688.0046


In [13]:
submission = pd.DataFrame(
    {
        'id': test['id'], 
        'cost': y_test_pred
    })

submission.to_csv('submission.csv', index=False)