In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from scipy import stats
from sklearn.preprocessing import QuantileTransformer

# Apply the 'ggplot' style sheet to all matplotlib figures drawn below.
plt.style.use('ggplot')

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Print the full path of every file found under the Kaggle input directory.
import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/test.csv',
    )
)

In [None]:
# Print column dtypes, non-null counts and memory usage of the training table.
train_data.info()

In [None]:
# Print column dtypes, non-null counts and memory usage of the test table.
test_data.info()

In [None]:
# Scatter the target against the row id to check whether the id carries any signal.
plt.figure(figsize=(15,10))
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=train_data['id'], y=train_data['target'])
plt.title('Target against ID')
plt.show()

Таргет никак не зависит от ID, поэтому ID можно спокойно не использовать в дальнейшем анализе.

In [None]:
# Columns analysed for the train table: everything except the row id (target included).
train_cols = [col for col in train_data.columns if col != 'id']
# The same list without the target, for use with the test table.
# 'target' is removed by name rather than by position, so the result stays correct
# even if the column order changes or this cell is re-run after new columns were added.
test_cols = [col for col in train_cols if col != 'target']

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric train columns.
train_data.describe()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric test columns.
test_data.describe()

Все признаки имеют одинаковый масштаб; значения min и max в train и test примерно совпадают.

In [None]:
# One 100-bin histogram per column listed in train_cols, drawn on a single large grid.
train_data.loc[:, train_cols].hist(bins=100, figsize=(20, 20))
plt.show()

In [None]:
# One 100-bin histogram per column listed in test_cols, drawn on a single large grid.
test_data.loc[:, test_cols].hist(bins=100, figsize=(20, 20))
plt.show()

Проверим, совпадают ли распределения каждого из признаков в train и test:

In [None]:
# Overlay the train and test density of every feature, one feature per subplot.
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(20,20))
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, test_cols):
    # Label both curves so the two distributions can be told apart in the legend.
    sns.kdeplot(train_data[col], ax=ax, label='train')
    sns.kdeplot(test_data[col], ax=ax, label='test')
    ax.set_title(col)
    ax.legend()

# Hide the grid cells that have no feature to show.
for ax in flat_axes[len(test_cols):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Pairwise scatter plots of all train columns with KDE curves on the diagonal.
# sns.pairplot builds its own figure, so no plt.figure() call precedes it
# (a preceding plt.figure() would only produce an extra empty figure).
# NOTE(review): this plots every row and every column (including 'id'); it can be slow.
sns.pairplot(train_data, diag_kind='kde')
plt.show()

Видно, что у target есть выброс; ниже он отфильтровывается условием target > 0. В остальном каких-то явных корреляций между признаками я не вижу.

In [None]:
# Keep only the rows whose target is strictly positive.
# .copy() makes the result an independent frame, so the column assignments done
# in later cells do not trigger pandas' SettingWithCopyWarning.
train_data = train_data[train_data['target'] > 0].copy()

In [None]:
# Annotated correlation matrix of the train columns listed in train_cols.
train_corr = train_data[train_cols].corr()
plt.figure(figsize=(20, 20))
sns.heatmap(train_corr, square=True, annot=True)
plt.show()

In [None]:
# Annotated correlation matrix of the test columns listed in test_cols.
test_corr = test_data[test_cols].corr()
plt.figure(figsize=(20, 20))
sns.heatmap(test_corr, square=True, annot=True)
plt.show()

Видны явно скоррелированные пары признаков.

In [None]:
# Distribution of cont1: density-normalised histogram with a KDE curve on top.
# sns.histplot replaces the deprecated sns.distplot.
plt.figure(figsize=(15,7))
sns.histplot(train_data['cont1'], kde=True, stat='density')
plt.show()

In [None]:
# Binary (0/1) indicator features marking which value range a continuous feature falls in.
# The thresholds are hand-picked constants (presumably read off the histograms above;
# TODO confirm). Vectorized comparisons replace the row-by-row apply(lambda ...) calls
# and produce the same 0/1 integer columns.
cont1 = train_data['cont1']
train_data['cont1_level1'] = (cont1 <= 0.41).astype(int)
train_data['cont1_level2'] = ((cont1 > 0.41) & (cont1 <= 0.57)).astype(int)
train_data['cont1_level3'] = ((cont1 > 0.57) & (cont1 <= 0.78)).astype(int)
train_data['cont1_level4'] = (cont1 > 0.78).astype(int)

train_data['cont5_level1'] = (train_data['cont5'] < 0.29).astype(int)

train_data['cont14_level1'] = (train_data['cont14'] < 0.53).astype(int)
train_data['cont13_level1'] = (train_data['cont13'] < 0.41).astype(int)

In [None]:
# cont13_level1 is a 0/1 indicator, so show how many rows fall into each class.
# A count plot replaces the deprecated sns.distplot, whose KDE curve is not meaningful
# for a two-valued column.
plt.figure(figsize=(15,7))
sns.countplot(x=train_data['cont13_level1'])
plt.show()

In [None]:
# Names of the binary indicator features derived from cont1, cont5, cont13 and cont14.
new_cols = [
    'cont1_level1',
    'cont1_level2',
    'cont1_level3',
    'cont1_level4',
    'cont5_level1',
    'cont13_level1',
    'cont14_level1',
]

In [None]:
# Add a quantile-transformed copy ('quantile_<name>') of every original feature.
# A fixed random_state keeps the transformer's internal subsampling reproducible.
transformer = QuantileTransformer(random_state=42)

# Iterate over test_cols (the feature columns without 'target'): looping over train_cols
# would also create 'quantile_target', a transformed copy of the label that leaks the
# target into the feature set.
for col in test_cols:
    new_col = 'quantile_' + col
    # The transformer is re-fitted for each column; ravel() flattens the (n, 1) result.
    train_data[new_col] = transformer.fit_transform(train_data[[col]]).ravel()

In [None]:
# Model inputs: every column of train_data except the row id and anything that carries
# the label. Columns are excluded by name; the previous positional pops (-15 and 0)
# depended on the exact column order and could leave 'target' among the features.
# 'quantile_target' is listed in case an earlier cell produced it.
non_feature_cols = {'id', 'target', 'quantile_target'}
train_cols = [col for col in train_data.columns if col not in non_feature_cols]

In [None]:
# Make sure the indicator features are part of the model inputs, without adding a
# column twice (duplicate names would duplicate features in the design matrix).
train_cols += [col for col in new_cols if col not in train_cols]

In [None]:
# Design matrix (columns in train_cols order) and target vector as NumPy arrays.
x = train_data[train_cols].to_numpy()
y = train_data['target'].to_numpy()

In [None]:
# Keep only the first 10000 rows (presumably to keep model fitting fast; TODO confirm).
x, y = x[:10000], y[:10000]

In [None]:
# Annotated correlation matrix of all columns currently listed in train_cols.
feature_corr = train_data[train_cols].corr()
plt.figure(figsize=(20, 20))
sns.heatmap(feature_corr, square=True, annot=True)
plt.show()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Modelling pipeline: standardise the inputs, expand them with PolynomialFeatures
# (default settings), keep the features selected by a Lasso(alpha=0.01) model,
# then fit a random forest on what remains.
# The forest is seeded so that repeated cross-validation runs give the same scores.
pipe = make_pipeline(StandardScaler(),
                     PolynomialFeatures(),
                     SelectFromModel(estimator=Lasso(alpha=0.01)),
                     RandomForestRegressor(random_state=42))

In [None]:
# 3-fold cross-validated MSE of the pipeline on the training split.
# The built-in 'neg_mean_squared_error' scorer returns the negated MSE, hence the minus
# sign. It replaces make_scorer(mean_squared_error): make_scorer is not imported anywhere
# in the visible notebook, so the original call raised NameError.
cv_mse = -cross_val_score(pipe, x_train, y_train, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=True)
print('MSE per fold:', cv_mse)

# RMSE from the mean fold MSE, computed from the scores above rather than from numbers
# copied by hand out of an earlier output.
np.sqrt(cv_mse.mean())

In [None]:
# Repeat of the 3-fold pipeline cross-validation; displays the MSE of each fold.
# Uses the built-in 'neg_mean_squared_error' scorer (negated back to positive MSE)
# because make_scorer is not imported anywhere in the visible notebook.
-cross_val_score(pipe, x_train, y_train, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=True)

In [None]:
# Standardise the features: statistics are fitted on the training split only and then
# applied to the hold-out split.
# NOTE: x_train / x_test are overwritten with their scaled versions; the cells below
# work on the scaled arrays.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Random-forest baseline with default hyper-parameters and a fixed seed.
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)

# Hold-out RMSE; arguments follow the documented (y_true, y_pred) order.
y_pred = rf.predict(x_test)
print('RMSE = ', np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# 3-fold cross-validated MSE of the random forest on the training split.
# Uses the built-in 'neg_mean_squared_error' scorer (negated back to positive MSE)
# because make_scorer is not imported anywhere in the visible notebook.
-cross_val_score(rf, x_train, y_train, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=True)

In [None]:
# (column name, importance) pairs from the fitted random forest, most important first.
sorted(zip(train_cols, rf.feature_importances_), key=lambda pair: pair[1], reverse=True)

In [None]:
# Lasso baseline (alpha=0.01) fitted on the training split.
lasso = Lasso(alpha=0.01, random_state=42)
lasso.fit(x_train, y_train)

# Hold-out RMSE; arguments follow the documented (y_true, y_pred) order.
y_pred = lasso.predict(x_test)
print('RMSE = ', np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# (column name, coefficient) pairs from the fitted Lasso, largest coefficient first.
sorted(zip(train_cols, lasso.coef_), key=lambda pair: pair[1], reverse=True)