In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Mercedes-Benz Greener Manufacturing

In [None]:
!pip install -U lightautoml

# Other libraries

In [None]:
# Standard python libraries
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import torch

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import norm, skew #for some statistics

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

# Constants

In [None]:
# Notebook-wide configuration.
N_THREADS = 4 # number of vCPUs used to build the LightAutoML model (also passed to torch.set_num_threads)
N_FOLDS = 5 # folds; NOTE(review): not referenced anywhere else in the visible notebook
RANDOM_STATE = 42  # seed for numpy and for train_test_split
TEST_SIZE = 0.2  # hold-out fraction passed to train_test_split
TIMEOUT = 600  # passed as `timeout` to TabularUtilizedAutoML; presumably seconds — confirm in the LightAutoML docs
TARGET_NAME = 'y'  # name of the target column

In [None]:
# Seed every random number generator the notebook imports so reruns are
# repeatable, then cap the number of CPU threads torch may use.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

# Data loading

In [None]:
%%time

train_data = pd.read_csv('/kaggle/input/mercedes-benz-greener-manufacturing/train.csv.zip')
test_data = pd.read_csv("/kaggle/input/mercedes-benz-greener-manufacturing/test.csv.zip")
submission = pd.read_csv("/kaggle/input/mercedes-benz-greener-manufacturing/sample_submission.csv.zip")

In [None]:
%%time

train_data.shape, test_data.shape, submission.shape

In [None]:
# Preview the first five training rows.
train_data.head(n=5)

In [None]:
# Plot the target values in ascending order to make extreme values visible.
sorted_target = np.sort(train_data.y.values)
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(range(len(sorted_target)), sorted_target)
ax.set_xlabel('index', fontsize=12)
ax.set_ylabel('y', fontsize=12)
plt.show()

Видим одно выбивающееся значение. Ограничим целевую переменную сверху значением 180 и построим гистограмму её распределения.

In [None]:
ulimit = 180
# Cap the target at `ulimit`. A single .loc call with a boolean mask writes
# into train_data itself; the previous chained `train_data['y'].iloc[mask]`
# form used a positional indexer, which does not accept a boolean Series.
train_data.loc[train_data['y'] > ulimit, 'y'] = ulimit

# Histogram of the capped target.
plt.figure(figsize=(12,8))
sns.distplot(train_data.y.values, bins=50, kde=False)
plt.xlabel('y value', fontsize=12)
plt.show()

Теперь давайте посмотрим на тип данных всех переменных, присутствующих в наборе данных

In [None]:
# One row per column with its dtype; the first column holds the column names
# and, after the group-by below, the number of columns of each dtype.
dtype_data = train_data.dtypes.reset_index()
dtype_data.columns = ["Count", "Column Type"]
dtype_data.groupby("Column Type").count().reset_index()

Таким образом, большинство столбцов — целочисленные; кроме них есть 8 категориальных столбцов и 1 столбец с плавающей запятой (целевая переменная).

In [None]:
# Rows labelled 0 through 10 (label slicing is inclusive) of the dtype table.
dtype_data.loc[0:10]

In [None]:
# Total number of missing cells in the training frame.
train_data.isna().to_numpy().sum()

In [None]:
# Columns of the training frame that hold at most one distinct value.
is_constant = (train_data.nunique() <= 1).to_numpy()
one_value_cols = train_data.columns[is_constant].tolist()
print(f'There are {len(one_value_cols)} columns in train dataset with one unique value.')

In [None]:
# Display the names of the constant training columns found above.
one_value_cols

In [None]:
# Remove the constant training columns from the training frame (in place).
train_data.drop(columns=one_value_cols, inplace=True)

In [None]:
# Remove the same columns from the test frame so both frames keep matching features.
test_data.drop(columns=one_value_cols, inplace=True)

In [None]:
# Columns of the test frame that hold at most one distinct value.
is_constant_test = (test_data.nunique() <= 1).to_numpy()
one_value_cols_test = test_data.columns[is_constant_test].tolist()
print(f'There are {len(one_value_cols_test)} columns in test dataset with one unique value.')

In [None]:
# Drop the columns that are constant in the test frame from both frames (in place).
for frame in (train_data, test_data):
    frame.drop(columns=one_value_cols_test, inplace=True)

**Рассмотрим категориальные данные**

In [None]:
def plot_target_by_category(data, var, plot_fn, target="y"):
    """Plot the distribution of `target` for each level of the column `var`.

    Args:
        data: DataFrame containing the columns `var` and `target`.
        var: Name of the categorical column shown on the x axis.
        plot_fn: Seaborn categorical plotting function to use
            (e.g. ``sns.stripplot``, ``sns.boxplot``, ``sns.violinplot``).
        target: Name of the column shown on the y axis.
    """
    # Sort the levels so the x axis has a stable, readable order.
    level_order = np.sort(data[var].unique()).tolist()
    fig, ax = plt.subplots(figsize=(12, 6))
    plot_fn(x=var, y=target, data=data, order=level_order, ax=ax)
    ax.set_xlabel(var, fontsize=12)
    ax.set_ylabel(target, fontsize=12)
    ax.set_title("Distribution of " + target + " variable with " + var, fontsize=15)
    plt.show()


# Plot type used for each categorical column.
CATEGORICAL_PLOTS = {
    "X0": sns.stripplot,
    "X1": sns.stripplot,
    "X2": sns.boxplot,
    "X3": sns.violinplot,
    "X4": sns.violinplot,
    "X5": sns.boxplot,
    "X6": sns.boxplot,
    "X8": sns.boxplot,
}

for var, plot_fn in CATEGORICAL_PLOTS.items():
    plot_target_by_category(train_data, var, plot_fn)

**Бинарные данные**

In [None]:
# Group the remaining (non-ID, non-target, non-categorical) columns by the
# sorted list of distinct values they contain; the key is that list as a string.
skipped_cols = ("ID", "y", "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8")
unique_value_dict = {}
for col in train_data.columns:
    if col in skipped_cols:
        continue
    signature = str(np.sort(train_data[col].unique()).tolist())
    unique_value_dict.setdefault(signature, []).append(col)

for unique_val, columns in unique_value_dict.items():
    print("Columns containing the unique values : ",unique_val)
    print(columns)
    print("-----------------------------------------------------------")

In [None]:
# For every column whose values are exactly {0, 1}, count zeros and ones and
# draw them as one stacked horizontal bar per column.
col_list = unique_value_dict['[0, 1]']
zero_list = [(train_data[col] == 0).sum() for col in col_list]
one_list = [(train_data[col] == 1).sum() for col in col_list]

arr = np.arange(len(col_list))
width = 0.35
fig, ax = plt.subplots(figsize=(6, 100))
plot_1 = ax.barh(arr, zero_list, width, color='red')
# `left=zero_list` stacks the ones to the right of the zeros.
plot_2 = ax.barh(arr, one_list, width, left=zero_list, color="blue")
ax.set_yticks(arr)
ax.set_yticklabels(col_list)
ax.legend((plot_1[0], plot_2[0]), ('Zero count', 'One Count'))
plt.show()

In [None]:
# Scatter of the target against ID with a fitted regression line.
var = "ID"
fig, ax = plt.subplots(figsize=(12, 6))
sns.regplot(x=var, y='y', data=train_data, scatter_kws={'alpha':0.5, 's':30}, ax=ax)
ax.set_xlabel(var, fontsize=12)
ax.set_ylabel('y', fontsize=12)
ax.set_title("Distribution of y variable with "+var, fontsize=15)
plt.show()

In [None]:
# Keep only the first occurrence of each fully identical row.
train_data = train_data[~train_data.duplicated()]

In [None]:
# Box plot of the target before the outlier filter.
sns.boxplot(train_data['y'])

In [None]:
# Remove outliers: keep only rows whose target is at most 136.
within_limit = train_data['y'] <= 136
train_data = train_data.loc[within_limit].reset_index(drop=True)

In [None]:
# Box plot of the target after the outlier filter.
sns.boxplot(train_data['y'])

In [None]:
# Histogram of the target with a fitted normal curve overlaid.
sns.distplot(train_data['y'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train_data['y'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution.
# Raw string: "\m" and "\s" are not valid Python escapes, and the backslashes
# must reach matplotlib's mathtext unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
# The plotted column is the target `y`, so the title names it.
plt.title('y distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train_data['y'], plot=plt)
plt.show()

# Features

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

In [None]:
def get_additional_features(train, test, magic=False, ID=False, n_comp=12, random_state=420):
    """Append decomposition / random-projection features and an optional target-mean feature.

    Five transformers (TruncatedSVD, PCA, FastICA, GaussianRandomProjection and
    SparseRandomProjection) are fitted on the train rows and applied to both
    frames. Component ``i`` of each is stored as ``tsvd_i``, ``pca_i``,
    ``ica_i``, ``grp_i`` and ``srp_i``.

    Args:
        train: Training frame. Must contain every numeric/bool column of
            ``test`` and, when ``magic`` is true, the columns ``X0`` and ``y``.
        test: Test frame; its numeric/bool columns define the transformer inputs.
        magic: If true, add a ``magic`` column holding the mean of ``y`` per
            ``X0`` level, computed on ``train``.
        ID: If true, keep the ``ID`` column among the transformer inputs.
        n_comp: Number of components produced by each transformer.
        random_state: Seed passed to every transformer.

    Returns:
        Tuple ``(train, test)``. The component columns are added to the frames
        that were passed in; when ``magic`` is true the returned frames are the
        new objects produced by ``merge``.
    """
    # The transformers need numeric input, so string/object columns (raw
    # categoricals and any string features derived from them) are left out.
    col = list(test.select_dtypes(include=['number', 'bool']).columns)
    if not ID and 'ID' in col:
        col.remove('ID')

    transformers = {
        'tsvd': TruncatedSVD(n_components=n_comp, random_state=random_state),
        'pca': PCA(n_components=n_comp, random_state=random_state),
        'ica': FastICA(n_components=n_comp, random_state=random_state),
        'grp': GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=random_state),
        'srp': SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=random_state),
    }

    # Fit each transformer on train only, then project both frames. All
    # projections are computed before any column is added, so the new columns
    # never feed into another transformer.
    projected = {
        prefix: (model.fit_transform(train[col]), model.transform(test[col]))
        for prefix, model in transformers.items()
    }

    # Column order: all five prefixes for component 1, then component 2, ...
    for i in range(1, n_comp + 1):
        for prefix, (train_part, test_part) in projected.items():
            name = prefix + '_' + str(i)
            train[name] = train_part[:, i - 1]
            test[name] = test_part[:, i - 1]

    if magic:
        # Mean target per X0 level. It is computed over all train rows, so each
        # train row's own target contributes to its `magic` value.
        magic_mat = train.groupby('X0')['y'].mean().rename('magic').reset_index()
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        # X0 levels absent from train get the average of the per-level means.
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test

In [None]:
def feature_creation(df):
    """Add pairwise string-concatenation features of the categorical columns.

    For every ordered pair (a, b) drawn from X0, X1, X2, X3, X5, X6 and X8
    (including a == b), a column named ``"a_b"`` is added whose values are the
    string forms of the two columns joined by an underscore. The frame is
    modified in place and also returned.
    """
    categorical = ('X0', 'X1', 'X2', 'X3', 'X5', 'X6', 'X8')
    # The source columns are never overwritten, so convert each one only once.
    as_text = {name: df[name].astype('str') for name in categorical}
    for left in categorical:
        for right in categorical:
            df[left + "_" + right] = as_text[left] + "_" + as_text[right]
    return df

In [None]:
# Add the pairwise categorical features to the test frame, then to the train frame.
test_data, train_data = (feature_creation(frame) for frame in (test_data, train_data))

In [None]:
# Pass the frames that actually exist in the notebook (`train` and `test`
# are never defined) and add the projection and `magic` features to them.
train_data, test_data = get_additional_features(train_data, test_data, magic=True)

In [None]:
# (rows, columns) of the three frames after feature engineering.
tuple(frame.shape for frame in (train_data, test_data, submission))

In [None]:
# Preview the first five training rows including the new features.
train_data.head(n=5)

# Data splitting for train

In [None]:
%%time
#В этой версии уже не использовала, т.к. уже предварительно проверила
# что за полное время результаты лучше и сразу учу на всей дате

tr_data, te_data = train_test_split(train_data, 
                                    test_size=TEST_SIZE, 
                                    #stratify=train_data[TARGET_NAME], 
                                    random_state=RANDOM_STATE)
print('Data splitted. Parts sizes: tr_data = {}, te_data = {}'.format(tr_data.shape, te_data.shape))

# AutoML

In [None]:
# Regression task that uses 'rmsle' as both the training loss and the metric.
task = Task('reg', loss='rmsle', metric='rmsle')

In [None]:
%%time

roles = {
    'target': TARGET_NAME,
    'drop': ['ID'],
}

In [None]:
%%time 

automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       reader_params = {'n_jobs': N_THREADS})

oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

In [None]:
%%time

test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(r2_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0]) ** 0.5))

# Prepare submission

In [None]:
# Write the first column of the test predictions into the target column of the submission frame.
submission[TARGET_NAME] = test_pred.data[:, 0]

In [None]:
# Save the submission without the DataFrame index column.
submission.to_csv('auto_ml_mercedes2.csv', index=False)