In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition files. train/test use their first column as the index;
# the sample submission keeps a default index.
input_dir = '/kaggle/input/tabular-playground-series-may-2022'
train = pd.read_csv(f'{input_dir}/train.csv', index_col=0)
test = pd.read_csv(f'{input_dir}/test.csv', index_col=0)
submit = pd.read_csv(f'{input_dir}/sample_submission.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Report (rows, columns) for both frames on one line.
print(f'Dataframe shapes: train : {train.shape} test : {test.shape}')

In [None]:
# Summarise the training frame: columns, dtypes and non-null counts.
train.info()

In [None]:
# Number of training rows that exactly duplicate an earlier row.
train.duplicated().sum()

In [None]:
# Missing values per column in the training data.
train.isnull().sum()

In [None]:
# Missing values per column in the test data.
test.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of how many training rows fall into each target class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train, x='target', ax=ax)
ax.set_title('Target distribution')
ax.set_facecolor('white')

In [None]:
# Heatmap of absolute pairwise correlations between the numeric columns.
# Numeric columns are selected explicitly because train still contains the
# string column f_27: pandas >= 2.0 raises on non-numeric data in corr(),
# while older versions silently dropped it. select_dtypes works on both.
tabcorr = train.select_dtypes(include='number').corr()
plt.figure(figsize=(15,15))
sns.heatmap(abs(tabcorr), cmap="coolwarm")

In [None]:
# Names of all columns stored as float64
float_features = [name for name, dtype in train.dtypes.items() if dtype == 'float64']

# One density histogram per float feature on a 4x4 grid (training data)
fig, axs = plt.subplots(nrows=4, ncols=4, figsize=(16, 16))
for ax, name in zip(axs.flat, float_features):
    column = train[name]
    ax.hist(column, bins=100, density=True)
    ax.set_title(f'Train {name}, std={column.std():.1f}')
    ax.set_facecolor('white')
plt.suptitle('Histograms of the float features', y=0.93, fontsize=20)
plt.show()


In [None]:
float_features = train.select_dtypes(include='float64').columns.tolist()

# Annotated correlation heatmap of the float features together with the target
corr_columns = [*float_features, 'target']
plt.figure(figsize=(12, 12))
sns.heatmap(train[corr_columns].corr(), cmap="coolwarm", center=0, annot=True, fmt='.2f')
plt.show()


In [None]:
# Plot dependence between every feature and the target
def plot_mutual_info_diagram(df, features, ncols=4, by_quantile=True, mutual_info=True,
                             title='How the target probability depends on single features',
                             window=15000):
    """Plot a rolling mean of the target against each feature, one panel per feature.

    For every feature the rows are sorted by that feature and the mean of
    ``df['target']`` is taken over a centred rolling window of ``window`` rows.

    Parameters
    ----------
    df : DataFrame
        Must contain every column in ``features`` and a ``target`` column.
    features : iterable of str
        Columns to plot; nothing is drawn when it is empty.
    ncols : int
        Number of panels per row.
    by_quantile : bool
        If True the x axis is the row's rank after sorting, otherwise the
        feature value itself.
    mutual_info : bool
        If True (and ``by_quantile`` is True) the x label also shows the
        entropy of the overall target mean minus the average entropy of the
        rolling means.
    title : str
        Figure title.
    window : int
        Rolling window size in rows (default 15000, the previous fixed value).
    """
    def H(p):
        """Entropy of a binary random variable in nat (0 at p == 0 and p == 1)."""
        # Clip so tiny floating-point drift of a rolling mean cannot leave [0, 1]
        p = np.clip(np.asarray(p, dtype=float), 0.0, 1.0)
        q = 1.0 - p
        # 0 * log(0) is taken as its limit 0; plain arithmetic would give nan
        # and turn the whole averaged entropy (and the mi label) into nan.
        with np.errstate(divide='ignore', invalid='ignore'):
            p_term = np.where(p == 0, 0.0, p * np.log(p))
            q_term = np.where(q == 0, 0.0, q * np.log(q))
        return -(p_term + q_term)

    features = list(features)
    if not features:
        return  # a subplot grid with zero rows cannot be created

    nrows = (len(features) + ncols - 1) // ncols
    # squeeze=False keeps axs two-dimensional so .ravel() also works for one panel
    fig, axs = plt.subplots(nrows, ncols, figsize=(16, nrows*4), sharey=True, squeeze=False)
    panels = axs.ravel()
    for f, ax in zip(features, panels):
        temp = pd.DataFrame({f: df[f].values,
                             'state': df['target'].values})
        # drop=True: the old row labels are not needed, only the rank order
        temp = temp.sort_values(f).reset_index(drop=True)
        rolling_mean = temp.state.rolling(window, center=True, min_periods=1).mean()
        if by_quantile:
            ax.scatter(temp.index, rolling_mean, s=2)
        else:
            ax.scatter(temp[f], rolling_mean, s=2)
        if mutual_info and by_quantile:
            ax.set_xlabel(f'{f} mi={H(temp.state.mean()) - H(rolling_mean[~rolling_mean.isna()].values).mean():.5f}')
        else:
            ax.set_xlabel(f'{f}')
        ax.set_facecolor('white')
    # Hide grid cells that have no feature assigned
    for ax in panels[len(features):]:
        ax.set_visible(False)
    plt.suptitle(title, y=0.90, fontsize=20)
    plt.show()

# Rolling target mean against each float feature.
plot_mutual_info_diagram(
    df=train,
    features=float_features,
    title='How the target probability depends on the float features',
)

In [None]:
from matplotlib.ticker import MaxNLocator

In [None]:
# 'target' is appended explicitly below, so it is excluded here: if it is
# stored as int64 it would otherwise appear twice in the correlation matrix.
int_features = [f for f in train.columns if train[f].dtype == 'int64' and f != 'target']

# Correlation matrix of the integer features
plt.figure(figsize=(12, 12))
sns.heatmap(train[int_features + ['target']].corr(), center=0, annot=True, fmt='.2f',cmap="coolwarm")
plt.show()

In [None]:
# int64 columns of the test frame (so the target is not included), minus any 'id' column
int_features = [name for name, dtype in test.dtypes.items()
                if dtype == 'int64' and name != 'id']

# Bar chart of the value counts of each integer feature in the training data
figure = plt.figure(figsize=(16, 16))
for slot, name in enumerate(int_features, start=1):
    ax = figure.add_subplot(4, 4, slot)
    counts = train[name].value_counts()
    ax.bar(counts.index, counts)
    ax.set_xlabel(f'Train {name}')
    ax.set_facecolor('white')
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # only integer tick labels
plt.suptitle('Histograms of the integer features', y=1.0, fontsize=20)
figure.tight_layout(h_pad=1.0)
plt.show()


In [None]:
# Rolling target mean against each integer feature.
plot_mutual_info_diagram(
    df=train,
    features=int_features,
    title='How the target probability depends on the int features',
)


In [None]:
# Show the first training rows again.
train.head()

In [None]:
# Length of the shortest f_27 string in the training data.
train.f_27.str.len().min()

In [None]:
# How often each distinct f_27 string occurs in the training data.
train.f_27.value_counts()

In [None]:
# For each of the first ten character positions of f_27: group the training
# rows by the character at that position and print the group sizes and mean target.
for pos in range(10):
    by_char = train.groupby(train['f_27'].str[pos])
    summary = pd.DataFrame({
        'size': by_char.size(),
        'probability': by_char['target'].mean().round(2),
    })
    print(f'Position {pos}')
    print(summary)
    print()


In [None]:
# Number of distinct characters in each f_27 string, then group size and
# mean target for every distinct-character count.
unique_characters = train['f_27'].map(lambda text: len(set(text))).rename('unique_characters')
by_count = train.groupby(unique_characters)
summary = pd.DataFrame({
    'size': by_count.size(),
    'probability': by_count['target'].mean().round(2),
})
print(summary)

In [None]:
# From https://www.kaggle.com/ambrosm/tpsmay22-eda-which-makes-sense
# Adds to both frames: ch0..ch9 (code point of the character at each of the
# first ten positions of f_27, relative to 'A') and the distinct-character count.
code_of_a = ord('A')
for df in [train, test]:
    letters = df['f_27']
    for pos in range(10):
        df[f'ch{pos}'] = letters.str[pos].apply(ord) - code_of_a
    df["unique_characters"] = letters.apply(lambda text: len(set(text)))


In [None]:
# Rolling target mean against every column whose name starts with 'ch',
# plus the distinct-character count.
char_features = [col for col in train.columns if col.startswith('ch')]
char_features.append('unique_characters')
plot_mutual_info_diagram(
    train,
    char_features,
    title='How the target probability depends on the character features',
)

In [None]:
from matplotlib.colors import ListedColormap

In [None]:
# f_02 vs f_21 drawn twice on a black background: first with every point in
# the same colour, then coloured by target.
plt.rcParams['axes.facecolor'] = 'k'
plt.figure(figsize=(11, 5))

# Both colormap entries are identical, so the target does not affect the colour
cmap1 = ListedColormap(["#ffd700", "#ffd700"])
ax = plt.subplot(1, 3, 1)
ax.scatter(train['f_02'], train['f_21'], s=1,
           c=train.target, cmap=cmap1)
ax.set_xlabel('f_02')
ax.set_ylabel('f_21')
ax.set_aspect('equal')

cmap = ListedColormap(["#ffd700", "#0057b8"])
# target == 0 → yellow; target == 1 → blue
ax = plt.subplot(1, 3, 2)
ax.scatter(train['f_02'], train['f_21'], s=1,
           c=train.target, cmap=cmap)
ax.set_xlabel('f_02')
ax.set_ylabel('f_21')
ax.set_aspect('equal')

plt.tight_layout(w_pad=1.0)
# Own file name: 'three-projections.png' is written by the three-panel figure
# cell, which would overwrite this image.
plt.savefig('f02-f21-projection.png')
plt.show()
plt.rcParams['axes.facecolor'] = '#0057b8'

In [None]:
# Three scatter plots on a black background, coloured by target and sharing
# both axes with the first panel; the figure is saved as three-projections.png.
plt.rcParams['axes.facecolor'] = 'k'
plt.figure(figsize=(11, 5))
cmap = ListedColormap(["#ffd700", "#0057b8"])
# target == 0 → yellow; target == 1 → blue

# (x values, y values, x label, y label) for each panel, left to right
panels = [
    (train['f_02'], train['f_21'], 'f_02', 'f_21'),
    (train['f_05'], train['f_22'], 'f_05', 'f_22'),
    (train['f_00'] + train['f_01'], train['f_26'], 'f_00 + f_01', 'f_26'),
]

ax0 = None
for slot, (xs, ys, x_label, y_label) in enumerate(panels, start=1):
    if ax0 is None:
        ax = plt.subplot(1, 3, slot)
        ax0 = ax  # later panels share their axes with this one
    else:
        ax = plt.subplot(1, 3, slot, sharex=ax0, sharey=ax0)
    ax.scatter(xs, ys, s=1,
               c=train.target, cmap=cmap)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_aspect('equal')

plt.tight_layout(w_pad=1.0)
plt.savefig('three-projections.png')
plt.show()
plt.rcParams['axes.facecolor'] = '#0057b8' # blue

In [None]:
def _three_way_flag(total, upper, lower):
    """Return 1 where total > upper minus 1 where total < lower (0 in between)."""
    return (total > upper).astype(int) - (total < lower).astype(int)

# Add three ternary interaction features to both frames, each flagging
# whether a sum of features lies above / below fixed thresholds.
for df in [train, test]:
    df['i_02_21'] = _three_way_flag(df.f_21 + df.f_02, 5.2, -5.3)
    df['i_05_22'] = _three_way_flag(df.f_22 + df.f_05, 5.1, -5.4)
    i_00_01_26 = df.f_00 + df.f_01 + df.f_26
    df['i_00_01_26'] = _three_way_flag(i_00_01_26, 5.0, -5.0)

In [None]:
# Column/dtype summary of the training frame after the engineered features were added.
train.info()

In [None]:
#train = train.drop('f_27', axis = 1)
#train = pd.get_dummies(train, columns=["ch0","ch1","ch2","ch3","ch4","ch5","ch6","ch7","ch8","ch9"])

In [None]:
#test = test.drop('f_27', axis = 1)
#test = pd.get_dummies(test, columns=["ch0","ch1","ch2","ch3","ch4","ch5","ch6","ch7","ch8","ch9"])

In [None]:
# Column names of both frames as plain lists
train_c = train.columns.tolist()
test_c = test.columns.tolist()

In [None]:
# Column/dtype summary of the training frame.
train.info()

In [None]:
# Column/dtype summary of the training frame (same call as the previous cell).
train.info()

In [None]:
# Column/dtype summary of the test frame.
test.info()

In [None]:
# Print the columns found in only one of the two frames (ignoring the target):
# first train-only, then test-only.
train_c, test_c = list(train.columns), list(test.columns)
ignored = {"target"}
print(set(train_c) - set(test_c) - ignored)
print(set(test_c) - set(train_c) - ignored)

In [None]:
# Give test a constant uint8 zero column for every non-target column only train has.
for missing in set(train_c) - set(test_c) - {"target"}:
    test[missing] = np.uint8(0)

In [None]:
# Give train a constant uint8 zero column for every non-target column only test has.
for missing in set(test_c) - set(train_c) - {"target"}:
    train[missing] = np.uint8(0)

In [None]:
# Verify the alignment using the frames' CURRENT columns. train_c / test_c
# were captured before the zero-filled columns were added, so printing from
# them would only repeat the earlier result. Both sets should be empty.
print(set(train.columns) - set(test.columns) - {"target"})
print(set(test.columns) - set(train.columns) - {"target"})

In [None]:
# Column/dtype summary of the training frame after the column alignment cells.
train.info()

In [None]:
# Column/dtype summary of the test frame after the column alignment cells.
test.info()

In [None]:
# Separate the label column from the predictors.
y = train['target']
X = train.drop(columns='target')

In [None]:
# Missing values per column in the training data, including the engineered columns.
train.isnull().sum()

In [None]:
# Missing values per column in the test data, including the engineered columns.
test.isnull().sum()

In [None]:
!pip install mljar-supervised

In [None]:
from supervised.automl import AutoML 


In [None]:
# Model inputs: every column except the label; model output: the label.
X_train = train.loc[:, train.columns != "target"]
y_train = train["target"]

In [None]:
# AutoML search over gradient-boosting models with 12-fold stratified,
# shuffled cross-validation, plus stacking and ensembling.
automl = AutoML(
    algorithms=["CatBoost", "Xgboost", "LightGBM"],
    # Ten days expressed in seconds. The original `24**60` (exponentiation)
    # was a typo for `24*60` and produced an astronomically large limit.
    model_time_limit=10 * 24 * 60 * 60,
    start_random_models=10,
    hill_climbing_steps=3,
    top_models_to_improve=4,
    golden_features=True,
    features_selection=True,
    stack_models=True,
    train_ensemble=True,
    boost_on_errors=True,
    kmeans_features=True,
    explain_level=2,
    max_single_prediction_time=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 12,
        "shuffle": True,
        "stratify": True,
    }
)


In [None]:
# Run the AutoML search on the training features and target.
automl.fit(X_train, y_train)

In [None]:
# Show the report produced by the AutoML run.
automl.report()

In [None]:
# Check the test frame (with the engineered columns) before predicting.
test.head()

In [None]:
# Submit the probability of the positive class rather than hard 0/1 labels:
# the competition is scored on ROC AUC, which needs a ranking of the rows.
# predict_proba returns one column per class; column 1 is class 1.
predictions = automl.predict_proba(test)[:, 1]

In [None]:
# Write the predictions into the target column of the submission frame.
submit["target"]= predictions

In [None]:
# Save the submission as submission.csv without writing the row index.
submit.to_csv("submission.csv",index=False)