In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Tabular Playground Series - Sep 2021 : AutoML + PCA
</h1>
</div>
</div>

<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/25226/logos/header.png?t=2021-01-27-17-34-31" alt="">

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Install MLJAR AutoML
</h1>
</div>
</div>

<img src="https://raw.githubusercontent.com/mljar/mljar-examples/master/media/AutoML_overview_mljar_v3.svg" alt="" width = '700'>

**Documentation: https://supervised.mljar.com/**

**Source Code: https://github.com/mljar/mljar-supervised**

**mljar-supervised** is an Automated Machine Learning Python package that works with tabular data.  
It is designed to save time for data scientists. It abstracts the common way to preprocess the data,   
construct the machine learning models, and perform hyperparameter tuning to find the best model.

source : https://github.com/mljar/mljar-supervised

In [None]:
!pip install -q -U git+https://github.com/mljar/mljar-supervised.git@master

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Check the dataset!
</h1>
</div>
</div>

In [None]:
import pandas as pd

# Read the competition train/test CSVs into DataFrames.
pd_train = pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv")
pd_test = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")

# Report the number of rows in each split, then preview the training frame.
print("train data : ", pd_train.shape[0])
print("test data : ", pd_test.shape[0])
pd_train.head()

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Data Slicing
</h1>
</div>
</div>

**Slice the data into `x_train` (training features), `y_train` (target), and `x_test` (test features).**

In [None]:
# Split the frames by column position.
# Training features: every column except the first and the last.
# NOTE(review): presumably the first column is the row id and the last is the
# target — confirm against the CSV header, since this relies on column order.
x_train = pd_train.iloc[:,1:-1]
# Target: the last column of the training frame.
y_train = pd_train.iloc[:,-1]
# Test features: every column except the first.
x_test = pd_test.iloc[:,1:]

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
EDA using MLJAR AutoML
</h1>
</div>
</div>

In [None]:
from supervised.preprocessing.eda import EDA

# Run MLJAR's extensive EDA on the training features and target.
# NOTE(review): save_path='./' presumably makes the report/plots land in the
# current working directory — confirm against the MLJAR documentation.
EDA.extensive_eda(x_train, y_train, save_path = './')

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Filling NaN
</h1>
</div>
</div>

**We are going to fill the NaN values using scikit-learn's SimpleImputer.**

In [None]:
from sklearn.impute import SimpleImputer
import numpy as np

# Mean imputation: the imputer is fitted on the training features only and the
# same fitted imputer is then applied to both the training and test features.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
imp_mean.fit(x_train)
x_train, x_test = imp_mean.transform(x_train), imp_mean.transform(x_test)

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Principal Component Analysis : PCA
</h1>
</div>
</div>

<img src="https://miro.medium.com/max/2000/1*KdvxqXIOkb9JY_BeUWvpxg.jpeg" alt="" width = '1000'>

**We are going to use PCA to reduce the dimensionality of the data.**

**Purpose of Dimensionality Reduction**
- Data Compression
- Data Visualization
- Mitigating the Curse of Dimensionality
- Finding Important Features

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the training features, then project the test features with the
# same fitted transform.
# NOTE(review): PCA() is constructed without n_components; per the scikit-learn
# docs that default keeps all components, so this step appears to rotate the
# data rather than reduce its dimension — confirm whether a component count or
# explained-variance ratio was intended.
# NOTE(review): no feature scaling is visible before this step; PCA is
# sensitive to feature scale — confirm this is intentional.
pca = PCA()
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Training using MLJAR AutoML
</h1>
</div>
</div>

In [None]:
from supervised.automl import AutoML

# AutoML search settings, gathered in one place so they are easy to tweak.
# NOTE(review): total_time_limit is presumably expressed in seconds — confirm
# against the MLJAR documentation.
automl_settings = dict(
    mode="Compete",
    algorithms=["CatBoost", "Xgboost", "LightGBM", "Neural Network"],
    start_random_models=10,
    total_time_limit=1800,
    train_ensemble=True,
    eval_metric="auc",
)

# Build the AutoML object from the settings and fit it on the training data.
automl = AutoML(**automl_settings)
automl.fit(x_train, y_train)

In [None]:
# Display the report of the fitted AutoML run.
automl.report()

In [None]:
# Predict on the transformed test features. The result is kept as `pred`; its
# "prediction_1" column is what the submission cell reads.
pred = automl.predict_all(x_test)
# Preview the first rows of the prediction frame.
pred.head()

In [None]:
# Assemble the submission: the test ids paired with the "prediction_1" column
# of the prediction frame, stored under the "claim" column.
sub = pd.DataFrame({"id": pd_test["id"], "claim": pred["prediction_1"]})
# Write without the index so the file contains only the two columns above.
sub.to_csv("submission.csv", index=False)
# Call head() so the cell shows the first rows; a bare `sub.head` would only
# display the bound-method repr.
sub.head()