# What I found about the data in this notebook:
1. Data doesn't have any missing values
2. The features are not correlated with each other or with the **target**

## What this notebook covers:
1. EDA
2. Size reduction of dataframe
3. Tried LightAutoML

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Silence every warning so library chatter does not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Lift pandas' display limits: show all rows and all columns of a DataFrame.
for _option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option_name, None)

In [None]:
# Single place to edit if the competition data is mounted somewhere else.
DATA_DIR = '/kaggle/input/tabular-playground-series-nov-2021'

test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sub_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
df = pd.read_csv(f'{DATA_DIR}/train.csv')

# Fixed seed so the preview shows the same five rows on every run.
display(df.sample(5, random_state=42))
print('shape of training dataset: ',df.shape)
print('shape of test dataset:     ',test_df.shape)
print('shape of sub sample:       ',sub_df.shape)

## Let's check the **distribution of the target** feature

In [None]:
# Histogram of the target column.
df['target'].hist();

The target is almost equally distributed.

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

## Observation
1. All features have a negative minimum value, except target and id.
2. The mean is mostly in the single digits for all features, with 2 exceptions (f2: 306 & f35: 55).
3. The standard deviation is mostly a single digit or less, except for f2 and f35.

In [None]:
# Print the frame's column dtypes, non-null counts and memory footprint.
df.info()

In [None]:
# Heatmap of the boolean missing-value mask of df.
sns.heatmap(df.isna(), cmap='Blues')

We don't have any missing data.

## Reducing memory usage on disk/RAM

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified IN PLACE and the same object is returned.

    Per-column rules:
      * object / pandas string columns -> ``category``
      * signed integers   -> smallest of int8/int16/int32/int64 that holds
        the column's min and max (bounds inclusive)
      * unsigned integers -> smallest of uint8/uint16/uint32/uint64
      * floats            -> float16 (if allowed), else float32, else float64
      * anything else (bool, datetime, timedelta, categorical, nullable
        extension dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are overwritten.
    use_float16 : bool, default True
        When True (the historical behaviour), float columns whose range fits
        are stored as float16. float16 keeps only about 3 significant decimal
        digits, so pass False to use float32 as the smallest float type when
        precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Bools,
        # datetimes, categoricals and nullable extension dtypes have no
        # meaningful numeric range check here, so they are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Falls back to float64 when the range does not fit (or when
            # min/max are NaN/inf, for which every comparison is False).
            new_type = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    new_type = candidate
                    break
            df[col] = df[col].astype(new_type)
            continue

        # Integer column: an empty column has NaN min/max -> nothing to do.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        # Compare as Python ints so the check never overflows a numpy scalar.
        lo, hi = int(c_min), int(c_max)
        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= lo and hi <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Shrink both frames: the training frame is processed first, then the test frame.
df, test_df = [reduce_mem_usage(frame) for frame in (df, test_df)]

# Visualizing the data

In [None]:
%%time
# Draw a histogram for every column of df on one 22x28 figure;
# the cell magic above reports how long the plotting takes.
df.hist(figsize=(22,28))
plt.show()

In [None]:
%%time
# Pairwise correlation matrix of df's columns (timed by the cell magic above).
df.corr()

### Data is not correlated
The features aren't correlated with the target or with each other at all.

# Modeling with AutoML

Model credit goes to [this notebook](https://www.kaggle.com/alexryzhkov/lightautoml-november-21).

In [None]:
!pip install -U lightautoml

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [None]:
%%time
# Train a LightAutoML preset on the training frame and predict the test frame.
# NOTE(review): 'binary' presumably selects a binary-classification task — confirm against the LightAutoML docs.
task = Task('binary')
# timeout is 8 * 3600 (presumably seconds, i.e. 8 hours) with cpu_limit 4.
# use_algos [['cb']] presumably restricts the preset to CatBoost, and
# selection mode 0 presumably turns feature selection off — TODO confirm.
automl = TabularAutoML(task = task, timeout = 8 * 3600, cpu_limit = 4, 
                       general_params = {'use_algos': [['cb']]}, 
                       selection_params = {'mode': 0})
# Fit with 'target' as the label role and 'id' in the drop role. The result
# (presumably out-of-fold predictions) is kept in oof_pred.
oof_pred = automl.fit_predict(df, roles = {'target': 'target', 'drop': ['id']}, verbose = 2)
# Store the first column of the test predictions as the submission's target.
sub_df['target'] = automl.predict(test_df).data[:, 0]

In [None]:
# Write the submission frame to CSV without the index column.
sub_df.to_csv('sub_light_AutoML.csv', index = False)