In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction
When you see a dataset for the first time, you want to discover as many relations in it as possible. You can do everything by hand, but I want to show some libraries that can make your life a bit easier.

The full dataset can be found here: https://www.kaggle.com/c/ventilator-pressure-prediction/data

# A bit of Feature Engineering

In [None]:
def add_features(df):
    """Return a new frame with engineered features appended to ``df``.

    The input must provide the columns ``breath_id``, ``time_step``,
    ``u_in``, ``u_out``, ``R`` and ``C``. The caller's frame is not
    modified: all work happens on a copy.

    Added columns, in order:

    * ``area`` / ``u_in_cumsum`` - per-breath cumulative sums of
      ``time_step * u_in`` and of ``u_in``.
    * ``u_in_lag``, ``u_in_lag2``, ``u_in_lag4``, ``u_out_lag``,
      ``time_lag``, ``breath_id_lag``, ``breath_id_lag2`` - row shifts.
    * ``breath_id_lagsame`` / ``breath_id_lag2same`` - 1 when the row one
      (resp. two) positions earlier has the same ``breath_id``, else 0.
    * ``cross``, ``cross2``, ``RC_sum``, ``RC_div`` - element-wise products,
      sum and ratio of existing columns.
    * ``ewm_u_in_*``, ``rolling_10_*``, ``expand_*`` - per-breath
      exponentially weighted, rolling (window 10) and expanding statistics
      of ``u_in``.
    * ``one``, ``count``, ``u_in_cummean`` - constant 1, running row number
      within the breath, and running mean of ``u_in`` within the breath.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns listed above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the feature columns appended and every NaN
        replaced by 0.
    """

    def _drop_group_level(grouped_result):
        # Grouped window results are indexed by (breath_id, original index);
        # dropping level 0 restores the original index so assignment aligns.
        return grouped_result.reset_index(level=0, drop=True)

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Grouped view of u_in, built once and reused by every per-breath statistic.
    u_in_by_breath = df.groupby('breath_id')['u_in']

    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['u_in_cumsum'] = u_in_by_breath.cumsum()

    # The shifts below run over the whole frame, not per breath, so the first
    # rows of a breath receive values from the preceding rows of the frame.
    # The breath_id_lag*same flags mark whether the shifted row has the same
    # breath_id as the current one.
    df['u_in_lag'] = df['u_in'].shift(1).fillna(0)
    df['u_in_lag2'] = df['u_in'].shift(2).fillna(0)
    df['u_in_lag4'] = df['u_in'].shift(4).fillna(0)
    df['u_out_lag'] = df['u_out'].shift(1).fillna(0)
    # NOTE(review): 'time_lag' uses a shift of 2, unlike the shift of 1 used
    # for 'u_in_lag' and 'u_out_lag' - confirm this is intended.
    df['time_lag'] = df['time_step'].shift(2).fillna(0)
    df['breath_id_lag'] = df['breath_id'].shift(1).fillna(0)
    df['breath_id_lag2'] = df['breath_id'].shift(2).fillna(0)
    df['breath_id_lagsame'] = (df['breath_id_lag'] == df['breath_id']).astype(int)
    df['breath_id_lag2same'] = (df['breath_id_lag2'] == df['breath_id']).astype(int)

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']
    df['RC_sum'] = df['R'] + df['C']
    df['RC_div'] = df['R'] / df['C']

    # Exponentially weighted statistics of u_in within each breath.
    df['ewm_u_in_mean'] = _drop_group_level(u_in_by_breath.ewm(halflife=10).mean())
    df['ewm_u_in_std'] = _drop_group_level(u_in_by_breath.ewm(halflife=10).std())
    df['ewm_u_in_corr'] = _drop_group_level(u_in_by_breath.ewm(halflife=10).corr())

    # Rolling statistics over up to the last 10 rows of the breath.
    df['rolling_10_mean'] = _drop_group_level(
        u_in_by_breath.rolling(window=10, min_periods=1).mean())
    df['rolling_10_max'] = _drop_group_level(
        u_in_by_breath.rolling(window=10, min_periods=1).max())
    df['rolling_10_std'] = _drop_group_level(
        u_in_by_breath.rolling(window=10, min_periods=1).std())

    # Expanding statistics; expanding(2) requires at least two observations,
    # so the first row of each breath is NaN until the final fillna.
    df['expand_mean'] = _drop_group_level(u_in_by_breath.expanding(2).mean())
    df['expand_max'] = _drop_group_level(u_in_by_breath.expanding(2).max())
    df['expand_std'] = _drop_group_level(u_in_by_breath.expanding(2).std())

    # 'one' is a helper column whose per-breath cumulative sum gives the
    # running row number; it stays in the output so the column set is unchanged.
    df['one'] = 1
    df['count'] = df['one'].groupby(df['breath_id']).cumsum()
    df['u_in_cummean'] = df['u_in_cumsum'] / df['count']

    return df.fillna(0)

# Read the competition training set and attach the engineered features to it.
df = add_features(
    pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
)

# First Auto-EDA way (Pandas Profiling)
***Disadvantages:***
* Needs a lot of RAM (for example, this dataset plus the new features is too large for the memory available here)
* Not optimized for big datasets

***Advantages:***
* Very detailed
* Different correlations

In [None]:
# 1 Auto-EDA
#
# Disabled because the dataset is too large for this method.
# The steps are kept as line comments: a bare triple-quoted string at the end
# of a cell is an expression, so it would be echoed as the cell's output.
# Uncomment the lines below to run the report on a frame that fits in memory.
#
# !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
# import pandas_profiling
# # Generating PandasProfiling Report
# report = pandas_profiling.ProfileReport(df)
# report

# Second Auto-EDA way (SweetViz)
***Disadvantages:***
* Not a lot of info

***Advantages:***
* .html output with interactive buttons (you can click on any feature for more detailed info and change the number of bins for the plots)

In [None]:
# 2 Auto-EDA
!pip install sweetviz
import sweetviz as sv
#Generating Sweetviz report
report = sv.analyze(df)
report.show_html("iris_EDA_report.html") # specify a name for the report

# Third Auto-EDA way (AutoViz)
***Disadvantages:***
* A lot of graphs in output
* Very hard to save (you need to "Save Version" with output and then save the .png output)
* Only plots

***Advantages:***
* Large amount of different plots, you can find a lot of relations from here
* Very fast!

In [None]:
# 3 Auto-EDA
!pip install autoviz
!pip install xlrd
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
filename = "" # empty string ("") as filename since no file is being used for the data
sep = ","
dft = AV.AutoViz(
    '',
    sep=",",
    depVar="",
    dfte=df,
    header=0,
    verbose=0,
    lowess=False,
    chart_format="svg",
    max_rows_analyzed=150000,
    max_cols_analyzed=30,
     )

# Feature Importance
In this example I will use XGBoost with GPU.

*You can (and should) try different models and different importance scores.*

In [None]:
from xgboost import XGBRegressor
from xgboost import plot_importance

# Select the target and features without mutating df: df.pop('pressure')
# removes the column in place, so running this cell a second time would
# raise KeyError.
y = df['pressure']
X = df.drop(columns='pressure')

# NOTE(review): XGBoost 2.0+ deprecates tree_method='gpu_hist' in favour of
# tree_method='hist' with device='cuda' - confirm against the installed version.
model = XGBRegressor(tree_method='gpu_hist')
model.fit(X, y)

# Show the ten features with the highest importance.
plot_importance(model, max_num_features=10)
plt.show()

# Conclusion
As a student, I lean more toward the traditional way of doing EDA, where you do everything by hand.
But sometimes such libraries can be very useful.
For example, I can run the cell with the third Auto-EDA, browse the different plots over some tea looking for relations and interesting things, and then check them by hand for more details.

I hope it was useful for new users :)