In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import random
import warnings
# Blanket-ignore all warnings issued from here on so they do not clutter the
# cell outputs. NOTE(review): this also hides deprecation notices.
warnings.filterwarnings("ignore")

from pandas_profiling import ProfileReport as profile

import pkg_resources as pkg
# Print the installed pandas_profiling version so the run records which
# release produced the profile reports.
print( f"pandas_profiling version: {pkg.get_distribution('pandas_profiling').version}")

import matplotlib.pyplot as plt        
import seaborn as sns  

### Utils

In [None]:
def seeding(SEED, use_tf=False):
    """Seed the random number generators used by the notebook.

    Seeds NumPy's legacy global generator and Python's ``random`` module,
    exports the hash seed and the cuDNN determinism switch as environment
    variables, and optionally seeds TensorFlow.

    Parameters
    ----------
    SEED : int
        Seed value applied to every generator.
    use_tf : bool, default False
        When True, TensorFlow is imported and seeded via
        ``tf.random.set_seed``.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # reaches processes spawned after this call.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # This variable is an on/off switch, not a seed: it must be '1' (or
    # 'true') to request deterministic cuDNN kernels.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    if use_tf:
        # Imported lazily so the helper stays usable without TensorFlow
        # installed, and so `tf` is always defined when it is needed.
        import tensorflow as tf
        tf.random.set_seed(SEED)
    print('seeding done!!!')

### Data load

In [None]:
# Notebook-wide switches: the seed passed to every RNG and the profiling flag.
RANDOM_SEED, PROFILE = 42, True

seeding(RANDOM_SEED)

# Read the three competition CSVs in order: train, test, sample submission.
train, test, submission = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-feb-2022/train.csv',
        '../input/tabular-playground-series-feb-2022/test.csv',
        '../input/tabular-playground-series-feb-2022/sample_submission.csv',
    ),
)

### Check for missing data

In [None]:
## Per-column missing-value summary for the training frame.
## 'Total' is the count of nulls; 'Percent' is nulls / rows, i.e. a fraction
## in [0, 1] rather than a value scaled to 100.
null_mask = train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(10)

### Target balance

In [None]:
# Bar chart of how many training rows fall into each target class.
f, ax = plt.subplots(figsize=(15, 6))
sns.countplot(x=train.target, ax=ax)
# The rotation has to be numeric: tick-label rotation is documented as a
# number or 'vertical'/'horizontal', and the string '90' raises a ValueError
# on recent Matplotlib releases. Applied after plotting so it targets the
# final tick labels.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('train.Target', fontsize=15)
plt.show()

### EDA with pandas-profiling

In [None]:
## Building the report is very slow on a dataset this size, so it is gated
## behind the PROFILE switch from the data-load cell: run it once, then set
## PROFILE = False to skip it on later runs.
## minimal=False is more informative (it adds the relationships between
## features) but is slower still.

if PROFILE:
    train_profile = profile(train, title="Train Data", minimal=True)
    display(train_profile)

In [None]:
## Training columns treated as skewed.
## A log transform (np.log) is a candidate for reducing the skew.
skewed_cols = [
    'A0T0G0C10', 'A0T0G1C9', 'A0T0G2C8', 'A0T0G8C2', 'A0T0G9C1', 'A0T0G10C0',
    'A0T1G1C8', 'A0T1G8C1', 'A0T1G9C0',
    'A0T2G0C8', 'A0T2G8C0',
    'A0T3G0C7', 'A0T3G7C0',
    'A0T10G0C0',
    'A1T0G0C9', 'A1T0G8C1', 'A1T0G9C0', 'A1T1G0C8', 'A1T1G8C0',
    'A2T0G0C8', 'A2T0G8C0', 'A2T1G0C7',
    'A3T0G0C7', 'A3T0G7C0',
    'A10T0G0C0',
]


In [None]:
## Training columns whose distinct-value figure is below 0.1
## (presumably the "Distinct (%)" value from the profile report - TODO confirm).
## These might be better treated as categories; the list still has to be
## compared against the test dataset profile.
categorical_cols = [
    'A0T0G9C1', 'A0T0G10C0',
    'A0T1G0C9', 'A0T1G9C0',
    'A0T2G0C8', 'A0T2G8C0',
    'A0T10G0C0',
    'A1T0G0C9', 'A1T0G9C0', 'A1T1G8C0',
    'A2T0G0C8', 'A2T0G8C0',
    'A10T0G0C0',
]

### Profile test dataset

In [None]:
# Profile the test set with the same settings as the training set. Gated
# behind the PROFILE switch from the data-load cell so the slow report can be
# skipped on re-runs.
if PROFILE:
    test_profile = profile(test, title="Test Data", minimal=True)
    display(test_profile)

In [None]:
# Test-set columns treated as skewed, for comparison with skewed_cols.
test_skewed_cols = [
    'A0T0G0C10', 'A0T0G1C9', 'A0T0G2C8', 'A0T0G8C2', 'A0T0G9C1', 'A0T0G10C0',
    'A0T1G1C8', 'A0T1G8C1', 'A0T1G9C0',
    'A0T2G0C8', 'A0T2G1C7', 'A0T2G8C0',
    'A0T3G0C7', 'A0T3G7C0',
    'A0T9G1C0',
    'A1T0G0C9', 'A1T0G8C1', 'A1T0G9C0', 'A1T1G0C8',
    'A2T0G0C8', 'A2T0G8C0',
    'A3T0G0C7', 'A3T0G7C0',
    'A9T0G0C1',
    'A10T0G0C0',
]

In [None]:
# Test-set columns that look categorical, for comparison with categorical_cols.
test_categorical_cols = [
    'A0T0G0C10', 'A0T0G1C9', 'A0T0G9C1', 'A0T0G10C0',
    'A0T1G0C9', 'A0T1G9C0',
    'A0T10G0C0',
    'A1T0G0C9', 'A1T0G9C0',
    'A10T0G0C0',
]


In [None]:
# Report columns flagged as skewed in only one of the two datasets:
# first those unique to train, then those unique to test.
for col in skewed_cols:
    if col in test_skewed_cols:
        continue
    print(f'SKEWED in train not in test: {col}')

for col in test_skewed_cols:
    if col in skewed_cols:
        continue
    print(f'SKEWED in test not in train: {col}')

In [None]:
# Report columns flagged as categorical in only one of the two datasets:
# first those unique to train, then those unique to test.
for col in categorical_cols:
    if col in test_categorical_cols:
        continue
    print(f'CAT in train not in test: {col}')

for col in test_categorical_cols:
    if col in categorical_cols:
        continue
    print(f'CAT in test not in train: {col}')