In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt

## Reading and Understanding Data

In [None]:
# Load the competition table; the trailing expression displays (rows, columns).
data = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2022/data.csv'
)
data.shape

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the freshly loaded data.
data.describe()

## Missing Values


In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Summarise missingness per column, largest first.
# 'Total' is the number of missing cells; 'Percent' is the missing share
# expressed as a fraction in [0, 1] (it is not multiplied by 100).
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
# Show only the columns that actually contain missing values.
print(missing_data.loc[missing_data['Percent'] > 0])

## Unique Values in Columns

In [None]:
# Print the number of distinct values in every column except the first one.
# NaN is counted as a distinct value (dropna=False).
for col in data.columns[1:]:
    n = data[col].nunique(dropna=False)
    print(f"Column -  {col} Unique Values {n}")

Columns beginning with `F_2` have fewer than 20 unique values. These are the categorical columns in the dataset, so we convert them to object type.

In [None]:
# Mark every column whose name starts with 'F_2' as categorical by casting it
# to object dtype.
for feature in data.columns[data.columns.str.startswith('F_2')]:
    data[feature] = data[feature].astype('object')

In [None]:
# Apply seaborn's "whitegrid" style to the figures created after this point.
sns.set_style("whitegrid")

In [None]:
# Count plot of every 'F_2*' (categorical) feature, one panel each on a 5x5 grid.
# The Axes returned by plt.subplots are used explicitly (ax=...) instead of
# re-selecting panels through pyplot's implicit "current axes" state.
f2_features = [name for name in data.columns if name.startswith('F_2')]
fig, axes = plt.subplots(5, 5, figsize=(24, 16))
if len(f2_features) > axes.size:
    raise ValueError(
        f"{len(f2_features)} features do not fit on a grid of {axes.size} panels"
    )
for ax, feature in zip(axes.flat, f2_features):
    sns.countplot(data=data, x=feature, ax=ax)
    ax.set_facecolor('white')
fig.suptitle('Categorical Features Count Plot', fontsize=20)
plt.show()

In [None]:
# Kernel density estimate of every 'F_1*' feature, one panel each on a 3x5 grid.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot, and
# each plot is drawn on an explicit Axes rather than pyplot's current axes.
f1_features = [name for name in data.columns if name.startswith('F_1')]
fig, axes = plt.subplots(3, 5, figsize=(24, 16))
if len(f1_features) > axes.size:
    raise ValueError(
        f"{len(f1_features)} features do not fit on a grid of {axes.size} panels"
    )
for ax, feature in zip(axes.flat, f1_features):
    sns.kdeplot(data=data, x=feature, fill=True, ax=ax)
    ax.set_facecolor('white')
fig.suptitle('F_1 Features KDE Plot', fontsize=20)
plt.show()

In [None]:
# Kernel density estimate of every 'F_3*' feature, one panel each on a 5x5 grid.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot, and
# each plot is drawn on an explicit Axes rather than pyplot's current axes.
f3_features = [name for name in data.columns if name.startswith('F_3')]
fig, axes = plt.subplots(5, 5, figsize=(24, 16))
if len(f3_features) > axes.size:
    raise ValueError(
        f"{len(f3_features)} features do not fit on a grid of {axes.size} panels"
    )
for ax, feature in zip(axes.flat, f3_features):
    sns.kdeplot(data=data, x=feature, fill=True, ax=ax)
    ax.set_facecolor('white')
fig.suptitle('F_3 Features KDE Plot', fontsize=20)
plt.show()

In [None]:
# Kernel density estimate of every 'F_4*' feature, one panel each on a 3x5 grid.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot, and
# each plot is drawn on an explicit Axes rather than pyplot's current axes.
f4_features = [name for name in data.columns if name.startswith('F_4')]
fig, axes = plt.subplots(3, 5, figsize=(24, 16))
if len(f4_features) > axes.size:
    raise ValueError(
        f"{len(f4_features)} features do not fit on a grid of {axes.size} panels"
    )
for ax, feature in zip(axes.flat, f4_features):
    sns.kdeplot(data=data, x=feature, fill=True, ax=ax)
    ax.set_facecolor('white')
fig.suptitle('F_4 Features KDE Plot', fontsize=20)
plt.show()

In [None]:
# Descriptive statistics again. The 'F_2*' columns were cast to object dtype
# earlier in the notebook, so the default describe() on this mixed-dtype frame
# summarises only the remaining numeric columns.
data.describe()

## Imputing Missing Values

In [None]:
from tqdm.notebook import tqdm


import xgboost


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer



In [None]:
# Gradient-boosted regressor that IterativeImputer fits, feature by feature,
# to predict missing entries from the other features.
# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build and a
# GPU runtime. XGBoost 2.0+ deprecates it in favour of tree_method='hist' with
# device='cuda'. Left unchanged because the installed version is not visible
# from this notebook -- confirm before upgrading.
xgb = xgboost.XGBRegressor(
    n_estimators=350,
    random_state=100,
    tree_method='gpu_hist',
)

# Round-robin imputer: starts from the column means, then re-estimates each
# feature with missing values for up to `max_iter` rounds, visiting features
# from the fewest missing values to the most ('ascending').
imputer = IterativeImputer(
    estimator=xgb,
    missing_values=np.nan,
    max_iter=10,
    initial_strategy='mean',
    imputation_order='ascending',
    verbose=2,
    random_state=100,
)

# Impute only the real feature columns (names starting with 'F_'). Passing the
# whole frame would also feed the leading non-feature column -- the one the
# unique-value cell skips with data.columns[1:] -- to the regressor as a
# predictor. The explicit float cast undoes the earlier object cast of 'F_2*'
# so the estimator receives a purely numeric matrix.
feature_cols = [name for name in data.columns if name.startswith('F_')]
data[feature_cols] = imputer.fit_transform(data[feature_cols].astype('float64'))

In [None]:
# Build the submission file: every index label of the sample submission has the
# form '<row>-<column name>' and its 'value' is the imputed cell data.loc[row, column].
# Same absolute input location as the data.csv read earlier in the notebook.
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2022/sample_submission.csv',
    index_col='row-col',
)

# Split all labels at once instead of looping over them in Python.
keys = submission.index.to_series().str.split('-', n=1, expand=True)
row_labels = keys[0].astype(int)
col_labels = keys[1]

# Translate row labels to positions; -1 marks a label missing from `data`.
row_pos = data.index.get_indexer(row_labels)
if (row_pos < 0).any():
    raise KeyError("sample_submission references rows that are not present in data")

# Pull only the referenced columns into one float matrix (a missing column name
# raises KeyError here) and pick every requested cell with a single fancy-index lookup.
used_cols = pd.Index(col_labels.unique())
cell_values = data.loc[:, used_cols].to_numpy(dtype=float)
col_pos = used_cols.get_indexer(col_labels)
submission['value'] = cell_values[row_pos, col_pos]

submission.to_csv("submission.csv")
