## [TPS-AUG] Exploratory data analysis

- Input - 100 features
- Input type - Continuous variables
- Output - 1 feature
- Output type - Continuous

- Type of problem - Regression

## Train data

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold
from sklearn import model_selection
import xgboost as xgb
import optuna
import tqdm
import warnings
import sklearn.exceptions
# Silence several warning categories for the whole kernel session so that
# plotting/model output is not flooded with library notices.
# NOTE(review): RuntimeWarning and UserWarning are suppressed too, so numeric
# problems (e.g. log of a non-positive value) will not be reported later on.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

In [None]:
# Notebook-wide matplotlib defaults: 200 dpi figures with the top and right
# axes spines hidden.
mpl.rcParams.update({
    'figure.dpi': 200,
    'axes.spines.top': False,
    'axes.spines.right': False,
})

In [None]:
# Read the three competition CSV files (train, test, sample submission) in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
)

In [None]:
# Report (rows, columns) for both data sets.
for split_name, frame in (('Train', train), ('Test', test)):
    print(f'{split_name} Shape :  {frame.shape}')

In [None]:
# Keep a separate handle on the regression target column.
target = train['loss']

# Remove the `id` column from both frames (presumably a pure row identifier).
# `errors='ignore'` plus rebinding (instead of `inplace=True`) makes the cell
# safe to re-run: a second execution no longer raises KeyError because `id`
# is already gone.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [None]:
# Show the first 15 rows of the training frame as the cell output.
train.head(15)

All features appear to be continuous.

In [None]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train.info()

# Target columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the target.
print('Target column basic statistics:')
train['loss'].describe()

In [None]:
# Frequency of each distinct target value, most frequent first.
train['loss'].value_counts()

In [None]:
# 25th, 50th, 75th and 90th percentiles of the target.
train['loss'].quantile([0.25, 0.5 , 0.75,0.90])

- Total number of unique values for `loss`: 43.
- The 90th percentile is 19.0; the distribution has a decaying shape.

In [None]:
# Bar chart of how often each `loss` value occurs in the training data.
fig, ax = plt.subplots(1, 1, figsize=(17, 8))

# Occurrences per distinct loss value, ordered by the loss value itself.
target_cnt = train['loss'].value_counts().sort_index()

ax.bar(target_cnt.index, target_cnt,
       width=0.55,
       edgecolor='black',
       linewidth=0.7)
ax.margins(0.02, 0.05)
ax.set_xlabel('loss')
ax.set_ylabel('count')

# Annotate the 30 smallest loss values with their share of rows (in percent).
# Iterating over the labels that are actually present avoids the KeyError that
# `target_cnt[i] for i in range(30)` raises when some value in 0..29 is missing.
n_rows = len(train)
for loss_value, count in target_cnt.head(30).items():
    ax.annotate(f'{count / n_rows * 100:.3}', xy=(loss_value, count + 1000),
                va='center', ha='center')
fig.tight_layout()

plt.show()


## Now, let's compare unique value counts in train and test

In [None]:
def _collect_unique_values(frame, columns):
    """Return the sorted distinct values found across `columns` of `frame`.

    The two leading zeros reproduce the seed array of the original loop, so
    0.0 is always part of the result.
    """
    per_column = [frame[column].unique() for column in columns]
    return np.unique(np.concatenate([np.zeros(2)] + per_column))


def _unique_count_frame(frame, columns):
    """Return a (Features, Count) frame with the number of distinct values per column."""
    return frame[columns].nunique().rename_axis('Features').reset_index(name='Count')


def _plot_unique_counts(ax, counts, title, subtitle, background_color, color=None):
    """Draw one horizontal bar panel of per-feature unique-value counts on `ax`.

    `color=None` lets seaborn use the active palette; an explicit colour is
    applied to every bar without touching the global palette.
    """
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)
    ax.set_facecolor(background_color)
    sns.barplot(ax=ax, y=counts['Features'], x=counts['Count'],
                zorder=2, linewidth=0, orient='h', saturation=1, alpha=1, color=color)
    ax.set_xlabel("Unique Values", fontsize=4, weight='bold')
    ax.set_ylabel("Features", fontsize=4, weight='bold')
    ax.tick_params(labelsize=4, width=0.5, length=1.5)
    ax.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.text(0, -1.9, title, fontsize=6, ha='left', va='top', weight='bold')
    ax.text(0, -1.2, subtitle, fontsize=4, ha='left', va='top')
    # Data label: the bar length, printed just right of each bar.
    for patch in ax.patches:
        label_x = patch.get_x() + patch.get_width() + 7
        label_y = patch.get_y() + patch.get_height() / 2
        ax.text(label_x, label_y, f'{patch.get_width():.0f}',
                ha='center', va='center', fontsize=4,
                bbox=dict(facecolor='none', edgecolor='black', boxstyle='round', linewidth=0.3))


features = [feature for feature in train.columns if feature not in ['id', 'loss']]

# Pool of distinct values seen in any feature column, per data set.
unique_values_train = _collect_unique_values(train, features)
unique_values_test = _collect_unique_values(test, features)

# Number of distinct values per feature, per data set.
unique_value_feature_train = _unique_count_frame(train, features)
unique_value_feature_test = _unique_count_frame(test, features)

background_color = "#f6f5f5"
panel_subtitle = 'feature_1, feature55 , feature 86 have less unique values in test set'

# The high resolution is requested for this figure only (the labels use a 4pt
# font); it is no longer written to plt.rcParams, where it would apply to every
# figure created afterwards.
fig = plt.figure(figsize=(4, 12), dpi=600, facecolor=background_color)
gs = fig.add_gridspec(1, 2)
gs.update(wspace=0.4, hspace=0.1)

ax0 = fig.add_subplot(gs[0, 0])
_plot_unique_counts(ax0, unique_value_feature_train, 'Unique Values - Train Dataset',
                    panel_subtitle, background_color)

# The red bars are requested via `color=` instead of sns.set_palette, which
# would replace the global palette with a single colour for all later plots.
ax1 = fig.add_subplot(gs[0, 1])
_plot_unique_counts(ax1, unique_value_feature_test, 'Unique Values - Test Dataset',
                    panel_subtitle, background_color, color='#ff322d')

plt.show()

## Scaling & Visualization

Taken from amazing notebook by @subinium - [link](https://www.kaggle.com/subinium/tps-aug-simple-eda)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the 100 feature columns f0..f99.
# The scaler is fitted on the training data only and then applied to the test
# data with the same parameters.
# NOTE: this overwrites the feature columns of `train` and `test` in place, so
# every later cell sees the standardized values until the data is reloaded.
ss = StandardScaler()
features = [f'f{i}' for i in range(100)]
train[features] = ss.fit_transform(train[features])
test[features] = ss.transform(test[features])

In [None]:
# Density of every feature f0..f99, train and test overlaid in a 10x10 grid.
# Explicit colours keep the two curves distinguishable regardless of the
# seaborn palette that happens to be active when this cell runs.
kde_sources = (('train', train, '#1f77b4'), ('test', test, '#ff7f0e'))

fig, axes = plt.subplots(10, 10, figsize=(12, 12))
axes = axes.flatten()

for idx, ax in enumerate(axes):
    for _, frame, curve_color in kde_sources:
        sns.kdeplot(data=frame, x=f'f{idx}',
                    fill=True,
                    color=curve_color,
                    ax=ax)
    # Ticks and labels are removed: only the shape of each density matters here.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(f'f{idx}', loc='right', weight='bold', fontsize=10)

# One shared legend for the whole grid instead of one per subplot.
fig.legend(handles=[mpl.patches.Patch(color=curve_color, label=name)
                    for name, _, curve_color in kde_sources],
           loc='upper center', ncol=2, frameon=False)

# The previous label ("Average by class (by feature)") described a different
# plot; this figure compares train and test feature distributions.
fig.supxlabel('Feature distributions: train vs test', ha='center', fontweight='bold')

# Reserve a thin strip at the top of the figure for the legend.
fig.tight_layout(rect=(0, 0, 1, 0.97))
plt.show()

# Feature observation

- Features with an exponential-decay shape: f4, f6, f15, f22, f28, f35, f44, f49, f52, f56, f59, f91, f89, f75
     - Log scaling can be used
- Features with a bell-shaped curve: f29, f23, f31, f36, f68, f61, f97
- Features with two peaks: f27, f32, f94, f51
- Features with multiple peaks: f84, f32, f50

# Feature correlation in Test and Train data

In [None]:
def _plot_feature_corr(corr, title, subtitle, background_color):
    """Draw the lower triangle of a correlation matrix on its own figure.

    Returns (fig, ax, mask), where `mask` is the boolean upper-triangle mask
    (diagonal included) that was hidden in the heatmap.
    """
    fig, ax = plt.subplots(figsize=(24, 15), facecolor=background_color)
    ax.set_facecolor(background_color)
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr,
                square=True,
                linewidth=0.2,
                cbar=False,
                mask=mask,
                annot=False,
                center=0,
                cmap=sns.diverging_palette(240, 10),
                ax=ax)
    # Title and subtitle are placed in data coordinates just above the matrix.
    ax.text(-0.2, -1, title, fontsize=12, fontweight='bold', fontfamily='serif')
    ax.text(0, -0.4, subtitle, fontsize=8, fontweight='light', fontfamily='serif')
    return fig, ax, mask


background_color = "#f6f5f5"

# Each heatmap gets its own figure and axes, passed explicitly to seaborn, so
# nothing depends on the current-axes state or on objects created by other
# cells, and each title lands on the plot it describes.
train_corr = train.drop(columns = 'loss').corr()
fig, ax0, train_mask = _plot_feature_corr(
    train_corr,
    'Features Correlation on Train Dataset',
    'Highest correlation in the dataset is 0.6',
    background_color)

test_corr = test.corr()
fig, ax1, test_mask = _plot_feature_corr(
    test_corr,
    'Features Correlation on Test Dataset',
    'Features in test dataset resemble features in train dataset ',
    background_color)

plt.show()


# Looking at features that are most correlated in Train data

In [None]:
# Train correlation matrix with every entry that is not above 0.01 blanked out
# (set to NaN); the upper triangle is hidden through `train_mask`.
fig = plt.figure(figsize=(16, 16), facecolor='#f6f5f5')

train_corr1 = train_corr.where(train_corr > 0.01)
sns.heatmap(
    train_corr1,
    square=True,
    mask=train_mask,
    annot=False,
    cmap=sns.diverging_palette(240, 10),
)

In [None]:
# Pairwise correlation of all training columns except the target `loss`.
train_corr = train.drop(columns = 'loss').corr()

# Let's compare the correlation matrices of train and test

In [None]:
# Element-wise comparison of the two correlation matrices. DataFrame.compare
# labels the caller's values `self` (train here) and the argument's `other` (test).
cols = train_corr.compare(test_corr)

In [None]:
# `cols` holds two adjacent columns per compared feature (`self`, `other`), so
# a block of four consecutive columns covers two features. Alternating the
# colour map every two columns makes neighbouring feature pairs easy to tell
# apart in the styled table.
#
# The slices run over ALL columns of `cols`. The previous `range(…, 100, 4)`
# loops stopped at column index 100, which left the second half of a
# 200-column comparison unstyled and raised IndexError when fewer than 100
# columns were present.
col_list1 = list(cols.columns[0::4]) + list(cols.columns[1::4])
col_list2 = list(cols.columns[2::4]) + list(cols.columns[3::4])

# Reuse the comparison frame that is already in `cols` instead of recomputing it.
cols.style.background_gradient(subset=col_list1, cmap='coolwarm')\
          .background_gradient(subset=col_list2, cmap='summer')

# Comparing all features in test and train - least variance

Let's look at the correlation matrices of the test and train data, and then find the areas where they differ the most and the least.

This might be useful for choosing features whose behaviour varies less between the train and test data.

In [None]:
# Comparing all features in test and train
# NOTE: this cell is inert. Both triple-quoted blocks below are bare string
# expressions used to disable code, so nothing inside them is executed; in
# particular `new_cols` is NOT assigned by this cell.
# First disabled block: bar-styled view of the train/test correlation comparison.
'''train_corr.compare(test_corr).style.bar( subset= col_list1, color=['red'])\
                                        .bar( subset= col_list2, color=['yellow'])'''
# Second disabled block: builds `new_cols` as the difference between adjacent
# column pairs of `cols` and styles it with bars.
'''
cols
new_cols = pd.DataFrame()
col_list1 = []
for x in range(0,100,4):
    new_cols[str(cols.columns[x])[:-9]] =  cols[cols.columns[x]] -  cols[cols.columns[x+1]] 
    x = x+2
    new_cols[str(cols.columns[x])[:-9]] =  cols[cols.columns[x]] -  cols[cols.columns[x+1]] 

#new_cols = new_cols[new_cols > -0.01]
new_cols.style.bar(  color=['yellow', 'red'])'''

In [None]:
# Difference between the train (`self`) and test (`other`) correlation of every
# feature pair that DataFrame.compare reported as different. It is computed
# here because `new_cols` is not assigned by any executed code before this
# cell, so the cell used to fail with NameError on a fresh kernel.
new_cols = cols.xs('self', axis=1, level=1) - cols.xs('other', axis=1, level=1)

# Keep only differences below 0.0001; everything else becomes NaN and is left
# blank in the heatmap.
# NOTE(review): the filter is signed, so large negative differences are kept
# as well - use new_cols.abs() if "smallest absolute difference" was intended.
new_cols = new_cols.where(new_cols < 0.0001)

fig, ax = plt.subplots(figsize=(16, 16), facecolor='#f6f5f5')
sns.heatmap(new_cols, ax=ax)

# Heatmap of the comparison

In [None]:
# Heatmap of the raw train-vs-test correlation comparison table.
corr_comparison = train_corr.compare(test_corr)
sns.heatmap(corr_comparison)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Reload the raw (unscaled) data; the training frame loses its `id` column
# right away, while the test frame keeps it.
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv').drop(columns='id')
test = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv')
sample_submission = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# Separate the target column from the predictors.
pred_col = 'loss'
x_train = train.drop(columns=pred_col)
y_train = train[pred_col]

ss = StandardScaler()
mm = MinMaxScaler()

# Min-max scale the 100 feature columns: fitted on train, applied to test.
features = ['f' + str(i) for i in range(100)]
mm_x_train = mm.fit_transform(x_train[features])
mm_test = mm.transform(test[features])

In [None]:
# Wrap the min-max scaled values in a DataFrame with one flat column label per
# feature. `columns=[features]` (a list inside a list) produced MultiIndex
# columns, so correlation tables showed tuple labels such as ('f27',).
mm_x_train = pd.DataFrame(mm_x_train, columns=features)

In [None]:
# Features whose distribution looks like an exponential decay.
log_features = ['f4' , 'f6' , 'f15' , 'f22' , 'f28' , 'f35' , 'f44']

print(x_train[log_features].describe())

# Histograms of the raw values, one subplot per feature in a 4x4 grid.
# The caption is a figure-level title: `plt.xlabel` after the loop only
# replaced the x-label of the last subplot.
fig = plt.figure(figsize = (16,14))
for position, z in enumerate(log_features, start=1):
    ax = fig.add_subplot(4, 4, position)
    sns.histplot(x_train[z], ax=ax)
fig.suptitle('Before log scaling', ha='center', fontweight='bold')

# Same features after a natural-log transform. The logarithm is undefined for
# values <= 0, so those are turned into NaN explicitly (and therefore left out
# of the histogram) instead of relying on silenced RuntimeWarnings.
fig = plt.figure(figsize = (16,14))
for position, z in enumerate(log_features, start=1):
    ax = fig.add_subplot(4, 4, position)
    positive_values = x_train[z].where(x_train[z] > 0)
    sns.histplot(np.log(positive_values), ax=ax)
fig.suptitle('After log scaling', ha='center', fontweight='bold')

These features also contain a few negative values, for which the logarithm is undefined, so a plain log transform would turn those values into NaN.

In [None]:
# Annotated correlation matrix of the decay-shaped features together with the target.
log_corr = train[log_features + ['loss']].corr()
sns.heatmap(log_corr, annot=True, cmap=sns.diverging_palette(240, 10))

In [None]:
# Features whose distribution looks bell shaped.
bell_features = ['f29' , 'f23' , 'f31' , 'f36']

# One histogram per feature in a 4x4 grid. The caption is a figure-level title:
# `plt.xlabel` after the loop only replaced the x-label of the last subplot.
fig = plt.figure(figsize = (16,14))
for position, z in enumerate(bell_features, start=1):
    ax = fig.add_subplot(4, 4, position)
    sns.histplot(x_train[z], ax=ax)
fig.suptitle('Bell shaped', ha='center', fontweight='bold')
 

In [None]:
# Annotated correlation matrix of the bell-shaped features together with the target.
bell_corr = train[bell_features + ['loss']].corr()
sns.heatmap(bell_corr, annot=True, cmap=sns.diverging_palette(240, 10))

In [None]:
# Features whose distribution shows two peaks.
two_peak_features = ['f27' , 'f32' , 'f94' , 'f51']

# One histogram per feature in a 4x4 grid. The caption is a figure-level title:
# `plt.xlabel` after the loop only replaced the x-label of the last subplot.
fig = plt.figure(figsize = (16,14))
for position, z in enumerate(two_peak_features, start=1):
    ax = fig.add_subplot(4, 4, position)
    sns.histplot(x_train[z], ax=ax)
fig.suptitle('two_peak_features', ha='center', fontweight='bold')

In [None]:
# Annotated correlation matrix of the two-peak features, computed on the
# min-max scaled training data (the target is not included here).
two_peak_corr = mm_x_train[two_peak_features].corr()
sns.heatmap(two_peak_corr, annot=True, cmap=sns.diverging_palette(240, 10))

# Converting two peaked features into categorical data

In [None]:
# Add `cat_f27` as a plain copy of `f27`; no binning is applied on this line.
x_train['cat_f27'] = x_train['f27']

In [None]:
# Preview only: bin `f27` into the single interval (20000, 30000] labelled
# 'cat2'; values outside that interval become NaN. The result is displayed but
# not stored, so `x_train` is left unchanged by this cell.
pd.cut(x_train['f27'] , bins = [20000 , 30000] , labels = ['cat2']) #.describe()

# Baseline model - Xgboost + Optuna

# Transforming features - MinMaxScaler

In [None]:
#train , train_extra = train_test_split(train , test_size = 0.9)

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# `ss` is created but not used in this cell.
ss = StandardScaler()
mm = MinMaxScaler()

# Min-max scale the 100 feature columns: fitted on the training predictors and
# applied to the test data. Selecting `features` also leaves out any column
# that is not one of f0..f99.
# NOTE: `x_train` and `test` are rebound to the scaler output (presumably NumPy
# arrays, sklearn's default), so this cell cannot be re-run without first
# re-creating the original DataFrames.
features = [f'f{i}' for i in range(100)]
x_train = mm.fit_transform(x_train[features])
test = mm.transform(test[features])


In [None]:
import xgboost as xgb
from xgboost import XGBRegressor ,XGBClassifier

# Fixed hyper-parameters for the baseline regressor.
# NOTE(review): 'gpu_hist' presumably requires a GPU-enabled XGBoost runtime -
# confirm the environment before running.
xgb_params = {'n_estimators': 1800,
 'subsample': 0.8,
 'colsample_bytree': 0.9,
 'eta': 0.008353853073431708,
 'reg_alpha': 24,
 'reg_lambda': 74,
 'max_depth': 11,
 'min_child_weight': 9,
 'tree_method': 'gpu_hist',
 'random_state': 42}

model_xgboost = XGBRegressor( **xgb_params) # define

# Show the training objective and the full model configuration.
print(model_xgboost.objective)
print(model_xgboost)

# Fit on the complete training set; no validation split is held out here.
model_xgboost.fit(x_train, y_train) #fit

preds = model_xgboost.predict(test) #predict 
print(preds.shape)
# Bare expression in the middle of a cell: it produces no output.
preds

# Write the predictions into the sample-submission layout and save them.
df_sub = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')
df_sub['loss'] = preds
df_sub.to_csv('submission.csv' , index = False)
df_sub.head(10)

# Feature importance

In [None]:
# Importance score of every input column, in the column order used for fitting.
print(model_xgboost.feature_importances_)

In [None]:
# Bar chart of the fitted model's feature importances, one bar per input column.
importances = model_xgboost.feature_importances_
fig, ax = plt.subplots(figsize=(15, 3))
ax.bar(range(len(importances)), importances)
plt.show()

# Work in progress

# If you find this helpful, please upvote