In [None]:
import pandas as pd 
import numpy as np


## This notebook analyzes the (dis)similarity between train.csv and test.csv

#### Checking for Original Dataset...

![img](https://cdn-images-1.medium.com/max/800/1*BY-0dhr8UTgJRsmdt3bpgQ.png)
![img2](https://miro.medium.com/max/847/1*L8Ua86qfwVRJAnaH_KZdFQ.png)

In [None]:
# Read the raw competition tables (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2021/train.csv',
        '../input/tabular-playground-series-may-2021/test.csv',
    )
)

In [None]:
# Tag every row with its origin: 0 for test rows, 1 for train rows.
# This flag is the label the train-vs-test classifier will try to predict.
for frame, origin_flag in ((test, 0), (train, 1)):
    frame['is_train'] = origin_flag

In [None]:
# Remove the row identifier so it is not used as a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after 'id' is already gone is a no-op rather
# than a KeyError.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [None]:
# Stack train on top of test, then discard 'target', which the test set lacks.
df_combine = pd.concat([train, test], axis=0, ignore_index=True).drop(columns=['target'])

# Labels are the origin flag; covariates are every remaining column.
y = df_combine['is_train'].values
x = df_combine.drop(columns=['is_train']).values

# Raw array views of the individual frames.
trn = train.values
tst = test.values

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest used as the train-vs-test discriminator.
# random_state is fixed so the reported ROC-AUC is reproducible across runs;
# without it every execution trains a different forest.
m = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf=5, random_state=100)

# One out-of-fold probability slot per stacked row.
predictions = np.zeros(y.shape)

In [None]:
from sklearn.model_selection import StratifiedKFold as SKF

# 20-fold stratified CV: each row receives a probability from a model that was
# fitted without it, so `predictions` ends up holding out-of-fold scores only.
skf = SKF(n_splits=20, shuffle=True, random_state=100)
for train_idx, test_idx in skf.split(x, y):
    m.fit(x[train_idx], y[train_idx])
    # Column 1 of predict_proba is the probability of the label is_train == 1.
    predictions[test_idx] = m.predict_proba(x[test_idx])[:, 1]

In [None]:
from sklearn.metrics import roc_auc_score as AUC

# An AUC near 0.5 means the classifier cannot separate train rows from test rows.
raw_auc = AUC(y, predictions)
print("ROC-AUC for train and test distributions:", raw_auc)

No evidence of strong covariate shift. The majority of observations come from a feature space that is not specific to either the train or the test set.

Finding relevant features

![](https://cdn-images-1.medium.com/max/800/1*THOmXNUuCs92HZWTNJYaDw.png)

#### Checking for DAE extracted features, TE extracted features, LE features.

In [None]:
dae_csv = '../input/dae-te-le-tps0521/df_dae.csv'
target_csv = '../input/dae-te-le-tps0521/y.csv'

df_dae = pd.read_csv(dae_csv)
# NOTE: from here on `y` is the frame read from y.csv (it replaces the
# is_train label array of the previous section); below, its length is used
# as the train/test boundary.
y = pd.read_csv(target_csv)

df_dae.head()



In [None]:
# The first len(y) rows are treated as train, the remainder as test.
n_train = len(y)
x_dae, x_tst_dae = df_dae.iloc[:n_train], df_dae.iloc[n_train:]

In [None]:
# x_dae / x_tst_dae are row slices of df_dae; writing a column into a slice
# triggers pandas' SettingWithCopyWarning. assign() builds new frames instead,
# which is unambiguous and safe to re-run.
x_tst_dae = x_tst_dae.assign(is_train=0)  # 0 = test row
x_dae = x_dae.assign(is_train=1)          # 1 = train row

In [None]:

# Adversarial validation on the DAE features: train a classifier to tell train
# rows from test rows. An out-of-fold ROC-AUC near 0.5 means it cannot.
df_combine = pd.concat([x_dae, x_tst_dae], axis=0, ignore_index=True)
y_ = df_combine['is_train'].values  # labels: 1 = train row, 0 = test row
x = df_combine.drop('is_train', axis=1).values  # covariates

# Seeded so the reported AUC is reproducible across runs.
m = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf=5, random_state=100)
predictions = np.zeros(y_.shape)

# Each row is scored by a model fitted on the other 19 folds.
skf = SKF(n_splits=20, shuffle=True, random_state=100)
for train_idx, test_idx in skf.split(x, y_):
    m.fit(x[train_idx], y_[train_idx])
    # Column 1 of predict_proba is the probability of the label is_train == 1.
    predictions[test_idx] = m.predict_proba(x[test_idx])[:, 1]

# AUC is the roc_auc_score alias imported in an earlier cell.
print("ROC-AUC for train and test distributions:", AUC(y_, predictions))

Checking the TE features

In [None]:
te_csv = '../input/dae-te-le-tps0521/df_te.csv'
df_te = pd.read_csv(te_csv)

# The first len(y) rows are treated as train, the remainder as test.
n_train = len(y)
x_te, x_tst_te = df_te.iloc[:n_train], df_te.iloc[n_train:]

df_te.head()

In [None]:
# x_te / x_tst_te are row slices of df_te; writing a column into a slice
# triggers pandas' SettingWithCopyWarning. assign() builds new frames instead,
# which is unambiguous and safe to re-run.
x_tst_te = x_tst_te.assign(is_train=0)  # 0 = test row
x_te = x_te.assign(is_train=1)          # 1 = train row

In [None]:

# Adversarial validation on the TE features: train a classifier to tell train
# rows from test rows. An out-of-fold ROC-AUC near 0.5 means it cannot.
df_combine = pd.concat([x_te, x_tst_te], axis=0, ignore_index=True)
y_ = df_combine['is_train'].values  # labels: 1 = train row, 0 = test row
x = df_combine.drop('is_train', axis=1).values  # covariates

# Seeded so the reported AUC is reproducible across runs.
m = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf=5, random_state=100)
predictions = np.zeros(y_.shape)

# Each row is scored by a model fitted on the other 19 folds.
skf = SKF(n_splits=20, shuffle=True, random_state=100)
for train_idx, test_idx in skf.split(x, y_):
    m.fit(x[train_idx], y_[train_idx])
    # Column 1 of predict_proba is the probability of the label is_train == 1.
    predictions[test_idx] = m.predict_proba(x[test_idx])[:, 1]

# AUC is the roc_auc_score alias imported in an earlier cell.
print("ROC-AUC for train and test distributions:", AUC(y_, predictions))

Checking for LE features

In [None]:
le_csv = '../input/dae-te-le-tps0521/df_le.csv'
df_le = pd.read_csv(le_csv)

# The first len(y) rows are treated as train, the remainder as test.
n_train = len(y)
x_le, x_tst_le = df_le.iloc[:n_train], df_le.iloc[n_train:]

df_le.head()

In [None]:
# x_le / x_tst_le are row slices of df_le; writing a column into a slice
# triggers pandas' SettingWithCopyWarning. assign() builds new frames instead,
# which is unambiguous and safe to re-run.
x_tst_le = x_tst_le.assign(is_train=0)  # 0 = test row
x_le = x_le.assign(is_train=1)          # 1 = train row

In [None]:

# Adversarial validation on the LE features: train a classifier to tell train
# rows from test rows. An out-of-fold ROC-AUC near 0.5 means it cannot.
df_combine = pd.concat([x_le, x_tst_le], axis=0, ignore_index=True)
y_ = df_combine['is_train'].values  # labels: 1 = train row, 0 = test row
x = df_combine.drop('is_train', axis=1).values  # covariates

# Seeded so the reported AUC is reproducible across runs.
m = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf=5, random_state=100)
predictions = np.zeros(y_.shape)

# Each row is scored by a model fitted on the other 19 folds.
skf = SKF(n_splits=20, shuffle=True, random_state=100)
for train_idx, test_idx in skf.split(x, y_):
    m.fit(x[train_idx], y_[train_idx])
    # Column 1 of predict_proba is the probability of the label is_train == 1.
    predictions[test_idx] = m.predict_proba(x[test_idx])[:, 1]

# AUC is the roc_auc_score alias imported in an earlier cell.
print("ROC-AUC for train and test distributions:", AUC(y_, predictions))

Checking for concatenated feature set

In [None]:
# Place the three engineered feature tables side by side (column-wise).
feature_blocks = [df_dae, df_le, df_te]
df_combined = pd.concat(feature_blocks, axis=1)

# The first len(y) rows are treated as train, the remainder as test.
n_train = len(y)
df_x, df_x_tst = df_combined.iloc[:n_train], df_combined.iloc[n_train:]

df_combined.head()

In [None]:
# df_x / df_x_tst are row slices of df_combined; writing a column into a slice
# triggers pandas' SettingWithCopyWarning. assign() builds new frames instead,
# which is unambiguous and safe to re-run.
df_x_tst = df_x_tst.assign(is_train=0)  # 0 = test row
df_x = df_x.assign(is_train=1)          # 1 = train row

In [None]:

# Adversarial validation on the concatenated DAE + LE + TE features: train a
# classifier to tell train rows from test rows. An out-of-fold ROC-AUC near
# 0.5 means it cannot.
df_combine = pd.concat([df_x, df_x_tst], axis=0, ignore_index=True)
y_ = df_combine['is_train'].values  # labels: 1 = train row, 0 = test row
x = df_combine.drop('is_train', axis=1).values  # covariates

# Seeded so the reported AUC is reproducible across runs.
m = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf=5, random_state=100)
predictions = np.zeros(y_.shape)

# Each row is scored by a model fitted on the other 19 folds.
skf = SKF(n_splits=20, shuffle=True, random_state=100)
for train_idx, test_idx in skf.split(x, y_):
    m.fit(x[train_idx], y_[train_idx])
    # Column 1 of predict_proba is the probability of the label is_train == 1.
    predictions[test_idx] = m.predict_proba(x[test_idx])[:, 1]

# AUC is the roc_auc_score alias imported in an earlier cell.
print("ROC-AUC for train and test distributions:", AUC(y_, predictions))

Conclusion: the DAE, TE, and LE features have similar distributions in the training and test sets.<br>
### This means you can trust your cross-validation score as an estimate of the leaderboard (LB) score.