In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# June 2021 Tabular Playground Series - Exploratory data analysis
If you're here, you know the score, especially if you did May's version, as I did. This dataset has more rows and more columns, and furthermore there are 9 classes to predict this time instead of 4. The column names are devoid of meaning and there's no insight as to what the data actually represents in real-world terms. We have a bunch of columns of numbers and each row belongs to one of nine otherwise indistinguishable classes. To me, it's more about testing out and tweaking appropriate models than about manipulating data.

In [None]:
# Load the training and test tables from the competition's input directory.
train_X, test_X = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jun-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jun-2021/test.csv',
    )
)

The training set has 200,000 rows and 77 columns. One of them is a row ID and useless to us. The last one holds the target labels, so there are 75 features. Let's check the data quality, but first let's drop the ID column and separate the target column.

## Data setup and cleaning

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Drop the 'id' column from both frames: it is a row identifier, not a feature.
# The rows are NOT re-indexed by id; they keep the index they were loaded with.
# errors='ignore' makes the cell safe to re-run after 'id' has already been removed.
train_X = train_X.drop(columns='id', errors='ignore')
test_X = test_X.drop(columns='id', errors='ignore')

In [None]:
# Split the target column off the feature frame, and also build a zero-based
# integer code per row from the number at the end of each label - might need it later!
# The whole trailing number is parsed (not just the final character), so a
# multi-digit class such as 'Class_10' becomes 9 instead of -1.
# Assumes every label ends in its class number, as the previous last-character parse did.
train_y = train_X.pop('target')
train_y_num = train_y.str.extract(r'(\d+)$', expand=False).astype(int).sub(1).tolist()

In [None]:
# Count the NaN cells across every column of each feature table.
n_missing_train = train_X.isna().sum().sum()
n_missing_test = test_X.isna().sum().sum()
print('Missing values in training set: ', n_missing_train)
print('Missing values in test set: ', n_missing_test)

In [None]:
# Any duplicates? - Yes, drop them all - we won't make any assumption which one is right.
# keep=False flags every copy of a repeated feature row, not only the later occurrences.
dupes = train_X.duplicated(keep=False)
print(dupes.value_counts())
dropthese = dupes.index[dupes].tolist()  # index labels of all flagged rows
train_X = train_X.drop(index=dropthese)
train_y = train_y.drop(index=dropthese)  # drop their class labels too so X and y stay aligned
# Rebuild the zero-based integer labels for the surviving rows. The whole trailing
# number is parsed (not just the final character) so multi-digit classes stay correct.
train_y_num = train_y.str.extract(r'(\d+)$', expand=False).astype(int).sub(1).tolist()

Now we want to check if any remaining training observations duplicate rows in the test set. We won't necessarily do anything about it, but it might help to force the test predictions for such rows to match the labels of the matching training rows. Last time, this didn't help me - go figure!

In [None]:
# Yes there are.
# Stack the training features on top of the test features and flag every row whose
# feature values occur more than once in the combined table.
# pd.concat is used because DataFrame.append was deprecated and then removed from pandas.
data_X = pd.concat([train_X, test_X], ignore_index=True)
alldupes = data_X.duplicated(keep=False)
print(alldupes.value_counts())
# Row positions in data_X (training rows first, then test rows) of all flagged rows.
# NOTE(review): rows repeated only within the test set are flagged as well, not just train/test matches.
notethese = alldupes.index[alldupes].tolist()
# More to come later maybe

## Class distribution
Now we're ready to get to some exploring. Let's look at the distribution of the target classes in the training set, and make a dumb baseline prediction based on it.

In [None]:
# Share of training rows in each class: per-class row count over the total row count.
class_counts = train_y.value_counts()
dumb_preds = class_counts.div(len(train_y))

In [None]:
# Bar chart of how many training rows fall into each target class.
fig, ax = plt.subplots(figsize=(9,6))
# The labels are passed as `x` only. `data` is for a table whose columns are
# looked up by name, so also handing it the same Series was redundant.
sns.countplot(x=train_y, order=sorted(train_y.unique()), ax=ax)
ax.set_title("Target distribution", size=16, weight='bold')
plt.show()

In [None]:

# Naive submission: every test row gets the same probabilities, namely the
# class frequencies stored in dumb_preds, one column per class in sorted order.
class_names = sorted(dumb_preds.index)
submission_ids = test_X.index + 200000  # submission ids: test index shifted by 200000
submission1 = pd.DataFrame(
    {name: dumb_preds[name] for name in class_names},
    index=submission_ids,
)
# submission1.head() -- Quality check if desired
submission1.index.name = 'id'
submission1.to_csv('submission1_naive.csv')

## Feature distributions
Last time the features had most values = 0 and many were similarly distributed. Let's see if that is the case here. Since there are more features these graphs may be a bit more unwieldy than last time. It does appear that there are differences in the ranges of some features between the training and test set. I might try scaling the combined data sets to the same scale and see if that helps at all.

In [None]:
# Summary statistics of the training features, one row per feature: the mean is
# drawn as an in-cell bar, std and the median ('50%') as colour gradients.
train_summary = train_X.describe().T
(
    train_summary.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# Summary statistics of the test features, one row per feature: the mean is
# drawn as an in-cell bar, std and the median ('50%') as colour gradients.
test_summary = test_X.describe().T
(
    test_summary.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

## Check pairwise correlations
It doesn't look like there are any meaningful pairwise correlations to me.

In [None]:
# Pairwise correlations between all columns of the combined train + test table,
# shown as a heatmap on an explicitly created axes.
datacorr = data_X.corr()
corr_fig, corr_ax = plt.subplots(figsize=(16,16))
sns.heatmap(datacorr, ax=corr_ax)
plt.show()

## Check unique value counts
Let's look to see how many values each feature takes on. From the graph below, it looks like all of the features take on at least 15 different values. Low counts would be suspected of being categories. Of course, some of them can still be - but we have no underlying knowledge of where the data came from, so we can't know this for sure. Because it makes my life easier, I'm going to pretend I did not have this thought. In addition, a few features in the test set take on values that aren't in the training set. Again we are going to overlook that for now.

In [None]:
# This was shamelessly stolen from a May TPS competitor
# Per feature, compare how many distinct values occur in the training and test sets.
fig, ax = plt.subplots(1, 1, figsize=(18, 6))

y = np.array([train_X[f'feature_{i}'].nunique() for i in range(75)])   # distinct values per feature, train
y2 = np.array([test_X[f'feature_{i}'].nunique() for i in range(75)])   # distinct values per feature, test
comp = y-y2  # train minus test: > 0 means train has more distinct values, < 0 means test has more

ax.bar(range(75), y2, alpha=0.7, label='Test Dataset')
# comp > 0 is the "train has more" case, so this segment is labelled 'Test < Train'
# (the two comparison labels were previously swapped).
ax.bar(range(75),  comp*(comp>0), bottom=y2, alpha=0.7, label='Test < Train')
# comp < 0 is the "test has more" case. The bar has negative height and starts at
# test + |difference|, so it ends on the top of the test bar.
# NOTE(review): this draws the segment above the test bar rather than between the
# train and test counts - confirm that placement is intended.
ax.bar(range(75), comp*(comp<0), bottom=y2-comp*(comp<0), alpha=0.7, label='Test > Train')

ax.set_yticks(range(0, 120, 10))
ax.margins(0.02)
ax.grid(axis='y', linestyle='--', zorder=5)
ax.set_title('# of Features Unique Values (Train/Test)', loc='left', fontweight='bold')
ax.legend()
plt.show()

## XGBoost
I'm going to try it without any messing around.

In [None]:
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split

In [None]:
# Baseline XGBoost classifier: no hyperparameters set beyond a fixed random seed
# and the label-encoder switch; trained on the zero-based integer class labels.
seed = 14000605
model1 = XGBClassifier(use_label_encoder=False, random_state=seed)
model1.fit(train_X, train_y_num)

In [None]:
# Predict class probabilities for the test rows and write them out as a submission file.
preds = model1.predict_proba(test_X)
# Column k of preds is paired with the k-th class name in sorted order.
# NOTE(review): lexicographic sorting matches the numeric class order only while
# the class numbers are single-digit.
submission2 = pd.DataFrame(preds, columns=sorted(dumb_preds.index), index=test_X.index + 200000)
submission2.index.name = 'id'
submission2.to_csv('submission2_xgb.csv')
# Preview goes last: a notebook renders only a cell's final expression, so the
# earlier mid-cell .head() call displayed nothing.
submission2.head()

## That's it for now
If any of you have any commentary or wisdom - please drop it in the comments! 