# Imports and Setup

In [None]:
# Notebook display helpers.
from IPython.display import HTML
from IPython.display import clear_output
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one; this is why plotting calls below are assigned to `_`.
InteractiveShell.ast_node_interactivity = 'all'

import os
from pathlib import Path
import datetime

import pandas as pd
# Widen pandas' display limits so the summary tables are not truncated.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 51

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

def getax(width=10):
    """Create a new single-axes figure and return its Axes object.

    The figure size is ``(width, 5)`` and the figure uses matplotlib's
    constrained layout. Only the Axes is returned; the Figure itself is
    not handed back to the caller.
    """
    figure_size = (width, 5)
    _figure, axes = plt.subplots(nrows=1, ncols=1, figsize=figure_size, layout='constrained')
    return axes

### Manual Data Entry

In [None]:
# https://www.kaggle.com/competitions?searchQuery=Tabular+Playground+Series
    
# Hand-maintained facts about each competition that cannot be derived from the
# CSV files. Each row is (Name, Description, Scoring, Prediction type); the
# Name must equal the competition's input directory name because it is the
# join key used when this table is merged with the file metadata.
manual_data_entry = [
    (
        'tabular-playground-series-jan-2021',
        'Predict a continuous target based on 14 continuous features',
        'RMSE',
        'Regression',
    ),
    (
        'tabular-playground-series-feb-2021',
        'Predict a continuous target based on 10 categorical and 14 continuous features',
        'RMSE',
        'Regression',
    ),
    (
        'tabular-playground-series-mar-2021',
        'Predict a binary target based on 19 categorical and 11 continuous features',
        'AUC-ROC',
        'Binary Classification',
    ),
    (
        'tabular-playground-series-apr-2021',
        'Synthetic data generated using CTGAN based on the Titanic Dataset',
        'Accuracy',
        'Binary Classification',
    ),
    (
        'tabular-playground-series-may-2021',
        'Synthetic data generated using CTGAN based on a real eCommerce dataset that predicts a category of a product given 50 discrete features',
        'Log Loss',
        'Classification',
    ),
    (
        'tabular-playground-series-jun-2021',
        'Extended Version of the May 2021 TPS, 75 discrete features',
        'Log Loss',
        'Classification',
    ),
    (
        'tabular-playground-series-jul-2021',
        'Predict air pollution (3 target values) based on basic weather information and values of 5 sensors (8 features total)',
        'RMSLE',
        'TimeSeries Regression',
    ),
    (
        'tabular-playground-series-aug-2021',
        'Predict an integer target loss based on 1 discrete and 99 continuous',
        'RMSE',
        'Regression',
    ),
    (
        'tabular-playground-series-sep-2021',
        'Predict probability of insurance claim based on 118 features',
        'AUC-ROC',
        'Binary Classification',
    ),
    (
        'tabular-playground-series-oct-2021',
        'Predict a binary target based on 285 features, mix of continuous and binary',
        'AUC-ROC',
        'Binary Classification',
    ),
    (
        'tabular-playground-series-nov-2021',
        'Predict a binary target based on 100 continuous features.',
        'AUC-ROC',
        'Binary Classification',
    ),
    (
        'tabular-playground-series-dec-2021',
        'Synthetic data generated using GAN based on Forest Cover Type Prediction dataset, 54 features, mix of continuous and binary',
        'Accuracy',
        'Classification',
    ),
    (
        'tabular-playground-series-jan-2022',
        'Predict a full year worth of sales for three items at two stores located in three different countries, 4 features',
        'SMAPE',
        'TimeSeries Regression',
    ),
    (
        'tabular-playground-series-feb-2022',
        'Predict bacteria species based on repeated lossy measurements of DNA snippets, 286 features',
        'Accuracy',
        'Classification',
    ),
    (
        'tabular-playground-series-mar-2022',
        'Predict car traffic flow based on time, space, and directional features, 4 features',
        'MAE',
        'TimeSeries Regression',
    ),
    (
        'tabular-playground-series-apr-2022',
        'Binary classification based on 60 second sequences of continuous features from 13 sensors',
        'AUC-ROC',
        'Binary Classification',
    ),
    (
        'tabular-playground-series-may-2022',
        # NOTE(review): "17 continuous and 14 continuous" names the same feature
        # kind twice; one of the two is presumably a different kind. Left
        # unchanged until checked against the competition page.
        'Predict the binary state of a machine  based on 17 continuous and 14 continuous features',
        'AUC-ROC',
        'Binary Classification',
    ),
    (
        'tabular-playground-series-jun-2022',
        'Predict all missing continuous values in the dataset with 80 features, 55 continuous and 25 categorical',
        'RMSE',
        'Regression',
    ),
]

# One row per competition; 'Name' stays a regular column so it can be merged on.
df_manual = pd.DataFrame(manual_data_entry, columns=['Name', 'Description', 'Scoring', 'Prediction type'])
# df_manual

### Read input CSVs and extract metadata

In [None]:
base_path = Path('/kaggle/input/')

# Collect the Tabular Playground Series directories under the input root and
# order them newest first. The last 8 characters of a directory name are read
# as '<mon>-<year>' (e.g. 'jan-2021') and turned into a sortable 'YYYY-MM' key.
competition_dirs = sorted(
    (
        entry
        for entry in base_path.iterdir()
        if entry.is_dir() and entry.name.startswith('tabular-playground-series')
    ),
    key=lambda entry: datetime.datetime.strptime(entry.name[-8:], '%b-%Y').strftime('%Y-%m'),
    reverse=True,
)
competition_dirs


def _describe_csv(csv_path, prefix, nrows=None):
    """Return size, shape and NaN statistics for a single CSV file.

    Args:
        csv_path: ``pathlib.Path`` of the CSV file to inspect.
        prefix: Label placed in front of every key, e.g. ``'Train'``.
        nrows: Forwarded to ``pd.read_csv``; ``None`` reads the whole file.

    Returns:
        A dict with the keys ``'<prefix> - file (MB)'``,
        ``'<prefix> - in mem (MB)'``, ``'<prefix> - shape'`` and
        ``'<prefix> - NaN count'``, inserted in that order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    stats = {}
    # On-disk size is taken before parsing, so a missing file fails immediately.
    stats[f'{prefix} - file (MB)'] = round(csv_path.stat().st_size / 2**20)
    frame = pd.read_csv(csv_path, nrows=nrows)
    # deep=True also counts the contents of object (string) columns.
    stats[f'{prefix} - in mem (MB)'] = round(frame.memory_usage(deep=True).sum() / 2**20)
    stats[f'{prefix} - shape'] = frame.shape
    stats[f'{prefix} - NaN count'] = frame.isna().sum().sum()
    return stats


def get_df_metadata(comp_dir, nrows=None):
    """Summarise the CSV files of one competition directory.

    If ``train.csv`` exists, both ``train.csv`` and ``test.csv`` are
    described. Otherwise the directory is expected to hold a single
    ``data.csv``, which is reported under the ``'Train'`` keys and no
    ``'Test'`` keys are produced.

    Args:
        comp_dir: ``pathlib.Path`` of the competition directory.
        nrows: Optional row limit forwarded to ``pd.read_csv``. Shape,
            memory and NaN figures then describe only the rows read;
            file sizes always describe the whole file.

    Returns:
        A dict starting with ``'Name'`` (the directory name) followed by the
        per-file statistics.

    Raises:
        FileNotFoundError: If an expected CSV file is missing.
    """
    metadata = {'Name': comp_dir.name}

    train_csv_path = comp_dir.joinpath('train.csv')
    if train_csv_path.exists():
        metadata.update(_describe_csv(train_csv_path, 'Train', nrows))
        # The test statistics must come from test.csv itself, not from a
        # second read of train.csv.
        metadata.update(_describe_csv(comp_dir.joinpath('test.csv'), 'Test', nrows))
    else:
        metadata.update(_describe_csv(comp_dir.joinpath('data.csv'), 'Train', nrows))

    return metadata


# Build one metadata record per competition directory (keeping the order of
# competition_dirs) and tabulate the records, one row per competition.
competitions_metadata = [get_df_metadata(comp_dir) for comp_dir in competition_dirs]
df_metadata = pd.DataFrame(competitions_metadata)

# Intro

Hi all !!!

I really like the TPS competitions and I am grateful to the Kaggle Team for finding the time to organize them. I try to participate in as many as I have time for, but on top of focusing on current competitions I have found it beneficial to go back to previous challenges and revisit the problems and solutions. In this notebook I want to summarise all previous competitions and present data that may help you find a dataset you want to practice on.
This is V1 of my work and there are a few things I want to add in the very near future:
- links to the top 3 solutions and a summary of the algorithms used
- add a few extra fields to the table (suggestions are welcome)
- add info on features and data types per competition

In [None]:
# Prefix that, followed by a competition's name, gives its Kaggle page URL.
base_url = 'https://www.kaggle.com/competitions/'

# Combine the file-derived metadata with the hand-entered facts. The default
# join is an inner join on 'Name', so only competitions present in both
# tables are kept.
df = df_metadata.merge(df_manual, on=['Name'])

def get_yyyy_mm_from_name(comp_name: str) -> str:
    """Return the 'YYYY-MM' period encoded at the end of a competition name.

    The last 8 characters of ``comp_name`` are parsed with the format
    ``'%b-%Y'`` (abbreviated month name, dash, four-digit year), for example
    ``'jan-2021'`` becomes ``'2021-01'``.

    Raises:
        ValueError: If the last 8 characters do not match ``'%b-%Y'``.
    """
    month_year_suffix = comp_name[-8:]
    period = datetime.datetime.strptime(month_year_suffix, '%b-%Y')
    return f'{period:%Y-%m}'
    
# Replace each competition name with an HTML link to its Kaggle page; the link
# text is the competition's 'YYYY-MM' period.
df['Name'] = df['Name'].apply(lambda x: f'<a href="{base_url}{x}" >{get_yyyy_mm_from_name(x)}</a>')

# Turn flat labels such as 'Train - shape' into two-level ('Train', 'shape')
# headers. str.partition splits on the first ' - ' only and returns
# (head, separator, tail); taking [::2] keeps (head, tail), which is
# (label, '') for labels without a separator. Every column therefore gets
# exactly two levels, whatever the label contains.
column_multi_idx = [c.partition(' - ')[::2] for c in df.columns.to_list()]
df.columns = pd.MultiIndex.from_tuples(column_multi_idx)


# escape=False keeps the <a> tags in the 'Name' column as real links.
HTML(df.to_html(escape=False))

# EDA

In [None]:
# Per-competition table for plotting: the 'YYYY-MM' label is sliced out of the
# HTML link held in 'Name', and the row / column counts are unpacked from the
# train shape tuple.
train_shape = df[('Train', 'shape')]
data = pd.DataFrame({
    'name': df['Name'].str[-11:-4],
    'rows': train_shape.apply(lambda shape: shape[0]),
    'cols': train_shape.apply(lambda shape: shape[1]),
})

# One bar chart per measure, each drawn on its own figure.
for measure, title in (
    ('rows', 'Number of rows per Competition'),
    ('cols', 'Number of features per Competition'),
):
    ax = getax(20)
    _ = ax.bar(x=data['name'], height=data[measure], width=0.75)
    _ = ax.set_title(title)

In [None]:
# value_counts(normalize=True) returns fractions in [0, 1]; scale them by 100
# so the plotted values are percentages, as the '%' in the title states.
data = df['Prediction type'].value_counts(normalize=True) * 100
_ = sns.barplot(ax=getax(6), x=data.index, y=data.values).set(title='% of Prediction types')

In [None]:
# value_counts(normalize=True) returns fractions in [0, 1]; scale them by 100
# so the plotted values are percentages, as the '%' in the title states.
data = df['Scoring'].value_counts(normalize=True) * 100
_ = sns.barplot(ax=getax(11), x=data.index, y=data.values).set(title='% of Scoring methods')