# Take a first look at the training data tables

In [None]:
# packages
import numpy as np
import pandas as pd

from pylab import rcParams
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.graphics.mosaicplot import mosaic

import os

In [None]:
# file overview: list the contents of the dataset input directory
# (the leading "!" is the IPython/Jupyter shell escape, so `ls` runs in a shell)
!ls "/kaggle/input/rfcx-species-audio-detection"

In [None]:
# Load the two annotation tables: true-positive (TP) and false-positive (FP)
# training labels, read from the CSV files in the dataset input directory.
df_TP, df_FP = (
    pd.read_csv('../input/rfcx-species-audio-detection/train_tp.csv'),
    pd.read_csv('../input/rfcx-species-audio-detection/train_fp.csv'),
)

In [None]:
# Preview the first rows of the true-positive table
df_TP.head()

In [None]:
# Preview the first rows of the false-positive table
df_FP.head()

In [None]:
# Numeric annotation columns used for the pair plots below:
# the time bounds (t_*) and the frequency bounds (f_*).
features_numeric = [
    't_min', 't_max',
    'f_min', 'f_max',
]

# True Positive Training File

In [None]:
# (number of rows, number of columns) of the true-positive table
df_TP.shape

In [None]:
# Summary statistics of the true-positive table as computed by DataFrame.describe()
df_TP.describe()

In [None]:
# Bar chart of the number of true-positive rows per species_id
# (value_counts orders the bars from most to least frequent).
df_TP['species_id'].value_counts().plot.bar()
plt.title('Species ID [TP]')
plt.grid()
plt.show()

In [None]:
# Bar chart of the number of true-positive rows per songtype_id
# (value_counts orders the bars from most to least frequent).
df_TP['songtype_id'].value_counts().plot.bar()
plt.title('Songtype ID [TP]')
plt.grid()
plt.show()

In [None]:
# Pairwise relationships between the numeric time/frequency columns
# of the true-positive table.
sns.pairplot(data=df_TP[features_numeric])
plt.show()

# False Positive Training File

In [None]:
# (number of rows, number of columns) of the false-positive table
df_FP.shape

In [None]:
# Summary statistics of the false-positive table as computed by DataFrame.describe()
df_FP.describe()

In [None]:
# Bar chart of the number of false-positive rows per species_id
# (value_counts orders the bars from most to least frequent).
df_FP['species_id'].value_counts().plot.bar()
plt.title('Species ID [FP]')
plt.grid()
plt.show()

In [None]:
# Bar chart of the number of false-positive rows per songtype_id
# (value_counts orders the bars from most to least frequent).
df_FP['songtype_id'].value_counts().plot.bar()
plt.title('Songtype ID [FP]')
plt.grid()
plt.show()

In [None]:
# Pairwise relationships between the numeric time/frequency columns
# of the false-positive table.
sns.pairplot(data=df_FP[features_numeric])
plt.show()

# Combine into one table

In [None]:
# Tag every row with the table it came from (this adds a 'label' column to
# df_TP and df_FP in place), then stack both tables under a fresh 0..n-1 index.
df_TP['label'], df_FP['label'] = 'TP', 'FP'
df = pd.concat(objs=[df_TP, df_FP], ignore_index=True)

## Compare time

In [None]:
# Distribution of t_min, one violin per label (TP vs FP)
sns.violinplot(x='label', y='t_min', data=df)
plt.title('Compare t_min')
plt.grid()
plt.show()

In [None]:
# Distribution of t_max, one violin per label (TP vs FP)
sns.violinplot(x='label', y='t_max', data=df)
plt.title('Compare t_max')
plt.grid()
plt.show()

## Compare frequency

In [None]:
# Distribution of f_min, one violin per label (TP vs FP)
sns.violinplot(x='label', y='f_min', data=df)
plt.title('Compare f_min')
plt.grid()
plt.show()

In [None]:
# Distribution of f_max, one violin per label (TP vs FP)
sns.violinplot(x='label', y='f_max', data=df)
plt.title('Compare f_max')
plt.grid()
plt.show()

## TP/FP by Species

In [None]:
# Contingency table of row counts: one row per species_id, one column per label
pd.crosstab(index=df['species_id'], columns=df['label'])

In [None]:
# normalized: normalize=True divides every cell by the grand total, so the
# whole table (not each species row) sums to 1
pd.crosstab(index=df['species_id'], columns=df['label'], normalize=True)

In [None]:
# graphical view: mosaic plot of species_id against label.
# The rcParams assignment changes matplotlib's global default figure size.
rcParams['figure.figsize'] = (10, 4)
mosaic(df, index=['species_id', 'label'], title='TP/FP vs Species')
plt.show()

## TP/FP by Songtype

In [None]:
# Contingency table of row counts: one row per songtype_id, one column per label
pd.crosstab(index=df['songtype_id'], columns=df['label'])

In [None]:
# normalized: normalize=True divides every cell by the grand total, so the
# whole table (not each songtype row) sums to 1
pd.crosstab(index=df['songtype_id'], columns=df['label'], normalize=True)

In [None]:
# graphical view: mosaic plot of songtype_id against label.
# The rcParams assignment changes matplotlib's global default figure size.
rcParams['figure.figsize'] = (10, 4)
mosaic(df, index=['songtype_id', 'label'], title='TP/FP vs Songtype')
plt.show()