Simple visualization of the data mean values.

In this notebook, I calculated and visualized the mean of the SETI training data.

In [None]:
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import laplace
from tqdm.notebook import tqdm

In [None]:
def getpath(x):
    """Build the path of the training ``.npy`` file for sample id ``x``.

    The sub-directory component of the path is the first character of the id.
    """
    train_root = '../input/seti-breakthrough-listen/train/'
    return train_root + x[0] + f'/{x}.npy'

def get_testpath(x):
    """Build the path of the test ``.npy`` file for sample id ``x``.

    The sub-directory component of the path is the first character of the id.
    """
    test_root = '../input/seti-breakthrough-listen/test/'
    return test_root + x[0] + f'/{x}.npy'

In [None]:
# Training labels, with the location of each sample's .npy file attached.
train = (
    pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
    .assign(filepath=lambda frame: frame.id.apply(getpath))
)

# Same treatment for the ids listed in the sample submission file.
test = (
    pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')
    .assign(filepath=lambda frame: frame.id.apply(get_testpath))
)

train.head(2)

In [None]:
class SetiDataset():
    """Positional access to the samples listed in a DataFrame.

    The frame must provide ``filepath``, ``id`` and ``target`` columns;
    indexing loads the ``.npy`` file of the requested row.
    """

    def __init__(self, df):
        # Keep a reference to the frame; rows are looked up lazily on access.
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(array, target, id)`` for the row at position ``idx``.

        The array is loaded from the row's ``filepath`` and cast with
        ``astype('float')``.
        """
        frame = self.df
        # Resolve all three columns before touching the disk.
        location = frame.filepath.iloc[idx]
        sample_id = frame.id.iloc[idx]
        label = frame.target.iloc[idx]
        data = np.load(location).astype('float')
        return data, label, sample_id

In [None]:
# Walk the whole training set once, recording each sample's global mean
# alongside its id and target.
SD = SetiDataset(train)
file_ids, targets, means = [], [], []
for i in tqdm(range(len(SD))):
    arr, target, file_id = SD[i]
    file_ids.append(file_id)
    targets.append(target)
    means.append(arr.mean())

In [None]:
# One row per training sample: id, global mean of its array, and target.
df = pd.DataFrame({
    'id': file_ids,
    'means': means,
    'target': targets,
})

In [None]:
# Per-target density histogram of the global means (each target normalised separately).
fig, axis = plt.subplots(figsize=(12, 6))
axis.set_title('training data. npy_array global mean values', fontsize=20)
sns.histplot(
    data=df,
    x='means',
    hue='target',
    stat="density",
    common_norm=False,
    ax=axis,
)
plt.show()

The two distributions differ slightly in mean. 

The code below overlays a Laplace distribution on the histogram of each target class; these distributions are used later to compute the expected label.

In [None]:
# Location (a) and scale (b) of the Laplace curve drawn for target 0 and target 1.
# NOTE(review): presumably fitted to the per-target means; the fitting code is
# not part of this notebook — confirm where these values come from.
a0, b0 = -3.3525338394155724e-08, 8.796053804628716e-06
a1, b1 = 5.061294421808332e-06, 9.360717422307713e-06

# Grid on which the two densities are evaluated.
xlines = np.linspace(-0.0001, 0.0001, 1000)

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('laplace distribution', fontsize=20)
sns.histplot(data=df, x='means', hue='target', stat="density", common_norm=False, ax=ax)
# Draw on the same axes explicitly instead of relying on pyplot's "current axes".
ax.plot(xlines, laplace.pdf(xlines, a0, b0), c='darkblue')
ax.plot(xlines, laplace.pdf(xlines, a1, b1), c='darkorange')
plt.show()

The Laplace distributions cannot separate the two classes exactly,
but I'll try to calculate the expected label from them as follows.




In [None]:
from scipy.stats import laplace
def expect_label(x):
    x = np.array(x)
    p1 = 0.1
    a0 = -3.3525338394155724e-08
    b0 = 8.796053804628716e-06
    a1 = 5.061294421808332e-06
    b1 = 9.360717422307713e-06
    d = 0.000003
    px0 = laplace.pdf(x-d, a0, b0)+laplace.pdf(x+d, a0, b0)+4*laplace.pdf(x,a0,b0)
    px1 = laplace.pdf(x-d, a1, b1)+laplace.pdf(x+d, a1, b1)+4*laplace.pdf(x,a1,b1)
    return px1*p1/(px1*p1+px0*(1-p1))

In [None]:
# Score every training mean and keep the result as a new column.
vals = expect_label(df.means.to_numpy())
df['vals'] = vals

# Two panels in the top row of a 2x2 grid: the same histogram with the
# density normalised jointly (left) and per target (right).
fig = plt.figure(figsize=(12, 8))
panel_specs = [
    ('training data. expected values', True),
    ('training data. expected values density', False),
]
for slot, (heading, joint_norm) in enumerate(panel_specs, start=1):
    panel = fig.add_subplot(2, 2, slot)
    panel.set_title(heading, fontsize=12)
    sns.histplot(data=df, x='vals', hue='target', stat="density",
                 common_norm=joint_norm, ax=panel)
plt.show()
df

Conclusion: In this notebook, we checked the mean value of the training data.

In [None]:
# Persist the per-sample results; `index` is documented as a bool, so pass
# False explicitly rather than relying on None being falsy.
df.to_csv('means.csv', index=False)