In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import random
import os
import numpy as np
import torch
import matplotlib.pyplot as plt

In [None]:
# Global matplotlib defaults for every figure in this notebook: a large canvas
# and a bigger base font.
font = dict(family='DejaVu Sans', weight='normal', size=20)
plt.rc('font', **font)
plt.rcParams['figure.figsize'] = (20, 10)

In [None]:
# Load the per-event table. Later cells rely on its `event_path`,
# `number_of_frames` and `label` columns plus the per-class columns.
df = pd.read_csv('../small_dataset_csvs/events_with_number_of_frames.csv')

In [None]:
# Keep the event path plus the class columns that are summed for the pie charts.
df1 = df.loc[:, ['event_path', 'exposure', 'burial', 'field_joint', 'anode', 'free_span']]

In [None]:
# Pie chart of the column-wise totals of every class column (the first column,
# the event path, is skipped).
class_totals = df1.iloc[:, 1:].sum(axis=0)
fig1, ax1 = plt.subplots()
class_totals.plot.pie(ax=ax1, autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis("equal")  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# Column-wise totals of the class columns, shown as numbers.
df1.iloc[:, 1:].sum()

In [None]:
# Inspect the loaded events table.
df

In [None]:
# Number of events before any frame-count filtering.
df.shape[0]

In [None]:
SEED = 42


def seed_everything(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; hash randomization of the already running
    # interpreter was fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers all GPUs, not only the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic setting above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [None]:
# Minimum number of frames an event must have to survive the filter below.
seq_length = 20

In [None]:
# Drop events with fewer than `seq_length` frames, then show how many remain.
df = df.loc[df['number_of_frames'] >= seq_length]
len(df)

In [None]:
# Cap event length at 500 frames. The explicit copy makes `df` an independent
# frame, so the column assignments in later cells (`fold`, `stratify_group`)
# write to it directly instead of triggering SettingWithCopyWarning on a slice.
df = df[df.number_of_frames <= 500].copy()

In [None]:
# Number of events left after both frame-count filters.
df.shape[0]

In [None]:
# Class balance AFTER the frame-count filters. `df1` was sliced from the
# unfiltered table, so plotting it here would just repeat the first chart;
# the totals are recomputed from the current `df` using df1's class columns.
fig1, ax1 = plt.subplots()
df[df1.columns[1:]].sum(axis=0).plot.pie(autopct='%1.1f%%', shadow=True, startangle=90, ax=ax1)
ax1.axis("equal")  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# 5-fold stratified splitter used to assign the `fold` column.
# NOTE(review): no shuffle/random_state is passed, so SEED does not influence
# the split and fold membership follows row order — confirm this is intended.
skf = StratifiedKFold(n_splits=5)

In [None]:
# Placeholder fold id; overwritten row by row in the split loop further down.
df['fold'] = 0

In [None]:
# Group events by label to compare frame-count distributions across labels.
g = df.groupby('label')

In [None]:
# Frame-count summary per label: `i` holds selected quantiles (one column per
# quantile), `j` holds the extremes.
frames_per_label = g['number_of_frames']
j = frames_per_label.agg(['min', 'max'])
i = frames_per_label.quantile([0.05, 0.25, 0.5, 0.9]).unstack()

In [None]:
# Quantiles and min/max side by side, one row per label.
# `axis` is passed by keyword: the positional form is deprecated and is
# rejected by pandas 2.x.
pd.concat([i, j], axis=1)

In [None]:
# Per-label bin widths: in the loop below each event's number_of_frames is
# floor-divided by the width paired with its label to form the stratification
# bucket. Pairing is positional against `labels` (df.label.unique()).
# NOTE(review): presumably read off the per-label summary table — confirm the
# order here matches the order of `labels`.
values = [111, 80.0, 34.0, 42.0, 79]

In [None]:
# After transposing, quantiles run along the x-axis and each label gets its
# own subplot.
i.transpose().plot(subplots=True)
plt.show()

In [None]:
# Placeholder value; rows whose label is handled by the loop below get a real
# "<label>_<bin>" key, any row not covered would keep 'yo'.
df['stratify_group'] = 'yo'

In [None]:
# Distinct labels present in the filtered table; paired positionally with
# `values` in the loop below.
labels = df.label.unique()

In [None]:
# Build the stratification key "<label>_<bin>", where bin is the event's
# number_of_frames floor-divided by the bin width paired with its label.
# zip() would silently stop at the shorter sequence and leave some rows on the
# placeholder group, so a length mismatch is reported explicitly instead.
if len(labels) != len(values):
    raise ValueError(
        f'expected one bin width per label, got {len(labels)} labels and {len(values)} widths'
    )

for label, width in zip(labels, values):
    rows = df.label == label  # boolean mask computed once per label
    df.loc[rows, 'stratify_group'] = np.char.add(
        df.loc[rows, 'label'].values.astype(str),
        df.loc[rows, 'number_of_frames'].apply(lambda x: f'_{int(x // width)}').values.astype(str)
    )

In [None]:
# Inspect the table with the new `stratify_group` column.
df

In [None]:
# Write each split's fold number onto the rows of its validation part; the
# training indices are not used.
folds = skf.split(X=df['event_path'], y=df['stratify_group'])
for fold_number, (train_index, val_index) in enumerate(folds):
    val_rows = df.index[val_index]  # positional indices -> index labels
    df.loc[val_rows, 'fold'] = fold_number

In [None]:
# Inspect the table with the assigned `fold` column.
df

In [None]:
# Persist the filtered table, including the `fold` and `stratify_group`
# columns, without writing the index.
df.to_csv('../small_dataset_csvs/events_with_number_of_frames_stratified.csv', index=False)

In [None]:
# Number of events assigned to each fold.
df['fold'].value_counts()

In [None]:
# Number of events in each stratification bucket.
df['stratify_group'].value_counts()

In [None]:
# Keep only events with strictly more than 20 frames.
# NOTE(review): the earlier filter used `>= seq_length` (also 20) — confirm the
# strict `>` and the hard-coded 20 here are intended.
df = df[df['number_of_frames'].gt(20)]