In [None]:
# Load (or reload) the autoreload extension and set it to mode 2.
%reload_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
import os
import pandas as pd
import re
import shutil
from sklearn.model_selection import StratifiedKFold
import random
import os
import numpy as np
import torch
import matplotlib.pyplot as plt

In [None]:
# Notebook-wide matplotlib defaults: 20x10 figures and a size-20 DejaVu Sans font.
plt.rcParams.update({'figure.figsize': (20, 10)})
font = dict(family='DejaVu Sans', weight='normal', size=20)
plt.rc('font', **font)

In [None]:
# Root of the dataset tree that the following cells walk and list.
HOME_FOLDER = '/media/raid/astamoulakatos/nsea_frame_sequences/centre_Ch2/'

# Running totals filled in by the directory walk.
noOfFiles = noOfDir = 0

In [None]:
# Count every directory and file below HOME_FOLDER.
# The counters are (re)initialised here so that re-running this cell does not
# add to the totals left over from a previous run.
noOfFiles = 0
noOfDir = 0
for base, dirs, files in os.walk(HOME_FOLDER):
    print('Looking in : ',base)
    # os.walk already hands over the entries of `base` as lists.
    noOfDir += len(dirs)
    noOfFiles += len(files)

In [None]:
# Report the totals gathered by the directory walk.
for caption, amount in (
    ('Number of files', noOfFiles),
    ('Number of Directories', noOfDir),
    ('Total:', (noOfDir + noOfFiles)),
):
    print(caption, amount)

In [None]:
# Scan HOME_FOLDER/<label dir>/<event dir> and record, for every event
# directory, its path and the number of entries it contains (used as the
# frame count of that event).
number_of_frames = []
event_path = []
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the resulting table (and every order-dependent step after it) repeatable.
for d in sorted(os.listdir(HOME_FOLDER)):
    label_path = os.path.join(HOME_FOLDER, d)
    if not os.path.isdir(label_path):
        # A stray file next to the label folders cannot be listed.
        continue
    for event_name in sorted(os.listdir(label_path)):
        file_path = os.path.join(label_path, event_name)
        if not os.path.isdir(file_path):
            continue
        number_of_frames.append(len(os.listdir(file_path)))
        event_path.append(file_path)

In [None]:
# Empty table with one column per list collected during the directory scan.
frame_columns = ['event_path', 'number_of_frames']
df = pd.DataFrame(columns=frame_columns)

In [None]:
# Fill both columns from the lists built during the directory scan.
df['event_path'] = event_path
df['number_of_frames'] = number_of_frames

In [None]:
# Display the event table (path and frame count per event).
df

In [None]:
# Largest frame count of any event.
df['number_of_frames'].max()

In [None]:
# Smallest frame count of any event.
df['number_of_frames'].min()

In [None]:
# Average frame count per event.
df['number_of_frames'].mean()

In [None]:
# One 0/1 flag column per class; every event starts with all flags cleared.
for flag_column in ('exposure', 'burial', 'field_joint', 'anode', 'free_span'):
    df[flag_column] = 0

In [None]:
# Derive each event's label from its path and raise the matching flag columns.
#
# The label is the path component between 'Ch2/' and the following '/S'.
# Labels that are not listed below are still recorded in `labels` but set no
# flag.
LABEL_FLAGS = {
    'exp_and': ['exposure', 'anode'],
    'exp': ['exposure'],
    'bur': ['burial'],
    'exp_fs': ['exposure', 'free_span'],
    'exp_fj': ['exposure', 'field_joint'],
}

labels = []
for path in df.event_path:
    m = re.search('Ch2/(.+?)/S', str(path))
    #m = re.search('egs/(.+?)/S', str(path))
    if m is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError(f'cannot extract a label from event path: {path!r}')
    labels.append(m.group(1))

# Set the flags with .loc on the frame itself. Chained assignment such as
# `df.exposure[i] = 1` writes to a temporary Series and leaves `df` untouched
# when pandas Copy-on-Write is active.
label_values = pd.Series(labels, index=df.index)
for label_name, flag_columns in LABEL_FLAGS.items():
    df.loc[(label_values == label_name).to_numpy(), flag_columns] = 1

In [None]:
# Keep the label extracted from each event path as its own column.
df['label'] = labels

In [None]:
# Path plus the five flag columns only (frame count and label left out).
flag_view_columns = ['event_path', 'exposure', 'burial', 'field_joint', 'anode', 'free_span']
df1 = df[flag_view_columns]

In [None]:
# Pie chart of the per-flag totals (column sums of every df1 column after the first).
fig1, ax1 = plt.subplots()
flag_totals = df1.iloc[:, 1:].sum(axis=0)
flag_totals.plot.pie(ax=ax1, autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis("equal")
plt.show()

# events

In [None]:
# Number of events carrying each flag (all df1 columns except the leading path column).
flag_only = df1.iloc[:, 1:]
flag_only.sum(axis=0)

# frames

In [None]:
# Total number of frames per label, summed over all events with that label.
df.groupby('label')['number_of_frames'].sum()

# exclude events with fewer than 50 frames (2 secs) or more than 400 frames

In [None]:
# Shortest event before the length filter is applied.
df['number_of_frames'].min()

In [None]:
# Keep only events whose frame count lies in [50, 400] (both ends included).
length_ok = df['number_of_frames'].between(50, 400)
df = df[length_ok]

In [None]:
# Display the table after the length filter.
df

In [None]:
# Distinct labels that remain after the length filter.
df['label'].unique()

# less exposure: keep only a random 30% of the `exp` events

In [None]:
# Events labelled exactly 'exp'.
is_exp = df['label'] == 'exp'
df_exp = df[is_exp]

In [None]:
# Keep a random 30% of the 'exp' events.
# An explicit random_state makes the subsample (and the CSVs written from it)
# repeatable; without it the draw comes from NumPy's global state, which has
# not been seeded at this point of the notebook. 42 is the value that SEED is
# set to further down.
df_exp = df_exp.sample(frac=0.3, random_state=42)

In [None]:
# Display the subsampled 'exp' events.
df_exp

In [None]:
# Average frame count of the subsampled 'exp' events.
df_exp['number_of_frames'].mean()

In [None]:
# Every event whose label is something other than 'exp'.
not_exp = df['label'] != 'exp'
df_new = df[not_exp]

In [None]:
# Stack the non-'exp' events on top of the subsampled 'exp' events.
# Rows keep the index labels they had in df.
df_final = pd.concat([df_new, df_exp])

In [None]:
# Preview of df_final with a fresh 0..n-1 index. The result is only displayed,
# not assigned, so df_final itself keeps the index labels inherited from df.
df_final.reset_index(drop=True)

In [None]:
# Number of events per label in the rebalanced set.
df_final['label'].value_counts()

# frame sequences

In [None]:
# Total number of frames per label in the rebalanced set.
df_final.groupby('label')['number_of_frames'].sum()

# distribution of labels over events (not over sequences or frames)

In [None]:
# Path plus the five flag columns of the rebalanced set.
flag_view_columns = ['event_path', 'exposure', 'burial', 'field_joint', 'anode', 'free_span']
df1 = df_final[flag_view_columns]

In [None]:
# Pie chart of the per-flag totals (column sums of every df1 column after the first).
fig1, ax1 = plt.subplots()
flag_totals = df1.iloc[:, 1:].sum(axis=0)
flag_totals.plot.pie(ax=ax1, autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis("equal")
plt.show()

In [None]:
# Persist the rebalanced event table (row index not written).
less_exp_csv = '../important_csvs/more_balanced_dataset/events_with_number_of_frames_less_exp.csv'
df_final.to_csv(less_exp_csv, index=False)

In [None]:
# Histogram of frames per event, one panel per label.
# Group by df_final's own label column: taking the labels from the larger,
# pre-subsampling frame `df` only works through implicit index alignment and
# breaks as soon as either frame is re-indexed.
df_final['number_of_frames'].hist(by=df_final['label'])
plt.show()

# stratification

In [None]:
# Continue with the rebalanced set under the name df (plain rebinding; no copy is made here).
df = df_final

In [None]:
# reset_index returns a new frame; assign it so that df really gets a clean
# 0..n-1 index instead of keeping the labels left over from filtering/concat.
df = df.reset_index(drop=True)
df

In [None]:
SEED = 42

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and CUDA), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # NOTE(review): the running interpreter reads PYTHONHASHSEED at start-up,
    # so this presumably only affects processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which
    # defeats the deterministic setting above; it has to be off.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

In [None]:
# 5-fold stratified splitter; no shuffle/random_state is passed, so the
# library defaults apply.
skf = StratifiedKFold(n_splits=5)

In [None]:
# Fold id column, initialised to 0 and filled in by the split loop further down.
df['fold'] = 0

In [None]:
# Events grouped by label; reused for the per-label frame-count statistics.
g = df.groupby('label')

In [None]:
# Per-label frame-count summary: `i` holds the selected quantiles (one column
# per quantile after unstack), `j` the minimum and maximum.
frames_by_label = g['number_of_frames']
quantile_levels = [0.05, 0.25, 0.5, 0.9]
i = frames_by_label.quantile(quantile_levels).unstack()
j = frames_by_label.agg(['min', 'max'])

In [None]:
# Quantiles and min/max side by side, one row per label.
# `axis` must be passed by keyword: pandas 2.0 made every pd.concat argument
# after `objs` keyword-only, so the positional form raises TypeError there.
pd.concat([i, j], axis=1)

In [None]:
# Divisors used when building stratify_group (number_of_frames // value).
# They are matched to labels purely by position via zip(labels, values), where
# labels comes from df.label.unique() -- NOTE(review): confirm that this order
# is the intended label for each value.
values = [400, 400, 99, 103, 276]

In [None]:
# One subplot per label, showing its frame-count quantiles.
quantiles_by_level = i.transpose()
quantiles_by_level.plot(subplots=True)
plt.show()

In [None]:
# Placeholder value; replaced per label by the stratify-group loop below.
df['stratify_group'] = 'yo'

In [None]:
# Distinct labels of df. This rebinds `labels`, which earlier held the
# per-event label list.
labels = df.label.unique()

In [None]:
# Build the stratification key '<label>_<bin>' with bin = number_of_frames // width,
# where each label is paired with the width at the same position in `values`.
if len(labels) > len(values):
    # zip() would silently stop early and leave the surplus labels with the
    # placeholder group.
    raise ValueError(
        f'{len(labels)} labels but only {len(values)} bin widths in `values`'
    )

for label_name, bin_width in zip(labels, values):
    is_label = df.label == label_name
    frame_bins = (df.loc[is_label, 'number_of_frames'] // bin_width).astype(int).astype(str)
    df.loc[is_label, 'stratify_group'] = str(label_name) + '_' + frame_bins

In [None]:
# Display the table including the stratify_group column.
df

In [None]:
# Give every event the number of the fold in which it is a validation sample.
fold_splits = skf.split(X=df['event_path'], y=df['stratify_group'])
for fold_number, (_, val_positions) in enumerate(fold_splits):
    # The splitter yields positions; translate them to index labels for .loc.
    val_labels = df.index[val_positions]
    df.loc[val_labels, 'fold'] = fold_number

In [None]:
# Persist the table with fold and stratify_group columns (row index not written).
stratified_csv = '../important_csvs/more_balanced_dataset/more_balanced_stratified.csv'
df.to_csv(stratified_csv, index=False)

In [None]:
# Number of events per fold.
df['fold'].value_counts()

In [None]:
# Number of events per stratification group.
df['stratify_group'].value_counts()

In [None]:
# Reload the stratified table from the CSV written above.
stratified_csv = '../important_csvs/more_balanced_dataset/more_balanced_stratified.csv'
df = pd.read_csv(stratified_csv)

In [None]:
# Keep a random half of the events for the small set.
# Passing the seed explicitly makes the draw independent of how much global
# NumPy random state earlier cells have consumed.
df = df.sample(frac=0.5, random_state=SEED)

In [None]:
# Renumber the rows 0..n-1; the random sample keeps the original row labels.
df = df.reset_index(drop=True)

In [None]:
# Display the halved table.
df

In [None]:
# Number of events per fold in the halved set.
df['fold'].value_counts()

In [None]:
# Number of events per label in the halved set.
df['label'].value_counts()

In [None]:
# Clear the exposure flag on every event whose label is not exactly 'exp'.
# A single .loc assignment on the frame replaces the row loop with chained
# assignment (`df.exposure[i] = 0`), which does not modify `df` when pandas
# Copy-on-Write is active.
df.loc[df.label != 'exp', 'exposure'] = 0

In [None]:
# Display the table after the exposure flag has been cleared for non-'exp' labels.
df

In [None]:
# Persist the small multi-class set (row index not written).
small_set_csv = '../important_csvs/more_balanced_dataset/small_set_multi_class.csv'
df.to_csv(small_set_csv, index=False)