In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
from skimage.util import montage
import pandas as pd
from torch import optim
import re

from utils import *

In [2]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [3]:
im_count = df[df.Id != 'new_whale'].Id.value_counts()

In [4]:
im_count.name = 'sighting_count'

In [5]:
df.describe()

Unnamed: 0,Image,Id
count,25361,25361
unique,25361,5005
top,4cbf6162f.jpg,new_whale
freq,1,9664


In [6]:
df = df.join(im_count, on='Id')

In [7]:
df.describe()

Unnamed: 0,sighting_count
count,15697.0
mean,9.910875
std,12.876116
min,1.0
25%,2.0
50%,5.0
75%,12.0
max,73.0


In [9]:
df.head()

Unnamed: 0,Image,Id,sighting_count
0,0000e88ab.jpg,w_f48451c,14.0
1,0001f9222.jpg,w_c3d896a,4.0
2,00029d126.jpg,w_20df2c5,4.0
3,00050a15a.jpg,new_whale,
4,0005c1ef8.jpg,new_whale,


In [28]:
val_fns = set(df.sample(frac=1)[(df.Id != 'new_whale') & (df.sighting_count > 1)].groupby('Id').first().Image)

  """Entry point for launching an IPython kernel.


In [29]:
len(val_fns)

2931

In [31]:
pd.to_pickle(val_fns, '../data/val_fns.pkl')
val_fns = pd.read_pickle('../data/val_fns.pkl')

In [32]:
df

Unnamed: 0,Image,Id,sighting_count
0,0000e88ab.jpg,w_f48451c,14.0
1,0001f9222.jpg,w_c3d896a,4.0
2,00029d126.jpg,w_20df2c5,4.0
3,00050a15a.jpg,new_whale,
4,0005c1ef8.jpg,new_whale,
5,0006e997e.jpg,new_whale,
6,000a6daec.jpg,w_dd88965,16.0
7,000f0f2bf.jpg,new_whale,
8,0016b897a.jpg,w_64404ac,5.0
9,001c1ac5f.jpg,w_a6f9d33,2.0


In [35]:
fn2label = {row[1].Image: row[1].Id for row in df.iterrows()}

In [36]:
SZ = 224
BS = 64
NUM_WORKERS = 0
SEED=0

In [37]:
path2fn = lambda path: re.search('\w*\.jpg$', path).group(0)

In [38]:
df = df[df.Id != 'new_whale']

In [39]:
df.shape

(15697, 3)

In [40]:
df.sighting_count.max()

73.0

In [41]:
df_val = df[df.Image.isin(val_fns)]
df_train = df[~df.Image.isin(val_fns)]
df_train_with_val = df

In [42]:
df_val.shape, df_train.shape, df_train_with_val.shape

((2931, 3), (12766, 3), (15697, 3))

In [44]:
df_train.groupby('Id').head()

Unnamed: 0,Image,Id,sighting_count
0,0000e88ab.jpg,w_f48451c,14.0
1,0001f9222.jpg,w_c3d896a,4.0
6,000a6daec.jpg,w_dd88965,16.0
8,0016b897a.jpg,w_64404ac,5.0
9,001c1ac5f.jpg,w_a6f9d33,2.0
16,00355ff28.jpg,w_cb622a2,5.0
21,00442c882.jpg,w_8cad422,6.0
23,004775679.jpg,w_13ae3d4,3.0
26,004e8ad5b.jpg,w_3de579a,54.0
27,004f87702.jpg,w_1d0830e,11.0


In [49]:
for grp in df_train.groupby('Id'):
    print(grp)
    print(grp[1].)
    break

('w_0003639',                Image         Id  sighting_count
13008  833675975.jpg  w_0003639             1.0)
               Image         Id  sighting_count
13008  833675975.jpg  w_0003639             1.0


In [14]:
%%time

res = None
sample_to = 15

for grp in df_train.groupby('Id'):
    n = grp[1].shape[0]
    additional_rows = grp[1].sample(0 if sample_to < n  else sample_to - n, replace=True)
    rows = pd.concat((grp[1], additional_rows))
    
    if res is None: res = rows
    else: res = pd.concat((res, rows))

CPU times: user 18.6 s, sys: 14.1 ms, total: 18.6 s
Wall time: 18.6 s


In [15]:
%%time

res_with_val = None
sample_to = 15

for grp in df_train_with_val.groupby('Id'):
    n = grp[1].shape[0]
    additional_rows = grp[1].sample(0 if sample_to < n  else sample_to - n, replace=True)
    rows = pd.concat((grp[1], additional_rows))
    
    if res_with_val is None: res_with_val = rows
    else: res_with_val = pd.concat((res_with_val, rows))

CPU times: user 18.9 s, sys: 15.6 ms, total: 18.9 s
Wall time: 18.9 s


In [16]:
res.shape, res_with_val.shape

((76174, 3), (76287, 3))

In [17]:
pd.concat((res, df_val))[['Image', 'Id']].to_csv('../data/oversampled_train.csv', index=False)
res_with_val[['Image', 'Id']].to_csv('../data/oversampled_train_and_val.csv', index=False)