Remove pandas.append to resolve issue #102 (#104)

Merged 2 commits on Apr 29, 2023. Changes from all commits are shown below.
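
For context on the change itself: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so each df.append(...) call in the splitter is replaced with pd.concat. A minimal sketch of the replacement pattern (illustrative column names, not code taken from this PR):

import pandas as pd

df_a = pd.DataFrame({"img_filename": ["a.jpg"], "split": ["train"]})
df_b = pd.DataFrame({"img_filename": ["b.jpg"], "split": ["test"]})

# Before (removed in pandas 2.0): combined = df_a.append(df_b, ignore_index=True)
combined = pd.concat([df_a, df_b], ignore_index=True)
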
194 changes: 120 additions & 74 deletions pylabel/splitter.py
@@ -3,150 +3,196 @@ (shown below as the resulting code)
from sklearn.model_selection import GroupShuffleSplit as sklearnGroupShuffleSplit
from pylabel.shared import schema

class Split:
    def __init__(self, dataset=None):
        self.dataset = dataset

    def UnSplit(self):
        """Unsplit the dataset by setting all values of the split column to null."""
        self.dataset.df["split"] = np.nan

    def GroupShuffleSplit(
        self,
        train_pct=0.5,
        test_pct=0.25,
        val_pct=0.25,
        group_col="img_filename",
        random_state=None,
    ):
        """
        This function uses the GroupShuffleSplit command from sklearn. It can split into 3 groups (train,
        test, and val) by applying the command twice.
        """
        df_main = self.dataset.df
        gss = sklearnGroupShuffleSplit(n_splits=1, train_size=train_pct)
        train_indexes, test_indexes = next(
            gss.split(X=df_main, y=df_main[group_col], groups=df_main.index.values)
        )

        df_main.loc[train_indexes, "split"] = "train"
        df_main.loc[test_indexes, "split"] = "test"
        self.dataset.df = df_main

        if val_pct:
            df_train = df_main.loc[df_main["split"] == "train"]
            df_test = df_main.loc[df_main["split"] == "test"]
            df_test = df_test.reset_index()
            second_split_pct = float(test_pct / (test_pct + val_pct))
            gss2 = sklearnGroupShuffleSplit(n_splits=1, train_size=second_split_pct)
            test_indexes_2, val_indexes_2 = next(
                gss2.split(X=df_test, y=df_test[group_col], groups=df_test.index.values)
            )
            df_test.loc[test_indexes_2, "split"] = "test"
            df_test.loc[val_indexes_2, "split"] = "val"
            self.dataset.df = pd.concat([df_train, df_test])
            self.dataset.df = self.dataset.df.reset_index(drop=True)
            self.dataset.df = self.dataset.df[schema]
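
    # Usage sketch for GroupShuffleSplit (illustration only, not part of splitter.py;
    # assumes the Dataset object exposes this class as dataset.splitter, as in the
    # pylabel sample notebooks, and uses a hypothetical annotation path):
    #
    #     from pylabel import importer
    #     dataset = importer.ImportCoco("annotations/coco_sample.json")
    #     dataset.splitter.GroupShuffleSplit(train_pct=0.5, test_pct=0.25, val_pct=0.25)
    #     print(dataset.df["split"].value_counts())
    #
    # With these defaults the second pass splits the held-out rows with
    # second_split_pct = 0.25 / (0.25 + 0.25) = 0.5, giving equal test and val sets.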

    # Written with the help of https://stackoverflow.com/questions/56872664/complex-dataset-split-stratifiedgroupshufflesplit
    def StratifiedGroupShuffleSplit(
        self,
        train_pct=0.7,
        test_pct=0.3,
        val_pct=0.0,
        weight=0.01,
        group_col="img_filename",
        cat_col="cat_name",
        batch_size=1,
    ):
        """
        This function will 'split' the dataframe by setting the split column equal to
        train, test, or val. When a split dataset is exported, the annotations will be split into
        separate groups so they can be used in model training, testing, and validation.
        Groups of rows (one group per image) are assigned greedily, in batches, to whichever
        split most needs them, judged by a weighted blend of class balance and size shortfall.
        """
        df_main = self.dataset.df
        df_main = df_main.reindex(
            np.random.permutation(df_main.index)
        )  # shuffle dataset

        # create empty train, val and test datasets
        df_train = pd.DataFrame()
        df_val = pd.DataFrame()
        df_test = pd.DataFrame()

        subject_grouped_df_main = df_main.groupby(
            [group_col], sort=False, as_index=False
        )
        category_grouped_df_main = (
            df_main.groupby(cat_col).count()[[group_col]] / len(df_main) * 100
        )

        # Check inputs
        assert 0 <= weight <= 1, "Weight must be between 0 and 1"
        total_splits = round(float(train_pct) + float(test_pct) + float(val_pct), 1)
        assert (
            total_splits == 1
        ), "Sum of train_pct, test_pct, and val_pct must equal 1."
        assert (
            batch_size >= 1 and batch_size <= subject_grouped_df_main.ngroups / 10
        ), "Batch must be greater than 1 and less than 1/10 count of groups"

        def calc_mse_loss(df):
            # Mean squared difference between df's per-class percentages and the
            # full dataset's; lower means df is a more representative sample.
            grouped_df = df.groupby(cat_col).count()[[group_col]] / len(df) * 100
            df_temp = category_grouped_df_main.join(
                grouped_df, on=cat_col, how="left", lsuffix="_main"
            )
            df_temp.fillna(0, inplace=True)
            df_temp["diff"] = (df_temp["img_filename_main"] - df_temp[group_col]) ** 2
            mse_loss = np.mean(df_temp["diff"])
            return mse_loss

        i = 0  # counter for all items in dataset
        b = 0  # counter for the batches
        batch_df = df_main[0:0]

        for _, group in subject_grouped_df_main:
            if i < 3:
                # Seed each split with one group so the loss terms are defined
                if i == 0:
                    df_train = pd.concat(
                        [df_train, pd.DataFrame(group)], ignore_index=True
                    )
                    i += 1
                    continue
                elif i == 1:
                    df_val = pd.concat([df_val, pd.DataFrame(group)], ignore_index=True)
                    i += 1
                    continue
                else:
                    df_test = pd.concat(
                        [df_test, pd.DataFrame(group)], ignore_index=True
                    )
                    i += 1
                    continue

            # Add groups to the current batch
            batch_df = pd.concat([batch_df, group])
            b += 1
            if b < batch_size and i < subject_grouped_df_main.ngroups - 3:
                i += 1
                continue

            # How much adding the batch would improve each split's class balance
            mse_loss_diff_train = calc_mse_loss(df_train) - calc_mse_loss(
                pd.concat([df_train, batch_df], ignore_index=True)
            )
            mse_loss_diff_val = calc_mse_loss(df_val) - calc_mse_loss(
                pd.concat([df_val, batch_df], ignore_index=True)
            )
            mse_loss_diff_test = calc_mse_loss(df_test) - calc_mse_loss(
                pd.concat([df_test, batch_df], ignore_index=True)
            )

            total_records = len(df_train) + len(df_val) + len(df_test)

            # How far below its target share each split currently is
            len_diff_train = train_pct - (len(df_train) / total_records)
            len_diff_val = val_pct - (len(df_val) / total_records)
            len_diff_test = test_pct - (len(df_test) / total_records)

            len_loss_diff_train = len_diff_train * abs(len_diff_train)
            len_loss_diff_val = len_diff_val * abs(len_diff_val)
            len_loss_diff_test = len_diff_test * abs(len_diff_test)

            # Blend the two signals; weight=0 balances sizes only, weight=1 balances classes only
            loss_train = (weight * mse_loss_diff_train) + (
                (1 - weight) * len_loss_diff_train
            )
            loss_val = (weight * mse_loss_diff_val) + ((1 - weight) * len_loss_diff_val)
            loss_test = (weight * mse_loss_diff_test) + (
                (1 - weight) * len_loss_diff_test
            )

            # Assign the batch to the split that needs it most
            if max(loss_train, loss_val, loss_test) == loss_train:
                df_train = pd.concat([df_train, batch_df], ignore_index=True)
            elif max(loss_train, loss_val, loss_test) == loss_val:
                df_val = pd.concat([df_val, batch_df], ignore_index=True)
            else:
                df_test = pd.concat([df_test, batch_df], ignore_index=True)

            # print("Group " + str(i) + ". loss_train: " + str(loss_train) + " | loss_val: " + str(loss_val) + " | loss_test: " + str(loss_test))
            i += 1
            # Reset the batch
            b = 0
            batch_df = df_main[0:0]


        ######
        # Final prep tasks before returning the split dataframe

        # Sometimes the algo will put some rows in the val set even if the split percent was set to zero
        # In those cases move the rows from val to test
        if round(val_pct, 1) == round(0, 1):
            df_test = pd.concat([df_test, df_val])
            df_val = df_val[0:0]  # remove the rows that were moved to test

        # Apply train, test, val labels to the split column
        df_train["split"] = "train"
        df_test["split"] = "test"
        df_val["split"] = "val"

        df = pd.concat([df_train, pd.concat([df_test, df_val])])

        assert (
            df.shape == df_main.shape
        ), "Output shape does not match input shape. Data loss has occurred."

        self.dataset.df = df
        self.dataset.df = self.dataset.df.reset_index(drop=True)
        self.dataset.df = self.dataset.df[schema]
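
A usage sketch for the stratified splitter, continuing the hypothetical example in the comments above (dataset created with pylabel's importer and exposing this class as dataset.splitter; both names are assumptions, not confirmed by this PR). The weight argument blends the two loss terms: values near 0 prioritize hitting the requested split sizes, values near 1 prioritize matching the overall class distribution.

dataset.splitter.StratifiedGroupShuffleSplit(
    train_pct=0.7, test_pct=0.3, val_pct=0.0, weight=0.01
)
# Compare per-split class distributions against the full dataset
print(dataset.df.groupby("split")["cat_name"].value_counts(normalize=True))
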
1 change: 0 additions & 1 deletion samples
Submodule samples deleted from 63b4f7
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@ (shown below as the resulting code; version bumped from 0.1.48 to 0.1.49)
setup(
    name="pylabel",
    packages=["pylabel"],
    version="0.1.49",
    description="Transform, analyze, and visualize computer vision annotations.",
    long_description=long_description,
    long_description_content_type="text/markdown",
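
Assuming this release is published to PyPI like earlier versions (not stated in the PR), a quick way to confirm the append-free build is installed after upgrading:

from importlib.metadata import version

print(version("pylabel"))  # expect "0.1.49" or later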