# Imports

In [1]:
import pandas as pd
import os
import numpy as np

# Load raw data

In [2]:
# Parse the headerless, space-delimited label file into two named columns.
df = pd.read_csv(
    '../data/SCUT_labels.txt',
    sep=' ',
    header=None,
    names=['image', 'label'],
)
df

Unnamed: 0,image,label
0,CF437.jpg,2.883333
1,AM1384.jpg,2.466667
2,AM1234.jpg,2.150000
3,AM1774.jpg,3.750000
4,CF215.jpg,3.033333
...,...,...
5495,AF546.jpg,1.883333
5496,AM558.jpg,2.766667
5497,AF805.jpg,2.450000
5498,AF271.jpg,3.233333


# Keep only labels whose image file exists

In [22]:
# List the cropped-image directory so label rows can be matched against it.
files = os.listdir('SCUT_cropped_images')
len(files)

5500

In [23]:
# Drop label rows with no matching image file. The explicit .copy() makes the
# result an independent frame, so the column assignments in later cells do not
# trigger pandas' SettingWithCopyWarning.
df = df[df['image'].isin(files)].copy()
df

Unnamed: 0,image,label
0,CF437.jpg,2.883333
1,AM1384.jpg,2.466667
2,AM1234.jpg,2.150000
3,AM1774.jpg,3.750000
4,CF215.jpg,3.033333
...,...,...
5495,AF546.jpg,1.883333
5496,AM558.jpg,2.766667
5497,AF805.jpg,2.450000
5498,AF271.jpg,3.233333


# Make gender and race columns

In [24]:
def add_race(row):
    """Return the race label encoded by the first character of ``row['image']``.

    A leading 'A' maps to 'asian' and a leading 'C' to 'caucasian'; any other
    filename maps to 'other'.
    """
    filename = row['image']
    for prefix, race in (('A', 'asian'), ('C', 'caucasian')):
        if filename.startswith(prefix):
            return race
    return 'other'
    
# Derive race from the filename prefix. assign() builds a new frame rather than
# writing into what may be a filtered slice of the raw data.
df = df.assign(race=df.apply(add_race, axis=1))
# Require exactly the two expected categories: a bare count of 2 would also
# accept a combination such as {'asian', 'other'}.
assert set(df['race'].unique()) == {'asian', 'caucasian'}, (
    f"unexpected race values: {sorted(df['race'].unique())}"
)
df

Unnamed: 0,image,label,race
0,CF437.jpg,2.883333,caucasian
1,AM1384.jpg,2.466667,asian
2,AM1234.jpg,2.150000,asian
3,AM1774.jpg,3.750000,asian
4,CF215.jpg,3.033333,caucasian
...,...,...,...
5495,AF546.jpg,1.883333,asian
5496,AM558.jpg,2.766667,asian
5497,AF805.jpg,2.450000,asian
5498,AF271.jpg,3.233333,asian


In [25]:
def add_gender(row):
    """Return the gender label encoded by the second character of ``row['image']``.

    'M' maps to 'male' and 'F' to 'female'. Anything else, including a name
    shorter than two characters, maps to 'other'.
    """
    # Slicing instead of indexing yields '' for short names, so they fall
    # through to 'other' rather than raising IndexError.
    code = row['image'][1:2]
    if code == 'M':
        return 'male'
    elif code == 'F':
        return 'female'
    else:
        return 'other'
    
# Derive gender from the filename's second character. assign() builds a new
# frame rather than writing into what may be a filtered slice of the raw data.
df = df.assign(gender=df.apply(add_gender, axis=1))
# Require exactly the two expected categories: a bare count of 2 would also
# accept a combination such as {'male', 'other'}.
assert set(df['gender'].unique()) == {'male', 'female'}, (
    f"unexpected gender values: {sorted(df['gender'].unique())}"
)
df

Unnamed: 0,image,label,race,gender
0,CF437.jpg,2.883333,caucasian,female
1,AM1384.jpg,2.466667,asian,male
2,AM1234.jpg,2.150000,asian,male
3,AM1774.jpg,3.750000,asian,male
4,CF215.jpg,3.033333,caucasian,female
...,...,...,...,...
5495,AF546.jpg,1.883333,asian,female
5496,AM558.jpg,2.766667,asian,male
5497,AF805.jpg,2.450000,asian,female
5498,AF271.jpg,3.233333,asian,female


# Scale label to 0-1

In [26]:
# Min-max scale: shift so the minimum is 0, then divide by the shifted maximum.
# assign() returns a new frame instead of writing into a possibly filtered slice.
df = df.assign(label=lambda frame: frame['label'] - frame['label'].min())
df = df.assign(label=lambda frame: frame['label'] / frame['label'].max())
# Constant or missing labels would leave NaN above; fail loudly instead of
# saving an unusable column.
assert df['label'].between(0, 1).all(), 'label scaling produced values outside [0, 1]'
df

Unnamed: 0,image,label,race,gender
0,CF437.jpg,0.500000,caucasian,female
1,AM1384.jpg,0.388393,asian,male
2,AM1234.jpg,0.303571,asian,male
3,AM1774.jpg,0.732143,asian,male
4,CF215.jpg,0.540178,caucasian,female
...,...,...,...,...
5495,AF546.jpg,0.232143,asian,female
5496,AM558.jpg,0.468750,asian,male
5497,AF805.jpg,0.383929,asian,female
5498,AF271.jpg,0.593750,asian,female


In [27]:
# Summary statistics as a sanity check: after scaling, label min and max should read 0 and 1.
df.describe()

Unnamed: 0,label
count,5500.0
mean,0.52881
std,0.184316
min,0.0
25%,0.397321
50%,0.486607
75%,0.674107
max,1.0


# Save the cleaned data

In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv('../data/SCUT.csv', index=False)