# Table of Contents
- [Importing Libraries](#01)
- [Importing Data](#02)
- [Data Visualization](#03)
    * [Age Histogram](#3.1)
    * [Ethnicity Count Plot](#3.2)
    * [Gender Count Plot](#3.3)
    * [Plotting Images](#3.4)
- [Data Generation](#04)
    * [Calculating Data Stats](#4.1)
    * [Train Test Split](#4.2)


**Ethnicity Labels**

ETHNICITIES = { 0: "White", 1: "Black", 2: "Asian", 3: "Indian", 4: "Hispanic" }

**Gender Labels**

GENDERS = { 0: "Male", 1: "Female" }

# Importing Libraries <a id="01"></a>

In [None]:
# Install torchsummary; the imports cell below pulls `summary` from it.
!pip install torchsummary

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary

from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Image processing
from PIL import Image
import cv2

# Importing Data <a id="02"></a>

In [None]:
# Load the face dataset CSV into a DataFrame.
df = pd.read_csv('../input/age-gender-and-ethnicity-face-data-csv/age_gender.csv')
# Preview the first rows.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# DataFrame.shape is (rows, columns): index 0 counts the data points and
# index 1 counts the columns/features.
print(f'Total Data Points: {df.shape[0]}')
print(f'Total columns/Features: {df.shape[1]}')

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
type(df.pixels[0]) # pixels are stored as strings, so they must be converted to arrays before use

In [None]:
## Converting pixels into numpy array
df['pixels'] = df['pixels'].apply(lambda x:  np.reshape(np.array(x.split(), dtype="float32"), (48,48)))
df.head()

In [None]:
type(df.pixels[0]) # re-check the entry type after the conversion above

# Data Visualization <a id="03"></a>

## Age Histogram <a id="3.1"></a>

In [None]:
# Histogram of the `age` column; update_layout returns the same figure,
# so the title can be set in the same expression.
fig = px.histogram(df, x="age").update_layout(title_text='Age Histogram')
fig.show()

## Ethnicity Count Plot<a id="3.2"></a>

In [None]:
# Integer class code -> display label, used for plot axes and image titles.
# The code is the position of the label in each tuple.
eth_values_to_labels = dict(enumerate(("White", "Black", "Asian", "Indian", "Hispanic")))
gender_values_to_labels = dict(enumerate(("Male", "Female")))

In [None]:
# Number of rows per ethnicity code.
df.ethnicity.value_counts()

In [None]:
# Bar chart of rows per ethnicity, with codes translated to readable labels.
eth_counts = df.ethnicity.value_counts()
eth_bar = go.Bar(
    x=[eth_values_to_labels[code] for code in eth_counts.index],
    y=eth_counts.values,
)
fig = go.Figure([eth_bar])
fig.update_layout(title_text='Count Plot Ethnicity', xaxis_title='Ethnicity', yaxis_title='Count')
fig.show()

## Gender Count Plot <a id="3.3"></a>

In [None]:
# Number of rows per gender code.
df.gender.value_counts()

In [None]:
# Bar chart of rows per gender, with codes translated to readable labels.
gender_counts = df.gender.value_counts()
gender_bar = go.Bar(
    x=[gender_values_to_labels[code] for code in gender_counts.index],
    y=gender_counts.values,
)
fig = go.Figure([gender_bar])
fig.update_layout(title_text='Count Plot Gender', xaxis_title='Gender', yaxis_title='Count')
fig.show()

## Plotting Images <a id="3.4"></a>

In [None]:
def plot_data(rows, cols, lower_value, upper_value):
    """Show a ``rows`` x ``cols`` grid of randomly chosen images from ``df``.

    One index is drawn per grid cell with
    ``np.random.randint(lower_value, upper_value)``, so the upper bound is
    exclusive and the same image may appear more than once. Each image is
    titled with the gender, age and ethnicity of its row.

    Args:
        rows: number of grid rows.
        cols: number of grid columns.
        lower_value: smallest index that may be drawn.
        upper_value: exclusive upper bound for drawn indices.
    """
    canvas = plt.figure(figsize=(cols * 3, rows * 4))
    n_cells = rows * cols

    for cell in range(1, n_cells + 1):
        idx = np.random.randint(lower_value, upper_value)
        ax = canvas.add_subplot(rows, cols, cell)

        # Translate the stored class codes into readable labels.
        gender = gender_values_to_labels[df.gender[idx]]
        ethnicity = eth_values_to_labels[df.ethnicity[idx]]
        age = df.age[idx]

        ax.imshow(df.pixels[idx], cmap='gray')
        ax.axis('off')
        ax.set_title(f'Gender:{gender}\nAge:{age}\nEthnicity:{ethnicity}')

    plt.tight_layout()
    plt.show()

In [None]:
# 6x7 grid sampled from the whole dataset.
plot_data(rows=6, cols=7, lower_value=0, upper_value=len(df))

In [None]:
# One row of 7 images sampled from indices 0..999.
plot_data(rows=1, cols=7, lower_value=0, upper_value=1000)

In [None]:
# 2x7 grid sampled from the last 2000 indices.
plot_data(rows=2, cols=7, lower_value=len(df)-2000, upper_value=len(df))

In [None]:
# 2x7 grid sampled from the 2000 indices just below the midpoint of the dataset.
plot_data(rows=2, cols=7, lower_value=(len(df)-4000)//2, upper_value=len(df)//2)

# Data Generation <a id="04"></a>

## Calculating Data Stats <a id="4.1"></a>

In [None]:
# Dataset-wide pixel mean / variance / std, computed from running sums of
# x and x^2 over every image.
#
# The images are float32. Depending on NumPy's scalar promotion rules the
# running totals can stay float32 too, which is not precise enough for sums
# over the whole dataset; the E[x^2] - E[x]^2 formula then amplifies that
# error. Every sum is therefore taken explicitly in float64.
psum, psum_sq = 0.0, 0.0
# pixel count
image_size = 48
count = len(df) * image_size * image_size

# loop through images
for img in df.pixels:
    pixels = np.asarray(img, dtype=np.float64)
    psum += pixels.sum()
    psum_sq += np.square(pixels).sum()

# mean, var and std
total_mean = psum / count
total_var  = (psum_sq / count) - (total_mean ** 2)
total_std  = np.sqrt(total_var)

# output
print('[Dataset]')
print(f'- mean: {total_mean}')
print(f'- std: {total_std}')
print(f'- var: {total_var}')

In [None]:
class get_data(Dataset):
    """Dataset yielding ``(image, age, ethnicity, gender)`` tensors per row.

    Args:
        df: DataFrame with ``age``, ``ethnicity``, ``gender`` and ``pixels``
            columns. Each ``pixels`` entry is handed to
            ``transforms.ToTensor`` (assumed to be the 2-D float32 arrays
            produced earlier in this notebook).
        mean: mean passed to ``transforms.Normalize``. Defaults to the value
            that was previously hard-coded here.
        std: standard deviation passed to ``transforms.Normalize``. Defaults
            to the value that was previously hard-coded here.
    """

    def __init__(self, df, mean=125.01632431478356, std=59.44005080507268):
        self.df = df
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, i):
        """Return ``(image, age, ethnicity, gender)`` for the i-th row.

        Rows are read from ``self.df`` by position. A DataLoader asks for
        indices ``0..len(self) - 1``, while a frame produced by a train/test
        split keeps its original, shuffled index labels; label-based lookup
        (or lookup in the global frame) would return rows outside the split.
        """
        frame = self.df
        age = frame['age'].iloc[i]
        eth = frame['ethnicity'].iloc[i]
        gender = frame['gender'].iloc[i]

        im = self.transform(frame['pixels'].iloc[i])

        age = torch.tensor(age)
        eth = torch.tensor(eth)
        gender = torch.tensor(gender)

        return im, age, eth, gender

## Train Test Split <a id="4.2"></a>

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible.
train, test = train_test_split(df, random_state=129, test_size=0.2)

n_train = len(train)
n_test = len(test)
print(f'- Number of Datapoints in Training Set: {n_train}')
print(f'- Number of Datapoints in Test Set: {n_test}')

In [None]:
SEED = 1

# Detect whether a CUDA device can be used.
use_cuda = torch.cuda.is_available()
print("CUDA Available:", use_cuda)

# Seed the random number generators for reproducibility.
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed(SEED)

# Larger batches when CUDA is available, smaller ones otherwise.
BATCH_SIZE = 64 if use_cuda else 32

print('BATCH_SIZE:', BATCH_SIZE)

In [None]:
# Extra DataLoader options are only passed when CUDA is in use.
kwargs = {}
if use_cuda:
    kwargs = {'num_workers': 2, 'pin_memory': True}

# The training loader is shuffled; the test loader keeps its order.
train_loader = DataLoader(
    get_data(train),
    batch_size=BATCH_SIZE,
    shuffle=True,
    **kwargs,
)
test_loader = DataLoader(
    get_data(test),
    batch_size=BATCH_SIZE,
    shuffle=False,
    **kwargs,
)