## Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
from PIL import Image, ImageFilter
import os
import cv2

## Building the Data Frame

In [2]:
# Load the image metadata table and preview its first rows.
df = pd.read_csv(filepath_or_buffer="all_data_info.csv")
df.head(n=5)

Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename
0,Barnett Newman,1955.0,abstract,15530.0,6911.0,9201912.0,wikiart,Color Field Painting,Uriel,train_only,True,102257.jpg
1,Barnett Newman,1950.0,abstract,14559.0,6866.0,8867532.0,wikiart,Color Field Painting,Vir Heroicus Sublimis,train_only,True,75232.jpg
2,kiri nichol,2013.0,,9003.0,9004.0,1756681.0,,Neoplasticism,,test_only,False,32145.jpg
3,kiri nichol,2013.0,,9003.0,9004.0,1942046.0,,Neoplasticism,,test_only,False,20304.jpg
4,kiri nichol,2013.0,,9003.0,9004.0,1526212.0,,Neoplasticism,,test_only,False,836.jpg


Below we check what we're working with: how many images belong to each style and to each genre. We are interested in generating portraits in closely related styles, which will hopefully lead to StyleGAN producing more coherent results.

In [3]:
# Number of images per style, as a plain {style: count} mapping.
df["style"].value_counts().to_dict()

{'Impressionism': 10643,
 'Realism': 10523,
 'Romanticism': 9285,
 'Expressionism': 7013,
 'Post-Impressionism': 5778,
 'Art Nouveau (Modern)': 4899,
 'Baroque': 4400,
 'Surrealism': 4167,
 'Symbolism': 3476,
 'Rococo': 2733,
 'Northern Renaissance': 2379,
 'Naïve Art (Primitivism)': 2295,
 'Abstract Expressionism': 2074,
 'Neoclassicism': 2038,
 'Cubism': 1747,
 'Ukiyo-e': 1426,
 'Early Renaissance': 1351,
 'Mannerism (Late Renaissance)': 1342,
 'High Renaissance': 1314,
 'Art Informel': 1267,
 'Magic Realism': 1002,
 'Abstract Art': 979,
 'Academicism': 972,
 'Color Field Painting': 910,
 'Pop Art': 791,
 'Fauvism': 731,
 'Lyrical Abstraction': 670,
 'Art Deco': 644,
 'Concretism': 581,
 'Neo-Romanticism': 566,
 'Ink and wash painting': 545,
 'Op Art': 528,
 'Pointillism': 501,
 'Minimalism': 460,
 'Tachisme': 436,
 'Naturalism': 435,
 'Neo-Expressionism': 420,
 'Orientalism': 392,
 'Luminism': 385,
 'Shin-hanga': 380,
 'Hard Edge Painting': 372,
 'Sōsaku hanga': 369,
 'Divisionism':

In [4]:
# Number of images per genre, most frequent first.
df.loc[:, "genre"].value_counts()

portrait                    16847
landscape                   15006
genre painting              14260
abstract                     9498
religious painting           7429
cityscape                    5348
sketch and study             3644
illustration                 3202
still life                   3132
symbolic painting            2545
nude painting (nu)           2290
figurative                   2244
design                       2024
mythological painting        1910
marina                       1805
flower painting              1606
animal painting              1571
self-portrait                1531
allegorical painting         1034
history painting              879
interior                      670
literary painting             558
battle painting               358
wildlife painting             327
poster                        286
capriccio                     236
veduta                        233
caricature                    231
cloudscape                    208
tessellation  

Here we calculate the approximate final size of our dataset by combining boolean masks with logical operators.

In [5]:
# Boolean mask: True for every row whose genre is exactly "portrait".
df_is_portrait = df["genre"].eq("portrait")
df_is_portrait.value_counts()

False    86403
True     16847
Name: genre, dtype: int64

In [6]:
# Boolean mask: True for rows whose style is one of the three related styles.
# `isin` expresses the membership test directly instead of relying on
# arithmetic `+` between boolean Series acting as a logical OR.
df_is_style = df["style"].isin(["Post-Impressionism", "Impressionism", "Expressionism"])
df_is_style.value_counts()

False    79816
True     23434
Name: style, dtype: int64

In [7]:
# Final selection mask: a row must be a portrait AND belong to one of the
# chosen styles. `&` is the element-wise logical AND for boolean Series;
# it replaces the arithmetic `*` that only worked as AND by coincidence of
# boolean multiplication.
fin_crit = df_is_portrait & df_is_style
fin_crit.value_counts()

False    99257
True      3993
dtype: int64

The data frame below is the final dataset that we've put together. It contains the data points that correspond to portraits in the styles of Impressionism, Post-Impressionism and Expressionism, which have similar characteristics.

In [8]:
# Keep only the rows selected by `fin_crit` and renumber them from 0.
# NOTE: this rebinds `df` to the filtered frame, while `fin_crit` was built
# from the unfiltered one - re-run the notebook from the CSV-loading cell
# rather than executing this cell a second time on its own.
df = df.loc[fin_crit].reset_index(drop=True)
df.head()

Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename
0,Valentin Serov,1887.0,portrait,3555.0,4000.0,7852273.0,wikiart,Impressionism,Girl with Peaches,train_and_test,True,5241.jpg
1,Ferdinand Hodler,1884,portrait,2964.0,4536.0,8758806.0,wikiart,Impressionism,Seated bearded man,train_and_test,True,99956.jpg
2,Edouard Vuillard,c.1911,portrait,2874.0,4320.0,6888032.0,wikiart,Post-Impressionism,Half-figure of a seated woman,train_and_test,True,5694.jpg
3,Giuseppe de Nittis,,portrait,3048.0,3872.0,1898410.0,wikiart,Impressionism,A Lady from Naples,train_only,True,54668.jpg
4,Ernst Ludwig Kirchner,1913,portrait,2952.0,3996.0,3896710.0,wikiart,Expressionism,Erna Japanschirm,train_and_test,True,60691.jpg


## Building the Dataset

The $\verb|make_square|$ function takes three arguments - the name of the image folder, the name of the image, and the name of the folder in which the result is saved. It reads the image from the original WikiArt dataset and makes it square by mirroring it outwards on two opposite sides (StyleGAN requires all input images to be the same size). I found this approach to be the best, since it looks more natural and aesthetically pleasing than having two empty stripes on either side or distorting the image to fit a square.

In [9]:
def make_square(image_folder, image_name, save_folder, size=512):
    """Pad an image to a square by mirroring its edges, then resize and save it.

    The image is extended along its shorter axis with a reflected copy of its
    own border (``cv2.BORDER_REFLECT``) until width and height are equal. The
    padding is split between the two opposite sides; when the difference is
    odd, the extra pixel goes to the right/bottom side. The squared image is
    then resized to ``size`` x ``size`` and written to ``save_folder`` under
    the same file name.

    Parameters
    ----------
    image_folder : str
        Folder that contains the source image.
    image_name : str
        File name of the source image; also used as the output file name.
    save_folder : str
        Folder the processed image is written to. It is created if missing.
    size : int, optional
        Edge length in pixels of the saved square image (default 512).

    Raises
    ------
    Any exception raised by Pillow, OpenCV or the file system while reading,
    padding or writing the image is propagated to the caller.
    """
    source_path = os.path.join(image_folder, image_name)

    # The context manager releases the file handle as soon as the pixels
    # have been copied into a NumPy array.
    with Image.open(source_path) as img:
        pixels = np.array(img.convert("RGB"))

    height, width = pixels.shape[:2]

    # Total number of pixels missing on the shorter axis, split over two sides.
    missing = abs(height - width)
    first_side = missing // 2
    second_side = missing - first_side

    if height > width:
        # Vertical image: widen it by reflecting on the left and right.
        top, bottom, left, right = 0, 0, first_side, second_side
    else:
        # Horizontal (or already square) image: extend it at top and bottom.
        top, bottom, left, right = first_side, second_side, 0, 0

    # copyMakeBorder only mirrors pixels and does not interpret the channels,
    # so the array can stay in RGB order - no BGR round trip is needed.
    squared = cv2.copyMakeBorder(pixels, top, bottom, left, right, cv2.BORDER_REFLECT)

    # Make sure the destination exists so that saving does not fail on a
    # fresh checkout.
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)

    Image.fromarray(squared).resize((size, size)).save(os.path.join(save_folder, image_name))

Now we apply $\verb|make_square|$ on each data-point and save each image separated by style.

In [11]:
folder_images = "train"
# (file name, error description) for every image that could not be processed,
# kept so the cause of each failure can be inspected after the loop.
failed_images = []
for filename, style in zip(df["new_filename"], df["style"]):
    try:
        # Each image is saved into a sub-folder named after its style.
        make_square(folder_images, filename, os.path.join("Portraits_dataset", style))
    except Exception as exc:
        # Catch `Exception` rather than everything, so that interrupting the
        # kernel still stops the loop. The original note here says some images
        # might be too large or too small to process; the actual reason is
        # recorded in `failed_images`.
        failed_images.append((filename, repr(exc)))
        print(filename)

95010.jpg
9989.jpg
