**Note** :

- For demonstration purposes, the code snippets in this and the other notebooks show the process using only the "Impressionism" style
- The same methodologies were applied to all 15 styles in the project

# Web Scraping

## Collecting URLs 

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
import time
import re
from pandas import Series, DataFrame
import pandas as pd
from urllib.request import urlopen
from selenium.webdriver.chrome.options import Options

# Every Selenium error derives from WebDriverException, so catching it covers
# "element not found", "click intercepted", "stale element", etc. without
# also swallowing KeyboardInterrupt or programming errors.
from selenium.common.exceptions import WebDriverException

# Configure Chrome so that site notification prompts are blocked.
# Chrome content-setting values: 1 = allow, 2 = block. The previous value of 1
# allowed notifications, which contradicted the intent of disabling them.
opt = Options()
opt.add_experimental_option(
    'prefs', {'profile.default_content_setting_values.notifications': 2}
)

# Listing page to scrape ("featured" filter, masonry view)
url = 'https://www.wikiart.org/en/paintings-by-style/impressionism?select=featured#!#filterName:featured,viewType:masonry'

# Install (if needed) and start the Chrome WebDriver with the options above
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opt)
try:
    driver.get(url)  # Open the listing page
    driver.maximize_window()

    # Give the initial popup up to 20 seconds to appear, then close it.
    # A missing popup is reported but does not abort the scrape.
    time.sleep(20)
    try:
        driver.find_element(
            By.CSS_SELECTOR,
            'body > div:nth-child(13) > div > div > div > div > div > div > button',
        ).click()
    except WebDriverException as err:
        print(f'Could not close the initial popup: {err!r}')

    # Repeatedly scroll to the bottom and click "Load More" so that all
    # paintings end up in the page source.
    for _ in range(100):
        try:
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
            time.sleep(2)
            driver.find_element(
                By.XPATH,
                '/html/body/div[2]/div[1]/section/main/div[3]/div/div/div[2]/a/span[3]',
            ).click()
            time.sleep(2)
        except WebDriverException:
            # The button may be missing or not clickable on this pass
            # (e.g. everything is already loaded); keep going as before.
            continue

    # Capture the fully expanded page before the browser is closed
    html = driver.page_source
finally:
    # Always release the browser process, even if scraping failed
    driver.quit()

# Parse the page and pick out the anchor of every artwork tile
soup = BeautifulSoup(html, 'html.parser')
artwork_links = soup.select('li > div > a.artwork-name.ng-binding')

# Build absolute artwork URLs from the site root and each anchor's href
ip_url = ['https://www.wikiart.org' + link.attrs['href'] for link in artwork_links]

In [3]:
# Sanity check: report how many artwork URLs were collected

print(len(ip_url))  # 3600 URLs were collected in the recorded run shown below

3600


## Saving and Loading the URL list

In [5]:
import pickle

# Save the list of URLs for later use. The file holds binary pickle data
# despite its .txt extension. The `with` block guarantees the file is closed
# even if pickling raises.
with open("c:/data/ip_url.txt", "wb") as fh:
    pickle.dump(ip_url, fh)

In [2]:
import pickle

# Load the list of URLs back from the pickle file.
# NOTE: pickle.load executes arbitrary code from the file, so only load
# files that this notebook itself produced.
with open("c:/data/ip_url.txt", "rb") as fh:
    ip_url = pickle.load(fh)

<br>

## Creating a Dataset 

- Created a dataset using the URL list for the impressionism style

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
import time
import re
from pandas import Series, DataFrame
import pandas as pd
from urllib.request import urlopen
from selenium.webdriver.chrome.options import Options
from urllib import parse
import pickle

# Columns that always appear first in the dataset; any other key found on an
# artwork page is appended after these.
BASE_COLUMNS = ['Title', 'Painter', 'Original Title', 'Date', 'Style', 'Period', 'Series',
                'Genre', 'Media', 'Location', 'Dimensions', 'Img_url', 'Tags']


def _scrape_painting(page_url):
    """Fetch one artwork page and return its metadata as a dict.

    The URL path is percent-quoted first because some links contain accented
    (e.g. French) characters that urlopen cannot send as-is.

    Raises whatever urlopen raises on network/HTTP errors, and
    AttributeError/KeyError when the expected page elements are missing.
    """
    scheme, netloc, path, query, fragment = parse.urlsplit(page_url)
    link = parse.urlunsplit((scheme, netloc, parse.quote(path), query, fragment))

    # Close the HTTP response as soon as the page has been parsed
    with urlopen(link) as response:
        soup = BeautifulSoup(response, 'html.parser')

    record = {
        'Title': soup.select_one('article > h3').text.strip(),
        'Painter': soup.select_one('article > h5').text.strip(),
    }

    # Each list item is rendered as "Key: value". Split on the FIRST colon
    # only, so values that contain a colon themselves are not truncated.
    # Items without a colon are skipped; the first occurrence of a key wins.
    for item in soup.select('article > ul > li'):
        key, sep, value = item.text.strip().replace('\n', '').partition(':')
        if sep and key not in record:
            record[key] = value.strip()

    record['Img_url'] = soup.select_one('aside > div > img').attrs['src']
    record['Tags'] = ', '.join(tag.text.strip() for tag in soup.select('a.tags-cheaps__item__ref'))
    return record


rows = []           # One metadata dict per successfully scraped painting
failed_url_ip = []  # URLs that could not be scraped

# Scrape every collected URL; a failure is recorded and the loop carries on
for n, page_url in enumerate(ip_url):
    try:
        rows.append(_scrape_painting(page_url))
    except Exception as err:
        failed_url_ip.append(page_url)
        print(f'Failed to collect info for painting number {n}: {err!r}')

# Build the DataFrame once (appending row by row with pd.concat is quadratic),
# keeping the predefined columns first and any extra keys after them.
scraped = DataFrame(rows)
extra_columns = [col for col in scraped.columns if col not in BASE_COLUMNS]
dataset_ip = scraped.reindex(columns=BASE_COLUMNS + extra_columns)

In [10]:
# Inspect the URLs that could not be scraped
# (in the recorded run, only painting number 2695 failed)

failed_url_ip

['https://www.wikiart.org/en/claude-monet/landscape-on-the-ile-saint-martin']

- Re-run the scraper on the failed URLs to add them to the dataset

In [30]:
# Retry every URL that failed during the main scraping pass and append the
# result to dataset_ip. failed_url_ip itself is not modified here.
for n, failed_url in enumerate(failed_url_ip):
    try:
        # Percent-quote the path so special characters survive the request
        scheme, netloc, path, query, fragment = parse.urlsplit(failed_url)
        link = parse.urlunsplit((scheme, netloc, parse.quote(path), query, fragment))

        # Fetch and parse the page, closing the response afterwards
        with urlopen(link) as response:
            soup = BeautifulSoup(response, 'html.parser')

        # Title and painter come from the article headings
        dic = {
            'Title': soup.select_one('article > h3').text.strip(),
            'Painter': soup.select_one('article > h5').text.strip(),
        }

        # Each list item is rendered as "Key: value". Split on the FIRST
        # colon only so values containing a colon are kept whole; items
        # without a colon are skipped and the first occurrence of a key wins.
        # (A distinct loop name avoids shadowing the outer loop variable.)
        for item in soup.select('article > ul > li'):
            key, sep, value = item.text.strip().replace('\n', '').partition(':')
            if sep and key not in dic:
                dic[key] = value.strip()

        # Image URL and comma-separated tags
        dic['Img_url'] = soup.select_one('aside > div > img').attrs['src']
        dic['Tags'] = ', '.join(tag.text.strip() for tag in soup.select('a.tags-cheaps__item__ref'))

        # Append the record as a one-row DataFrame
        temp = DataFrame(Series(dic)).transpose()
        dataset_ip = pd.concat([dataset_ip, temp], ignore_index=True)

        print(f'Successfully collected info for painting number {n}')

    except Exception as err:
        # Report the failure and its cause, then continue with the next URL
        print(f'Failed to collect info for painting number {n}')
        print(f'  reason: {err!r}')

Successfully collected info for painting number 0


In [49]:
# Summarize the dataset: row count, per-column non-null counts and dtypes.
# NOTE(review): the recorded output lists an extra, entirely empty 'Share'
# column - presumably picked up from a "Share:" list item on the artwork
# pages; confirm and drop it if it is not needed.
dataset_ip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Title           3600 non-null   object 
 1   Painter         3600 non-null   object 
 2   Original Title  343 non-null    object 
 3   Date            3557 non-null   object 
 4   Style           3600 non-null   object 
 5   Period          617 non-null    object 
 6   Series          76 non-null     object 
 7   Genre           3600 non-null   object 
 8   Media           2503 non-null   object 
 9   Location        2170 non-null   object 
 10  Dimensions      1333 non-null   object 
 11  Img_url         3600 non-null   object 
 12  Tags            3288 non-null   object 
 13  Share           0 non-null      float64
dtypes: float64(1), object(13)
memory usage: 393.9+ KB


In [38]:
# Preview the first five rows of the dataset
dataset_ip.head()

Unnamed: 0,Title,Painter,Original Title,Date,Style,Period,Series,Genre,Media,Location,Dimensions,Img_url,Tags,Share
0,"The Lake, Petworth, Sunset; Sample Study",J.M.W. Turner,,c.1827 - c.1828,"Romanticism,Impressionism",,,"cloudscape,sketch and study",,"Tate Britain, London, UK",,https://uploads5.wikiart.org/00246/images/will...,,
1,Portrait of the Artist by Himself,Johan Jongkind,,c.1850,Impressionism,,,self-portrait,watercolor,"Musée d'Orsay, Paris, France",20.5 x 17 cm,https://uploads4.wikiart.org/00283/images/joha...,,
2,Landscape at Valery-sur-Somme,Edgar Degas,,1854,Impressionism,,,landscape,"oil,canvas",Private Collection,,https://uploads3.wikiart.org/images/edgar-dega...,"Tree, mills-and-windmills, Sky",
3,Self-portrait,Edgar Degas,,1854,Impressionism,,,self-portrait,,"Louvre, Paris, France",,https://uploads7.wikiart.org/images/edgar-dega...,"Gentleman, Male",
4,Self Portrait,Edgar Degas,,1855,Impressionism,,,self-portrait,"oil,canvas","Musée d'Orsay, Paris, France",,https://uploads1.wikiart.org/images/edgar-dega...,"Gentleman, Male",


In [35]:
# Save the list of failed page URLs to a pickle file. The `with` block
# guarantees the file is closed even if pickling raises.
# NOTE(review): the file name ends in "_pi" while every other artifact uses
# "_ip" - possibly a typo; left unchanged in case other notebooks load it.
with open("c:/data/failed_url_pi.txt", "wb") as fh:
    pickle.dump(failed_url_ip, fh)

In [36]:
# Save the collected dataset to a CSV file
# (index=False: the DataFrame's row index is not written as a column)
dataset_ip.to_csv('c:/data/dataset_ip.csv', index=False)

In [42]:
# Reload the dataset from the CSV written above, so the following steps can
# run without repeating the scrape

import pandas as pd

dataset_ip = pd.read_csv("c:/data/dataset_ip.csv") 

<br>

# Downloading Painting Images with Image URLs

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
import time
import re
from pandas import Series,DataFrame
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import urllib.request as req

# `parse` is not among this cell's imports. Without it, a fresh kernel raises
# NameError on every iteration, which the old bare `except` silently turned
# into "all downloads failed". Import it here so the cell is self-contained.
from urllib import parse

failed_img_url_ip = []  # Image URLs that could not be downloaded

# Download every image; the file number is the row position in dataset_ip,
# and it advances even when a download fails so numbering stays aligned.
for n, img_url in enumerate(dataset_ip.Img_url):
    try:
        # Percent-quote the path so special characters survive the request
        scheme, netloc, path, query, fragment = parse.urlsplit(img_url)
        link = parse.urlunsplit((scheme, netloc, parse.quote(path), query, fragment))

        # Fetch the image and store it on the local machine
        req.urlretrieve(link, f'C:/art/impressionism/ip_{n}.jpg')

    except Exception as err:
        # Record the URL and report why it failed, then keep going
        failed_img_url_ip.append(img_url)
        print(f'Failed to save the {n}th image: {err!r}')

In [45]:
# Number of images that failed to download (0 in the recorded run)
len(failed_img_url_ip)

0

In [46]:
import pickle

# Save the list of failed image URLs to a pickle file. The `with` block
# guarantees the file is closed even if pickling raises.
with open("c:/data/failed_img_url_ip.txt", "wb") as fh:
    pickle.dump(failed_img_url_ip, fh)

<br>

# Preprocessing Data with shutil.move

- In the initial preprocessing, we excluded multi-style images, sketches, sculptures, and illustrations

In [None]:
# No installation is needed: `shutil` (used below) is part of the Python
# standard library. `pip install shutils` does not provide that module, so
# the install command that used to be here has been removed.

In [47]:
import pandas as pd

dataset_ip = pd.read_csv("c:/data/dataset_ip.csv")

# Flag rows to exclude. na=False treats a missing Style/Genre as "no match",
# so the masks stay strictly boolean even if a value is absent.
# A comma in Style means the painting is assigned to more than one style.
is_multi_style = dataset_ip.Style.str.contains(',', regex=False, na=False)
# Genres containing "sketch", "sculpture" or "illustration" are excluded.
is_excluded_genre = dataset_ip.Genre.str.contains('sketch|sculpture|illustration', na=False)

# Row indices of multi-style paintings, sketches, sculptures, and illustrations
index = list(dataset_ip[is_multi_style | is_excluded_genre].index)
len(index)

204

In [48]:
import shutil

import os

# Folder on the Desktop that receives the excluded images
destination = 'C:/Users/koohy/Desktop/ip_delete_1'

# The folder must exist before moving: if it does not, shutil.move treats the
# destination as a FILE name and renames each image to that same path instead
# of placing it inside a folder.
os.makedirs(destination, exist_ok=True)

# Move the image of every excluded row out of the dataset folder
for i in index:
    source = f'c:/art/impressionism/ip_{i}.jpg'
    if not os.path.exists(source):
        # Already moved on an earlier run (or never downloaded): skip it so
        # the cell can be re-run safely.
        print(f'Skipping missing file: {source}')
        continue
    shutil.move(source, destination)

<br>

**Note** :

- After this, we removed all black and white images as well as non-square paintings during the project (but not in the demonstration)
- Additionally, we cropped the frames from the images
- Lastly, we narrowed down the styles from 15 to 7, selecting those that not only had a greater number of paintings but also achieved the highest accuracy with the model