In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [3]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# From https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Cats
sparql.setQuery("""
SELECT ?teamLabel ?wikipediaLink 
WHERE {
    ?team wdt:P118 wd:Q9448;
          wdt:P31 wd:Q476028.
    # Retrieve the English Wikipedia link
    OPTIONAL {
      ?wikipediaLink schema:about ?team;
      schema:isPartOf <https://en.wikipedia.org/>.
      }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
ORDER BY ?teamLabel
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

In [4]:
team_labels = [team['teamLabel']['value'].replace(" ", "_") for team in results['results']['bindings']]
team_urls = [team['wikipediaLink']['value'] for team in results['results']['bindings']]
 
teams = list(zip(team_urls, team_labels))

In [14]:
import wikipediaapi

# Initialize the Wikipedia API for the English language
wiki_wiki = wikipediaapi.Wikipedia('KitChaser (dominic.mccaskill@gmail.com)', 'en')


def get_images_from_page(title):
    page = wiki_wiki.page(title)
    
    if not page.exists():
        print(f"Page {title} does not exist.")
        return []
    
    # Wikipedia API provides the list of images directly
    images = page.images
    # Filter images that start with "Kit"
    kit_images = [img for img in images if img.split('/')[-1].startswith('Kit')]
    return kit_images

# Loop through the list of page titles and get the "Kit" images
for title in team_labels:
    kit_images = get_images_from_page(title)
    if kit_images:
        print(f"Images that start with 'Kit' on {title}:")
        for img in kit_images:
            print(img)
    else:
        print(f"No images starting with 'Kit' found on {title}.")


KeyboardInterrupt: 

In [20]:
import mwclient

# Connect to the English Wikipedia
site = mwclient.Site('en.wikipedia.org')

def get_images_from_page(title):
    page = site.pages[title]
    
    # Get all images on the page
    images = page.images()
    
    # Filter images starting with "Kit"
    kit_images = []
    for img in images:
        if 'Kit' in img.name:
            # Construct the full image URL
            try:
                image_url = f"https://{site.host}{img.imageinfo['url']}"
                kit_images.append(image_url)
            except:
                pass
    
    return kit_images

# Loop through the list of page titles and get the "Kit" images
for title in team_labels:
    kit_images = get_images_from_page(title)
    if kit_images:
        print(f"Images that start with 'Kit' on {title}:")
        for img in kit_images:
            print(img)
    else:
        print(f"No images starting with 'Kit' found on {title}.")


Images that start with 'Kit' on AFC_Bournemouth:
https://en.wikipedia.orghttps://upload.wikimedia.org/wikipedia/commons/8/83/Kit_body.svg
https://en.wikipedia.orghttps://upload.wikimedia.org/wikipedia/commons/b/b0/Kit_body_bournemouth2425h.png
https://en.wikipedia.orghttps://upload.wikimedia.org/wikipedia/commons/7/7f/Kit_left_arm.svg
https://en.wikipedia.orghttps://upload.wikimedia.org/wikipedia/commons/4/4f/Kit_left_arm_bournemouth2425h.png
https://en.wikipedia.orghttps://upload.wikimedia.org/wikipedia/commons/c/cd/Kit_right_arm.svg
https://en.wikipedia.orghttps://upload.wikimedia.org/wikipedia/commons/0/01/Kit_right_arm_bournemouth2425h.png
https://en.wikipedia.orghttps://upload.wikimedia.org/wikipedia/commons/a/af/Kit_shorts.svg
https://en.wikipedia.orghttps://upload.wikimedia.org/wikipedia/commons/e/ec/Kit_shorts_bournemouth2425h.png
https://en.wikipedia.orghttps://upload.wikimedia.org/wikipedia/commons/9/9b/Kit_socks_long.svg
https://en.wikipedia.orghttps://upload.wikimedia.org/w

KeyboardInterrupt: 

In [22]:
import requests
from bs4 import BeautifulSoup

def get_kit_images_from_page(url):
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return []

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all <img> tags
    img_tags = soup.find_all('img')
    
    # Filter images whose 'src' attribute starts with "Kit"
    kit_images = []
    for img in img_tags:
        src = img.get('src')
        if src and 'Kit' in src.split('/')[-1]:  # Check if the image filename starts with "Kit"
            # Wikipedia's image URLs start with "//", so we need to add "https:"
            full_img_url = 'https:' + src
            kit_images.append(full_img_url)
    
    return kit_images

# Loop through the list of Wikipedia URLs and get the "Kit" images in order
for url in team_urls:
    kit_images = get_kit_images_from_page(url)
    if kit_images:
        print(f"Images that start with 'Kit' on {url}:")
        for img in kit_images:
            print(img)
    else:
        print(f"No images starting with 'Kit' found on {url}.")


Images that start with 'Kit' on https://en.wikipedia.org/wiki/AFC_Bournemouth:
https://upload.wikimedia.org/wikipedia/commons/4/4f/Kit_left_arm_bournemouth2425h.png
https://upload.wikimedia.org/wikipedia/commons/thumb/7/7f/Kit_left_arm.svg/31px-Kit_left_arm.svg.png
https://upload.wikimedia.org/wikipedia/commons/b/b0/Kit_body_bournemouth2425h.png
https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/Kit_body.svg/38px-Kit_body.svg.png
https://upload.wikimedia.org/wikipedia/commons/0/01/Kit_right_arm_bournemouth2425h.png
https://upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Kit_right_arm.svg/31px-Kit_right_arm.svg.png
https://upload.wikimedia.org/wikipedia/commons/e/ec/Kit_shorts_bournemouth2425h.png
https://upload.wikimedia.org/wikipedia/commons/thumb/a/af/Kit_shorts.svg/100px-Kit_shorts.svg.png
https://upload.wikimedia.org/wikipedia/commons/f/f3/Kit_socks_redtop.png
https://upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Kit_socks_long.svg/100px-Kit_socks_long.svg.png
Images t

KeyboardInterrupt: 

In [25]:
import requests
from bs4 import BeautifulSoup

# List of Wikipedia page URLs
page_urls = [
    'https://en.wikipedia.org/wiki/Python_(programming_language)',
    'https://en.wikipedia.org/wiki/JavaScript',
    'https://en.wikipedia.org/wiki/Artificial_intelligence'
]

def get_kit_images_from_page(url):
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return []

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all <td> tags that do not contain other <td> tags
    td_tags = [td for td in soup.find_all('td') if not td.find('td')]

    grouped_images = []

    # Loop through each <td> and find images within it
    for td in td_tags:
        images_in_td = []
        
        # Find all <img> tags within this <td>
        img_tags = td.find_all('img')
        
        for img in img_tags:
            src = img.get('src')
            if src and 'Kit' in src.split('/')[-1]:  # Check if the image filename starts with "Kit"
                full_img_url = 'https:' + src
                images_in_td.append(full_img_url)
        
        # Only add this group if we found any "Kit" images
        if images_in_td:
            grouped_images.append(images_in_td)

    return grouped_images

# Loop through the list of Wikipedia URLs and get the "Kit" images grouped by <td> tags
for url in team_urls:
    grouped_images = get_kit_images_from_page(url)
    if grouped_images:
        print(f"Grouped images that start with 'Kit' on {url}:")
        for i, group in enumerate(grouped_images, start=1):
            print(f"Group {i}:")
            for img in group:
                print(img)
    else:
        print(f"No images starting with 'Kit' found in <td> tags on {url}.")



Grouped images that start with 'Kit' on https://en.wikipedia.org/wiki/AFC_Bournemouth:
Group 1:
https://upload.wikimedia.org/wikipedia/commons/4/4f/Kit_left_arm_bournemouth2425h.png
https://upload.wikimedia.org/wikipedia/commons/thumb/7/7f/Kit_left_arm.svg/31px-Kit_left_arm.svg.png
https://upload.wikimedia.org/wikipedia/commons/b/b0/Kit_body_bournemouth2425h.png
https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/Kit_body.svg/38px-Kit_body.svg.png
https://upload.wikimedia.org/wikipedia/commons/0/01/Kit_right_arm_bournemouth2425h.png
https://upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Kit_right_arm.svg/31px-Kit_right_arm.svg.png
https://upload.wikimedia.org/wikipedia/commons/e/ec/Kit_shorts_bournemouth2425h.png
https://upload.wikimedia.org/wikipedia/commons/thumb/a/af/Kit_shorts.svg/100px-Kit_shorts.svg.png
https://upload.wikimedia.org/wikipedia/commons/f/f3/Kit_socks_redtop.png
https://upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Kit_socks_long.svg/100px-Kit_socks_long

KeyboardInterrupt: 

In [36]:
import os
import requests
from bs4 import BeautifulSoup

# List of search terms
search_terms = ["Kit_left_arm",
"Kit_body",
"Kit_right_arm",
"Kit_shorts"]

def get_images_by_search_terms(td, search_terms):
    """
    Returns a dictionary with search terms as keys and the list of corresponding image URLs as values.
    """
    images_by_term = {term: [] for term in search_terms}
    
    # Find all <img> tags within this <td>
    img_tags = td.find_all('img')

    for img in img_tags:
        src = img.get('src')
        if src:
            # Check if the image filename matches any of the search terms
            for term in search_terms:
                if term in src.split('/')[-1]:  # Check if the filename contains the search term

                    div_above = img.find_parent('div').find_previous('div')

                    # Get the background-color if it exists in the inline style attribute
                    if div_above and 'style' in div_above.attrs:
                        styles = div_above['style']
                        styles = styles.replace(" ","")
                        # Look for background-color in the style attribute
                        style_dict = dict(item.split(":") for item in styles.split(";") if item)

                        background_color = style_dict.get('background-color', None)

                    full_img_url = 'https:' + src
                    images_by_term[term].append((full_img_url, background_color))

    return images_by_term

def get_grouped_images_by_td_and_terms(url, search_terms):
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return []

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all <td> tags that do not contain other <td> tags
    td_tags = [td for td in soup.find_all('td') if not td.find('td')]

    grouped_images = []

    # Loop through each <td> and find images within it by search terms
    for td in td_tags:
        images_by_term = get_images_by_search_terms(td, search_terms)
        
        # Only add this group if we found any images for any search term
        if any(images_by_term[term] for term in search_terms):
            grouped_images.append(images_by_term)

    return grouped_images

def download_images(grouped_images, team_name, year):
    # Create the team directory if it doesn't exist
    team_dir = os.path.join('downloads', team_name, str(year))
    os.makedirs(team_dir, exist_ok=True)

    # Loop through the image groups (by <td>)
    for i, group in enumerate(grouped_images, start=1):
        group_dir = os.path.join(team_dir, f'Group_{i}')
        os.makedirs(group_dir, exist_ok=True)
        
        # Loop through each search term's images in the group
        for term, images in group.items():

            # Download each image
            img_response = requests.get(images[0][0])
            if img_response.status_code == 200:
                # Save the image in the corresponding term folder
                img_filename = os.path.join(group_dir, f'{term}-{images[0][1]}.{images[0][0].split(".")[-1]}')
                with open(img_filename, 'wb') as img_file:
                    img_file.write(img_response.content)
                print(f"Downloaded {img_filename}")
            else:
                print(f"Failed to download image {images[0]}")



# Loop through the list of Wikipedia URLs, get the images grouped by <td> and search terms, and download them
for team in team_labels:

    for year in range(2025, 2020, -1):

        url_template = f"https://en.wikipedia.org/wiki/{year-1}–{abs(year) % 100}_{team}_season"

        grouped_images = get_grouped_images_by_td_and_terms(url_template, search_terms)
        if grouped_images:
            print(f"Downloading images for team {team}...")
            download_images(grouped_images, team, year)
        else:
            print(f"No images matching search terms found for team {team}.")


Downloading images for team AFC_Bournemouth...
Downloaded downloads\AFC_Bournemouth\2025\Group_1\Kit_left_arm-None.png
Downloaded downloads\AFC_Bournemouth\2025\Group_1\Kit_body-None.png
Downloaded downloads\AFC_Bournemouth\2025\Group_1\Kit_right_arm-None.png
Downloaded downloads\AFC_Bournemouth\2025\Group_1\Kit_shorts-None.png
Downloading images for team AFC_Bournemouth...
Downloaded downloads\AFC_Bournemouth\2024\Group_1\Kit_left_arm-None.png
Downloaded downloads\AFC_Bournemouth\2024\Group_1\Kit_body-None.png
Downloaded downloads\AFC_Bournemouth\2024\Group_1\Kit_right_arm-None.png
Downloaded downloads\AFC_Bournemouth\2024\Group_1\Kit_shorts-None.png
Downloaded downloads\AFC_Bournemouth\2024\Group_2\Kit_left_arm-None.png
Downloaded downloads\AFC_Bournemouth\2024\Group_2\Kit_body-None.png
Downloaded downloads\AFC_Bournemouth\2024\Group_2\Kit_right_arm-None.png
Downloaded downloads\AFC_Bournemouth\2024\Group_2\Kit_shorts-None.png
Downloaded downloads\AFC_Bournemouth\2024\Group_3\Kit_lef

KeyboardInterrupt: 

In [38]:
from PIL import Image

# Define the path to the existing image and the new output file
input_image_path = './downloads/AFC_Bournemouth/2023/Group_2/Kit_shorts-#5868D9.png'
output_image_path = './downloads/AFC_Bournemouth/2023/Group_2/image_with_background.png'

# Define the new background color (R, G, B)
background_color = (255, 0, 0)  # Red background

# Open the existing image
existing_image = Image.open(input_image_path)

# Get the size of the existing image
width, height = existing_image.size

# Create a new image with the same size and the background color
background = Image.new('RGB', (width, height), background_color)

# Paste the existing image onto the background
# We use the alpha channel if the existing image has transparency
background.paste(existing_image, (0, 0), existing_image.convert('RGBA'))

# Save the new image
background.save(output_image_path)

print(f"Image with background color created and saved as '{output_image_path}'")

Image with background color created and saved as './downloads/AFC_Bournemouth/2023/Group_2/image_with_background.png'
