# Pitch Deck Image Dataset Exploration (part 1)

## Import Libraries

In [1]:
import re
import os
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shutil # used for copying files to directories

## Import Dataset

In [2]:
# Load the cleaned startup table and report how many rows it has, both as a
# count and as a share of the raw dataset.
RAW_DATASET_SIZE = 497  # row count the percentage below is measured against
df = pd.read_csv('data/falory_clean_exploration.csv')
n_startups = len(df)
print(f"{n_startups} startups")
print(f"{round((n_startups / RAW_DATASET_SIZE) * 100, 1)}% of raw dataset")
df.head()

282 startups
56.7% of raw dataset


Unnamed: 0,Name,Description,Sector,Business,customer_B2B,customer_B2B2C,customer_B2C,customer_B2G,customer_C2C,Round,...,investor_VC,Year,City,State,Country,Geo_Lat,Geo_Long,Links,AmountRaisedEuro,OrderOfMagnitude
0,Airbnb,Airbnb is an online marketplace for people to ...,2,1,1,0,1,0,0,1,...,1,2008,San Francisco,California,United States,37.779026,-122.419906,"['https://airbnb.com/', 'https://www.crunchbas...",636000.0,5
1,Alan,Alan is a platform that uses technology to mak...,1,3,1,0,0,0,0,4,...,1,2020,Paris,Ile-de-France,France,48.853495,2.348391,"['https://alan.com/', 'https://www.crunchbase....",57240000.0,7
2,Apptopia,Apptopia serves the mobile industry with an ar...,0,1,1,0,0,0,0,2,...,1,2018,Boston,Massachusetts,United States,42.355433,-71.060511,"['http://www.apptopia.com/', 'https://www.crun...",1547600.0,6
3,Beatdapp,Beatdapp provides real-time tracking of media ...,3,0,1,0,0,0,0,1,...,1,2019,Vancouver,British Columbia,Canada,49.260872,-123.113952,"['https://beatdapp.com/', 'https://www.crunchb...",2544000.0,6
4,Bind,"Bind provides personalized, cost-transparent a...",5,1,1,0,1,0,0,2,...,1,2018,Minneapolis,Minnesota,United States,44.9773,-93.265469,"['https://www.yourbind.com/', 'https://www.cru...",63600000.0,7


In [3]:
# Build the folder layout ./data/images/<OrderOfMagnitude>/ that the image
# import cell copies the slides into.
parent_folder = './data/'
path = os.path.join(parent_folder, 'images')

try:
    os.mkdir(path)
    print("Directory created: image-dataset")
except OSError as error:
    # Best effort: report the problem (e.g. the folder already exists) and go on.
    print(error)

# From here on parent_folder is the images root; the image import cell reads it.
parent_folder = path

# One sub-folder per distinct OrderOfMagnitude value found in the dataset:
# 4 = ten thousand
# 5 = one hundred thousand
# 6 = one million
# 7 = ten million
# 8 = one hundred million
class_dirs = [str(int(i)) for i in df['OrderOfMagnitude'].unique()]

# The loop variable is deliberately not called `dir`: at notebook top level it
# would stay in the kernel namespace and shadow the builtin dir().
for class_dir in class_dirs:
    path = os.path.join(parent_folder, class_dir)
    try:
        os.mkdir(path)
        print("Directory '% s' created" % class_dir)
    except OSError as error:
        print(error)


Directory created: image-dataset
Directory '5' created
Directory '7' created
Directory '6' created
Directory '8' created
Directory '4' created


## Import images

In [4]:
# Path to the directory containing the pitch-deck zip files. The import loop
# further down in this cell only picks up entries ending in ".zip" whose base
# name equals a lower-cased startup Name from df.
zip_folder = './data/pitches/'

def extract_and_sort_key(s):
    """Return the slide number embedded in an image file name, as an int.

    The name is split on '-' and the second-to-last piece is converted with
    int(), e.g. 'original-airbnb-pitch-deck-2008-10-2048.jpg' -> 10.

    Used as a sort key so slides are ordered numerically (1, 2, 3, 10) instead
    of as text (1, 10, 2, 3).
    """
    return int(s.split('-')[-2])

def get_images(image_folder):
    """List the image files directly inside image_folder, ordered by slide number.

    A file counts as an image when its lower-cased name ends with one of the
    extensions below. Ordering uses extract_and_sort_key, so slide 10 comes
    after slide 9 rather than after slide 1. The number of images found is
    printed as a progress line.

    Returns the sorted list of file names (names only, not full paths).
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff')
    image_files_sorted = [
        entry for entry in os.listdir(image_folder)
        if entry.lower().endswith(image_extensions)
    ]
    image_files_sorted.sort(key=extract_and_sort_key)
    print("\t got " + str(len(image_files_sorted)) + " images")
    return image_files_sorted

# Copy every slide of every startup in df into its OrderOfMagnitude class folder,
# named "<oom>.<startup name>.<zero-padded slide number>.jpg".
#
# Known data issues that make this cell raise (scraping problems to be resolved
# before running it):
#   - an empty zip file
#   - alloy was missing
#   - friend-trusted was not stored as jpg
#   - friend-trusted and ring were inside a sub-folder of the zip
#   - ring did not have "-2048" before ".jpg"

def _slide_number(image_name):
    """Return the slide number of an image file name, zero-padded to 2 digits.

    The lower-cased name must end in "<slide>-<digits>.jpg"
    (e.g. "...-2008-7-2048.jpg" -> "07").

    Raises:
        ValueError: if the name does not end in that pattern.
    """
    lowered = image_name.lower()
    match = re.search(r'(\d+)-\d+\.jpg$', lowered)
    if match is None:
        raise ValueError("Can't find number in: " + lowered)
    return match.group(1).zfill(2)


# List the archives once instead of once per dataframe row; folders extracted
# into zip_folder during the loop are never ".zip" entries, so the set is the same.
zip_files = [f for f in os.listdir(zip_folder) if f.lower().endswith('.zip')]

for _, row in df.iterrows():
    name = row['Name'].lower()
    oom = str(int(row['OrderOfMagnitude']))
    # Class folder that receives this startup's slides
    destination_directory = os.path.join(parent_folder, oom)

    for zip_file in zip_files:
        zip_name = os.path.splitext(zip_file)[0]
        if zip_name != name:
            continue

        # Extract the contents of the zip file into a temporary folder
        zip_path = os.path.join(zip_folder, zip_file)
        extract_folder = os.path.join(zip_folder, zip_name)
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_folder)
            print("unzipped: " + extract_folder)
            images = get_images(extract_folder)

            # Work out every target name before touching the destination, so a
            # badly named slide aborts the deck without leaving stray copies.
            new_names = [f"{oom}.{name}.{_slide_number(image)}.jpg" for image in images]

            for image, new_name in zip(images, new_names):
                # Copy straight to the final name. shutil.copy overwrites an
                # existing file, so re-running the cell works; the previous
                # copy-then-os.rename raised FileExistsError on Windows when the
                # target already existed, and looked up the copied file by its
                # lower-cased name, which fails on case-sensitive file systems.
                shutil.copy(os.path.join(extract_folder, image),
                            os.path.join(destination_directory, new_name))
                # Same two progress lines as before, so logs stay comparable.
                print("\t copied " + image + " to " + destination_directory)
                print("\t renamed: " + image.lower() + " to " + new_name)
        finally:
            # Clean up the temporary folder even when extraction or copying failed
            if os.path.isdir(extract_folder):
                shutil.rmtree(extract_folder)
        print("\t zip file: " + extract_folder + " deleted")
        

unzipped: ./data/pitches/airbnb
	 got 13 images
	 copied original-airbnb-pitch-deck-2008-1-2048.jpg to ./data/images\5
	 renamed: original-airbnb-pitch-deck-2008-1-2048.jpg to 5.airbnb.01.jpg
	 copied original-airbnb-pitch-deck-2008-2-2048.jpg to ./data/images\5
	 renamed: original-airbnb-pitch-deck-2008-2-2048.jpg to 5.airbnb.02.jpg
	 copied original-airbnb-pitch-deck-2008-3-2048.jpg to ./data/images\5
	 renamed: original-airbnb-pitch-deck-2008-3-2048.jpg to 5.airbnb.03.jpg
	 copied original-airbnb-pitch-deck-2008-4-2048.jpg to ./data/images\5
	 renamed: original-airbnb-pitch-deck-2008-4-2048.jpg to 5.airbnb.04.jpg
	 copied original-airbnb-pitch-deck-2008-5-2048.jpg to ./data/images\5
	 renamed: original-airbnb-pitch-deck-2008-5-2048.jpg to 5.airbnb.05.jpg
	 copied original-airbnb-pitch-deck-2008-6-2048.jpg to ./data/images\5
	 renamed: original-airbnb-pitch-deck-2008-6-2048.jpg to 5.airbnb.06.jpg
	 copied original-airbnb-pitch-deck-2008-7-2048.jpg to ./data/images\5
	 renamed: origin