In [14]:
import math
import re
def normalize_strings(string):
    if isinstance(string, str):
        return string
    if math.isnan(string):
        return None
    raise Exception("not valid string")

def normalize_twitter_name(twitter_url):
    if isinstance(twitter_url, str):
        return re.split(r'twitter.com/', twitter_url)[-1].lower()
    if math.isnan(twitter_url):
        return None
    raise Exception('not valid values for Twitter Handler')

def normalize_url(url):
    if isinstance(url, str):
        main_domain = re.split(r'http(s)?://', url)[-1]
        return f"https://{main_domain}"
    if math.isnan(url):
        return None
    raise Exception("not valid url value")
    
def normalize_category(category):
    if category == "1-10":
        return 1
    if category == "11-50":
        return 2
    if category == "51-200":
        return 3
    if category == "201-500":
        return 4
    if category == "501-1000":
        return 5
    if category == "1001-5000":
        return 6
    if category == "5001-10000":
        return 7
    if category == "10,001+":
        return 8
    if math.isnan(category):
        return None
    raise Exception("not valid value for int category")

def normalize_ceo_id(ceo_name):
    if isinstance(ceo_name, str):
        for key, value in dict(enumerate(data['CEO name'].cat.categories)).items():
            if value == ceo_name:
                return key
    if math.isnan(ceo_name):
        return None
    raise Exception('not valid ceo name')

In [15]:
import pandas as pd

data = pd.read_csv("starter_startup.csv", sep=";")

In [16]:
data["Company name"] = data["Company name"].str.strip()
data["Company Twitter"] = data["Company Twitter"].apply(normalize_twitter_name)
data["employees_amount_id"] = data["Nb. of Employees"].apply(normalize_category)
data["Nb. of Employees"] = data["Nb. of Employees"].astype('category')
data["What do they do (Verbatim - 10 words max.)   "] = data["What do they do (Verbatim - 10 words max.)   "].str.strip()
data["Link to website"] = data["Link to website"].apply(normalize_url)
data["HQ City"] = data["HQ City"].apply(normalize_strings) # Should be left as it is, and later use a postal code better
data["HQ Country"] = data["HQ Country"].apply(normalize_strings)# Should be left as it is, and later use the postal code and a geoposition API to get the Country
data["CEO name"] = data["CEO name"].str.title().str.strip()
data['CEO name'] = data['CEO name'].astype('category')
data["ceo_id"] = data['CEO name'].apply(normalize_ceo_id)
data["CEO's Twitter"] = data["CEO's Twitter"].apply(normalize_twitter_name)
data["Link to jobpage"] = data["Link to jobpage"].apply(normalize_url)

In [36]:
data[data["Company name"] == "KeyCDN"]

Unnamed: 0,Company name,Company Twitter,Nb. of Employees,What do they do (Verbatim - 10 words max.),Link to website,HQ City,HQ Country,CEO name,CEO's Twitter,Link to jobpage,employees_amount_id,ceo_id
425,KeyCDN,keycdn,1-10,high-performance content delivery network,https://www.keycdn.com/careers,Winterthur,Switzerland,Jonas Krummenacher,jkrummenacher,https://www.keycdn.com,1.0,436


In [32]:
# data[data[pd.notnull(data['CEO name'])].duplicated(['CEO name'])]
# data[pd.notnull(data['CEO name'])]
# categories = data['CEO name'].astype('category')
ceo_names = dict(enumerate(data['CEO name'].cat.categories))
ceos = [{"id": key+1, "name": value, "twitter": data[data["CEO name"] == value]["CEO's Twitter"].values[0]} for key, value in ceo_names.items()]
# data[data["CEO name"] == 'Aaron Bright']["CEO's Twitter"].values[0]
# data.groupby('CEO name').count()
# data[data['CEO name'].map(data['CEO name'].value_counts()) > 1]

In [33]:
ceos

[{'id': 1, 'name': 'Aaron Bright', 'twitter': 'aaronbrightmd'},
 {'id': 2, 'name': "Aaron O'Mullan", 'twitter': 'aaronomullan'},
 {'id': 3, 'name': 'Adam Adelman', 'twitter': None},
 {'id': 4, 'name': 'Adam Johnson', 'twitter': None},
 {'id': 5, 'name': 'Adam Little', 'twitter': 'adam_little'},
 {'id': 6, 'name': 'Adam Pritzker', 'twitter': 'adampritzker'},
 {'id': 7, 'name': 'Adam Robinson', 'twitter': 'adrobins'},
 {'id': 8, 'name': 'Adam Schwartz', 'twitter': 'getadam'},
 {'id': 9, 'name': 'Adam Warski', 'twitter': 'adamwarski'},
 {'id': 10, 'name': 'Adii Pienaar', 'twitter': 'adii'},
 {'id': 11, 'name': 'Aidan Corbett', 'twitter': None},
 {'id': 12, 'name': 'Aidan Mcguire', 'twitter': 'amcguire62'},
 {'id': 13, 'name': 'Ajay Kapur', 'twitter': 'ajay__kapur'},
 {'id': 14, 'name': 'Alain Veuve', 'twitter': 'alainveuve'},
 {'id': 15, 'name': 'Alan Beard', 'twitter': None},
 {'id': 16, 'name': 'Alan P. Naumann', 'twitter': None},
 {'id': 17, 'name': 'Alari Aho', 'twitter': 'ahoalari'},

In [66]:
data.to_csv('./clean_startups.csv', index = False, header=True)