**Import Packages and Datasets**

In [None]:
import numpy as np #array di python
import pandas as pd #data manipulation and analysis di python
import numpy.matlib #matrix library
import matplotlib.pyplot as plt  #untuk plotting data
import seaborn as sns #data visualization untuk matplotlib

# generate random values for cluster initialization
import random
from datetime import datetime


# Both CSV exports are decoded with the same codec.
CSV_ENCODING = 'unicode_escape'

# Company table (read from companies.csv).
companies_data = pd.read_csv(
    '../input/crunchbasedata/companies.csv',
    encoding=CSV_ENCODING,
)
# Investment table (read from investments.csv).
invesments_data = pd.read_csv(
    '../input/crunchbasedata/investments.csv',
    encoding=CSV_ENCODING,
)

**Pre-processing**

In [None]:
# The raw header has stray spaces around the funding column name; normalise it
# on the loaded frame itself.
companies_data.rename(columns={' funding_total_usd ': 'funding_total_usd'}, inplace=True)

# Build the working frame in one pass: pick the columns of interest, discard
# rows without a usable funding total, convert the total to a number, and
# order the companies from largest to smallest total.
data_DF = (
    companies_data[['name', 'funding_total_usd', 'funding_rounds', 'first_funding_at']]
    .copy(deep=True)
    # rows whose funding total is the '-' placeholder carry no amount
    .loc[lambda frame: frame['funding_total_usd'] != '-']
    # rows with a missing funding total
    .dropna(axis=0, subset=['funding_total_usd'])
    # strip the digit separators, then cast the text to float
    .assign(
        funding_total_usd=lambda frame: frame['funding_total_usd']
        .str.replace(",", "")
        .astype('float')
    )
    # largest funding total first
    .sort_values(by=['funding_total_usd'], ascending=False)
    # renumber rows 0..n-1 after filtering and sorting
    .reset_index(drop=True)
)

In [None]:
def zscore(data):
    """Standardise ``data`` by subtracting its mean and dividing by its std.

    ``data`` must provide ``mean()`` and ``std()`` and support arithmetic with
    their results (e.g. a pandas Series). The spread is whatever ``data.std()``
    returns for that type.
    """
    centre = data.mean()
    spread = data.std()
    return (data - centre) / spread

In [None]:
# Parse the first-funding date twice: once with pandas' default parser and once
# with the compact YYYYMMDD layout. Values that fail a parse become NaT; the
# compact parse is preferred and the default parse fills its gaps.
default_parse = pd.to_datetime(data_DF['first_funding_at'], errors='coerce')
compact_parse = pd.to_datetime(data_DF['first_funding_at'], format='%Y%m%d', errors='coerce')
first_funding_date = compact_parse.combine_first(default_parse)

del default_parse, compact_parse

# Number of calendar years from the first funding year up to and including 2015.
funding_years = 2015 - first_funding_date.dt.year + 1

del first_funding_date

# An unparsable date gives a NaN span and a first-funding year after 2015 gives
# a span <= 0. Dividing by those produces NaN/inf features: a single inf turns
# the whole z-scored column into NaN, and NaN rows are always assigned to
# cluster 0 by argmin. Keep only rows with a positive span (NaN > 0 is False).
valid_rows = funding_years > 0
print("rows dropped (unusable first_funding_at):", int((~valid_rows).sum()))

valid_DF = data_DF.loc[valid_rows].reset_index(drop=True)
funding_years = funding_years[valid_rows].reset_index(drop=True)

raised_per_year = valid_DF['funding_total_usd'] / funding_years
rounds_per_year = valid_DF['funding_rounds'] / funding_years

# Standardise both features so neither dominates the Euclidean distance.
raised_per_year_Z = zscore(raised_per_year)

rounds_per_year_Z = zscore(rounds_per_year)

# (n_rows, 2) array fed to the clustering functions: one row per kept company.
cluster_data = pd.concat([raised_per_year_Z, rounds_per_year_Z], axis=1).to_numpy()

startup_unicorns = ["Uber","Amazon","Google","Dropbox","Facebook","Alibaba",
                     "Stripe","Airbnb","Robinhood","DigitalOcean","Coursera"]

# True for rows whose company name appears in the list above.
unicorn_status = valid_DF['name'].isin(startup_unicorns)

# Human-readable companion of cluster_data (same rows, same order) holding the
# un-standardised features plus the unicorn flag.
cluster_data_DF = pd.concat([valid_DF['name'],
                             raised_per_year.rename('raised_per_year'),
                             rounds_per_year.rename('rounds_per_year'),
                             unicorn_status.rename('unicorn')], axis=1)

del funding_years, raised_per_year, rounds_per_year, valid_rows, valid_DF


print(cluster_data)

In [None]:
def calc_distance(x1, x2):
    """Return the Euclidean distance between the points ``x1`` and ``x2``.

    The squared element-wise differences are added with the built-in ``sum``
    and the square root of the total is returned.
    """
    squared_diff = (x1 - x2) ** 2
    total = sum(squared_diff)
    return total ** 0.5

In [None]:
def init_cluster(k, cluster_array, seed=None):
    """Pick ``k`` random rows as starting centroids and assign every row to one.

    Parameters
    ----------
    k : int
        Number of clusters (and of rows sampled as initial centroids).
    cluster_array : numpy.ndarray
        Data to cluster, one point per row.
    seed : optional
        Value passed to ``random.seed``. The default ``None`` lets Python seed
        from the operating system / current time, giving a different
        initialisation on each call; pass a fixed value for a reproducible run.

    Returns
    -------
    list
        Cluster index for every row of ``cluster_array``, as produced by
        ``assign_clusters``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``k`` exceeds the number of rows.
    """
    # random.seed() only accepts None/int/float/str/bytes/bytearray on
    # Python 3.11+, so a datetime object can no longer be used as the seed.
    random.seed(seed)

    # Sample row indices from the array that was passed in, not from a global.
    rand_point = random.sample(range(cluster_array.shape[0]), k)

    init_centr = cluster_array[rand_point]

    print("initial centroid:", rand_point)

    return assign_clusters(init_centr, cluster_array)

In [None]:
def assign_clusters(centroids, cluster_array):
    """Label every point with the index of its nearest centroid.

    Parameters
    ----------
    centroids : sequence of 1-D array-likes
        One coordinate vector per centroid (NumPy rows or pandas Series).
    cluster_array : numpy.ndarray
        Points to label, one per row, with the same number of coordinates as
        each centroid.

    Returns
    -------
    list
        For each row of ``cluster_array``, the position in ``centroids`` of the
        closest centroid by Euclidean distance. Ties go to the lowest index.

    Raises
    ------
    ValueError
        If ``centroids`` is empty while ``cluster_array`` has rows.
    """
    points = np.asarray(cluster_array, dtype=float)
    if points.shape[0] == 0:
        # nothing to label
        return []

    centres = np.vstack([np.asarray(c, dtype=float) for c in centroids])

    # Broadcast to (n_points, n_centroids, n_features) so every point-centroid
    # distance is computed in one vectorised step rather than a Python loop.
    diffs = points[:, np.newaxis, :] - centres[np.newaxis, :, :]
    distances = np.sqrt((diffs ** 2).sum(axis=2))

    return list(np.argmin(distances, axis=1))

In [None]:
def calc_centroids(clusters, cluster_array):
    """Return the mean point of every cluster.

    ``clusters`` holds one label per row of ``cluster_array``. The result is a
    list with one pandas Series of per-column means for each distinct label,
    in the iteration order of the set of labels.
    """
    labelled = pd.concat(
        [pd.DataFrame(cluster_array), pd.DataFrame(clusters, columns=['cluster'])],
        axis=1,
    )
    # every column except the trailing 'cluster' label is a feature
    feature_cols = labelled.columns[:-1]
    return [
        labelled.loc[labelled['cluster'] == label, feature_cols].mean(axis=0)
        for label in set(labelled['cluster'])
    ]

In [None]:
def calc_centroid_variance(clusters, cluster_array):
    """Return the within-cluster sum of squared deviations for each cluster.

    Parameters
    ----------
    clusters : sequence
        One cluster label per row of ``cluster_array``.
    cluster_array : numpy.ndarray
        Clustered points, one per row.

    Returns
    -------
    list
        One value per distinct label (in the iteration order of the set of
        labels): the sum, over the cluster's points and all coordinates, of the
        squared difference from the cluster mean. Despite the function name the
        value is not divided by the cluster size.
    """
    labelled = pd.concat(
        [pd.DataFrame(cluster_array), pd.DataFrame(clusters, columns=['cluster'])],
        axis=1,
    )
    # every column except the trailing 'cluster' label is a feature
    feature_cols = labelled.columns[:-1]

    sum_squares = []
    for label in set(labelled['cluster']):
        members = labelled.loc[labelled['cluster'] == label, feature_cols]
        # Subtract the per-column mean from every row by column alignment; this
        # replaces tiling the mean with numpy.matlib.repmat.
        deviations = members.sub(members.mean(axis=0), axis='columns')
        sum_squares.append((deviations ** 2).sum().sum())
    return sum_squares

In [None]:
def classify(centroids, testing_data):
    """Return the index of the centroid closest to ``testing_data``.

    The distance from ``testing_data`` to every centroid is computed with
    ``calc_distance`` and the list of distances is printed before the index of
    the smallest one is returned.
    """
    # one distance per centroid, in centroid order
    distances = [calc_distance(centroid, testing_data) for centroid in centroids]

    print('distances: ', distances)

    return np.argmin(distances, axis=0)

In [None]:
#k = number of cluster(s)
k = 6
# within-cluster scatter recorded after initialisation and after each iteration
cluster_vars = []

# init_cluster seeds the RNG and samples the starting centroids itself, so no
# separate seeding or sampling is needed here (seeding random with a datetime
# object raises TypeError on Python 3.11+).
initial_clusters = init_cluster(k, cluster_data)
clusters = initial_clusters

# store the starting assignment next to the un-standardised features
cluster_data_DF['cluster_label'] = clusters

# entry 0: mean of the per-cluster sums of squares for the initial assignment
cluster_vars.append(np.mean(calc_centroid_variance(clusters, cluster_data)))
print(0, cluster_vars[0])


In [None]:
# Run 20 refinement passes: recompute the centroid of each cluster, reassign
# every point to its nearest centroid, and record the mean of the per-cluster
# sums of squares so the progress can be inspected.
for i in range(20):
    centroids = calc_centroids(clusters, cluster_data)
    clusters = assign_clusters(centroids, cluster_data)
    cluster_var = np.mean(calc_centroid_variance(clusters,
                                                 cluster_data))
    cluster_vars.append(cluster_var)
    print(i+1, cluster_var)

# The label column was filled with the initial assignment; refresh it so that
# lookups by cluster_label use the final assignment rather than the starting one.
cluster_data_DF['cluster_label'] = clusters

In [None]:
sns.set_style("whitegrid")

# cluster index -> matplotlib colour code
LABEL_COLOR_MAP = dict(enumerate(['r', 'g', 'b', 'y', 'm', 'c', 'k']))


def _plot_cluster_scatter(point_colors, title):
    """Scatter the un-standardised features, one colour per point, under ``title``."""
    plt.figure(figsize=(20, 10))
    plt.scatter(cluster_data_DF['raised_per_year'],
                cluster_data_DF['rounds_per_year'],
                c=point_colors)
    plt.title(title)
    plt.xlabel('Amount of money raised per year')
    plt.ylabel('Number of rounds per year')
    plt.show()


# colours for the assignment produced by the random initialisation
init_label_color = [LABEL_COLOR_MAP[label] for label in initial_clusters]
_plot_cluster_scatter(init_label_color, 'Scatter plot initial cluster')

# colours for the assignment held in `clusters` after the iteration cell
label_color = [LABEL_COLOR_MAP[label] for label in clusters]
_plot_cluster_scatter(label_color, 'Scatter plot converged cluster')

In [None]:
#Testing data
first_funding_year = 2010
total_funding = 10000000
funding_rounds = 3

# same per-year features as the training data (span counted up to and including 2015)
funding_years = 2015 - first_funding_year + 1
funding_per_year = total_funding / funding_years
rounds_per_year = funding_rounds / funding_years

testing_data_raw = np.array([funding_per_year, rounds_per_year])

# The centroids are computed from z-scored features, so the test point must be
# standardised with the same mean and standard deviation before distances are
# measured. Comparing the raw values against z-scored centroids would be
# dominated by the un-scaled funding amount.
feature_columns = ['raised_per_year', 'rounds_per_year']
feature_mean = cluster_data_DF[feature_columns].mean().to_numpy()
feature_std = cluster_data_DF[feature_columns].std().to_numpy()
testing_data = (testing_data_raw - feature_mean) / feature_std

print("years of funding: ", funding_years)
print("testing data: ", testing_data_raw)
print("testing data (z-scored): ", testing_data)

print('\n')

testing_data_label = classify(centroids, testing_data)
print("testing data cluster: ", testing_data_label)

print('\n')

# known unicorns whose cluster_label matches the test point's cluster
same_cluster = cluster_data_DF['cluster_label'] == testing_data_label
cluster_unicorn = cluster_data_DF[same_cluster & cluster_data_DF['unicorn']]
print('unicorn in cluster', testing_data_label, ':')
print(cluster_unicorn)