In [3]:
import os
from bs4 import BeautifulSoup
import csv
import statistics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import binarize
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta
import math

## Data cleaning

### Removing data with missing values

In [176]:
# Copy startups6.csv to startups5.csv, keeping only the rows in which the
# start date, funding, categories and article-count columns are all present.
# A field counts as missing when it is empty or whitespace-only, and a row
# that is too short to contain all four columns is dropped as well.
required_columns = (1, 2, 3, 4)  # start date, funding, categories, article count

with open('D:\\cmpe256\\Team project\\startups6.csv','r') as file1:
    reader = csv.reader(file1)
    # Skip the source header; the default avoids StopIteration on an empty file.
    next(reader, None)
    with open('D:\\cmpe256\\Team project\\startups5.csv','w',newline="") as file2:
        writer = csv.writer(file2)
        writer.writerow(['Name','Start date','Funding','Categories','Number of Articles'])
        for row in reader:
            # Short rows would otherwise raise IndexError on the checks below.
            if len(row) <= max(required_columns):
                continue
            if any(not row[column].strip() for column in required_columns):
                continue
            writer.writerow(row)
# Both files are closed by their `with` blocks; no explicit close() is needed.

In [4]:
# Column-wise view of the cleaned CSV. The five lists are filled row by row,
# so position k in each of them describes the same startup.
names = []
founding_dates = []
investments = []
article_cnt = []
categories = []

with open('D:\\cmpe256\\Team project\\startups5.csv','r') as file1:
    reader = csv.reader(file1)
    next(reader)  # discard the header row
    for record in reader:
        # The categories column is a comma-separated list inside one field.
        categories.append(record[3].split(','))
        names.append(record[0])
        article_cnt.append(record[4])
        investments.append(record[2])
        founding_dates.append(record[1])

## Attributes

### Categories

In [5]:
# Turn every category label into a single token (no spaces or hyphens),
# rewriting each startup's label list in place.
for labels in categories:
    labels[:] = [label.strip().replace(' ','_').replace('-','_') for label in labels]

### Time

In [6]:
# Trim surrounding whitespace from every founding date, in place.
founding_dates[:] = [raw_date.strip() for raw_date in founding_dates]

### Popularity

In [7]:
# Trim whitespace and drop thousands separators from the article counts, in place.
article_cnt[:] = [raw_count.strip().replace(",","") for raw_count in article_cnt]

### Investment amount

In [8]:
# Remove the currency sign and thousands separators from the funding strings, in place.
investments[:] = [amount.replace('$','').replace(',','') for amount in investments]

## Data preprocessing

### Categories

In [9]:
def generate_corpus(categories):
    """Build one space-separated document per entry of ``categories``.

    Args:
        categories: iterable of iterables of strings (the labels of one item).

    Returns:
        A list of strings, one per entry, with the labels joined by single
        spaces and leading/trailing whitespace removed.
    """
    return [" ".join(labels).strip() for labels in categories]

# Damping constant used by cosine_smooth: each similarity is scaled by
# overlap / (overlap + SMOOTHING).
SMOOTHING = 25


def cosine(X):
    """Return the pairwise cosine-similarity matrix between the rows of X."""
    pairwise = cosine_similarity(X)
    return pairwise

def cosine_smooth(X):
    """Cosine similarity damped by the number of features two rows share.

    Entry (i, j) of the plain cosine-similarity matrix is multiplied by
    ``overlap / (overlap + SMOOTHING)``, where ``overlap`` is the number of
    features that are positive in both row i and row j. Pairs that share few
    features are therefore pulled towards zero.

    Args:
        X: feature matrix with one row per sample. Sparse input (such as the
            output of a vectorizer) and dense arrays are both accepted.

    Returns:
        numpy.ndarray of shape (n_samples, n_samples).
    """
    similarity_matrix = cosine_similarity(X)
    # Binarize once and get every pairwise overlap from a single product,
    # instead of densifying X twice for each (i, j) pair.
    presence = binarize(X)
    overlap = presence.dot(presence.T)
    if hasattr(overlap, "toarray"):
        # Sparse product: convert to a dense array for element-wise scaling.
        overlap = overlap.toarray()
    overlap = np.asarray(overlap, dtype=float)
    return similarity_matrix * (overlap / (overlap + SMOOTHING))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# One document per startup: its normalised category labels joined by spaces.
corpus = generate_corpus(categories)
# norm=None keeps the raw TF-IDF weights (no per-row normalisation).
vectorizer = TfidfVectorizer(norm=None)
# X has one row per startup and one column per category token.
X = vectorizer.fit_transform(corpus)

In [11]:
# Pairwise category similarity between all startups.
similarity_matrix = cosine(X)

# Remove each row's diagonal entry (the startup compared with itself) so it
# does not take part in the average below.
off_diagonal_rows = [
    row[:position] + row[position + 1:]
    for position, row in enumerate(similarity_matrix.tolist())
]
similarity_matrix = np.array(off_diagonal_rows)

# Mean similarity of every startup to all the other startups.
similarity_index = [statistics.mean(row) for row in similarity_matrix]

### Time

In [12]:
# Lower-case three-letter month abbreviation -> month number (1-12).
_month_abbreviations = ('jan','feb','mar','apr','may','jun',
                        'jul','aug','sep','oct','nov','dec')
month_map = {abbr: number for number, abbr in enumerate(_month_abbreviations, start=1)}

In [13]:
now = datetime.datetime.now()


def _founding_datetime(text):
    """Convert a founding-date string to a datetime on the 1st of its month.

    Strings longer than four characters are read as "<month abbreviation> ...
    <4-digit year>"; anything else is taken to be a bare year and mapped to
    1 January of that year.
    """
    if len(text) > 4:
        month = month_map[text[:3].lower()]
        return datetime.datetime(int(text[-4:]), month, 1)
    return datetime.datetime(int(text), 1, 1)


# Age of every startup in whole years, measured from its founding date to now.
duration = [relativedelta(now, _founding_datetime(text)).years for text in founding_dates]

### Popularity

In [14]:
# Popularity blends press velocity (articles per year) with raw press volume.
RATE_WEIGHT = 0.6
VOLUME_WEIGHT = 0.4

popularity_score = []
for count, years in zip(article_cnt, duration):
    articles = float(count)
    # duration holds whole elapsed years, so a startup founded less than a
    # year ago has 0 here. Treat it as one year instead of dividing by zero.
    active_years = max(years, 1)
    rate = articles / active_years
    popularity_score.append(rate * RATE_WEIGHT + articles * VOLUME_WEIGHT)

### Investment Amount

In [15]:
# Magnitude suffix -> multiplier for funding strings such as "12.5M".
number_map = {'K':1000,
             'M':1000000,
             'B':1000000000}
for i,money in enumerate(investments):
    if not isinstance(money, str):
        # Already numeric (the cell was run before): leave the value alone
        # instead of failing on the string slicing below.
        continue
    text = money.strip()
    suffix = text[-1:].upper()
    if suffix in number_map:
        investments[i] = float(text[:-1]) * number_map[suffix]
    else:
        # No magnitude suffix: the string is the plain amount. float() still
        # raises ValueError for anything that is not a number.
        investments[i] = float(text)

In [16]:
# Discount each funding amount exponentially by the startup's age in years,
# so older money counts for less. Amounts are truncated to whole units first.
decay_rate = 0.1
for position, (amount, age) in enumerate(zip(investments, duration)):
    investments[position] = int(amount) * math.exp(-decay_rate * age)

## Feature scaling

In [17]:
# One feature row per startup: [category similarity, popularity, decayed funding].
dataset = [
    [similarity_index[row], popularity_score[row], investments[row]]
    for row in range(len(names))
]

In [18]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature column to the range [1, 10]; the lower bound of 1
# keeps all scaled values strictly positive.
scaler = MinMaxScaler(feature_range=(1,10))
scaled_data = scaler.fit_transform(dataset)

## Data modeling (Prediction)

In [19]:
# Score = popularity * funding / category similarity, computed on the scaled
# features (columns: 0 similarity, 1 popularity, 2 funding).
result_list = [
    [startup, (features[1] * features[2]) / features[0]]
    for startup, features in zip(names, scaled_data)
]

In [20]:
# Names of companies that are already unicorns (first column of unicorns.csv).
existing_unicorns = []
with open('D:\\cmpe256\\Team project\\unicorns.csv','r') as file1:
    reader1 = csv.reader(file1)
    # Skip the header; the default avoids StopIteration on an empty file.
    next(reader1, None)
    for row in reader1:
        # csv.reader yields [] for blank lines, which have no name column.
        if row:
            existing_unicorns.append(row[0])
# The file is closed by the `with` block; no explicit close() is needed.

In [21]:
# Drop startups that are already unicorns and report the count before/after.
print(len(result_list))
# Build a new list rather than calling remove() while iterating: removing
# during iteration skips the element after each removal, so two adjacent
# unicorns would leave the second one in the results.
known_unicorns = set(existing_unicorns)
result_list = [row for row in result_list if row[0] not in known_unicorns]
print(len(result_list))

137
128


In [22]:
# Ranked [name, score] pairs, highest score first.
print(sorted(result_list,key=lambda l:l[1],reverse=True))

[['STILT', 1.008625959571221], ['MaestroQA', 0.8038436766921764], ['Medium', 0.7735594160440766], ['SolveBio', 0.704387005560632], ['Chai', 0.7006141491943867], ['SupplyFrame', 0.6781731402064519], ['Blokable', 0.6498224217121641], ['Crunchyroll', 0.618465486880035], ['Bright', 0.6151271611470442], ['Zymergen', 0.6026639897023525], ['Maestro', 0.5782705556084141], ['Formlabs', 0.5678443891554801], ['TransferWise', 0.5555708918640171], ['ID.me', 0.48666768728450815], ['Tinder', 0.4677954525403571], ['Retention Science', 0.4516399094225045], ['Blockstream', 0.4492432073700591], ['Hologram', 0.44801188250724], ['ClassDojo', 0.4237461701771803], ['Hammerhead', 0.42239534270557894], ['Mason', 0.41975722051741976], ['Nearpod', 0.4095030241416303], ['PeerStreet', 0.4010420724037284], ['Shyp', 0.3855010690335202], ['Munchery', 0.38046656109049826], ['Drop', 0.37600837505891765], ['Blockstack', 0.3716287918291885], ['Breather', 0.35292252396733587], ['Mark43', 0.34570879188825704], ['Metromile'

'for name,data in zip(names,scaled_data):\n    print(name,data)'

In [231]:
# All remaining candidates, one name per row (no header).
with open('D:\\cmpe256\\Team project\\results\\startups.csv','w',newline='') as file1:
    writer = csv.writer(file1)
    writer.writerows([entry[0]] for entry in result_list)

# The five highest-scoring candidates (fewer if the list is shorter).
with open('D:\\cmpe256\\Team project\\results\\predicted_unicorns.csv','w',newline='') as file1:
    writer = csv.writer(file1)
    sorted_list = sorted(result_list,key=lambda l:l[1],reverse=True)
    writer.writerows([entry[0]] for entry in sorted_list[:5])

In [25]:
# Number of candidates, followed by every [name, score] pair from highest to
# lowest score.
print(len(result_list))
unicorn_index = list(result_list)
unicorn_index.sort(key=lambda entry: entry[1], reverse=True)
for ranked_startup in unicorn_index:
    print(ranked_startup)

128
['STILT', 1.008625959571221]
['MaestroQA', 0.8038436766921764]
['Medium', 0.7735594160440766]
['SolveBio', 0.704387005560632]
['Chai', 0.7006141491943867]
['SupplyFrame', 0.6781731402064519]
['Blokable', 0.6498224217121641]
['Crunchyroll', 0.618465486880035]
['Bright', 0.6151271611470442]
['Zymergen', 0.6026639897023525]
['Maestro', 0.5782705556084141]
['Formlabs', 0.5678443891554801]
['TransferWise', 0.5555708918640171]
['ID.me', 0.48666768728450815]
['Tinder', 0.4677954525403571]
['Retention Science', 0.4516399094225045]
['Blockstream', 0.4492432073700591]
['Hologram', 0.44801188250724]
['ClassDojo', 0.4237461701771803]
['Hammerhead', 0.42239534270557894]
['Mason', 0.41975722051741976]
['Nearpod', 0.4095030241416303]
['PeerStreet', 0.4010420724037284]
['Shyp', 0.3855010690335202]
['Munchery', 0.38046656109049826]
['Drop', 0.37600837505891765]
['Blockstack', 0.3716287918291885]
['Breather', 0.35292252396733587]
['Mark43', 0.34570879188825704]
['Metromile', 0.3390738364737345]
['St