# Notebook for evaluation of the Unicorn prediction model. The dataset provided has already known unicorns in it. The confusion matrix and accuracy is used to evaluate the Model.

## Data cleaning

In [2]:
import os
from bs4 import BeautifulSoup
import csv
import statistics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import binarize
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta
import math

In [237]:
unicorns = []
with open('D:\\cmpe256\\Team project\\evaluation\\unicorns.csv','r') as unicornfile:
    reader = csv.reader(unicornfile)
    reader.__next__()
    for row in reader:
        unicorns.append(row[0])
unicornfile.close()

In [236]:
with open('D:\\cmpe256\\Team project\\evaluation\\startups6.csv','r') as file1:
    reader = csv.reader(file1)
    reader.__next__()
    with open('D:\\cmpe256\\Team project\\evaluation\\startups5.csv','w',newline="") as file2:
        writer = csv.writer(file2)
        writer.writerow(['Name','Start date','Funding','Categories','Number of Articles','Status'])        
        for row in reader:
            if not row[1].strip():
                continue
            if row[2] == '':
                continue
            if row[3] == '':
                continue
            if row[4] == '':
                continue
            if row[0] in unicorns:
                row.append('Unicorn')
            else:
                row.append('Not Unicorns')
            row.append('')
            writer.writerow(row)
    file2.close()
file1.close()

In [51]:
categories = []
article_cnt = []
investments = []
founding_dates = []
names = []
with open('D:\\cmpe256\\Team project\\evaluation\\startups5.csv','r') as file1:
    reader = csv.reader(file1)
    reader.__next__()
    for row in reader:
        categories.append(row[3].split(','))
        names.append(row[0])
        article_cnt.append(row[4])
        investments.append(row[2])
        founding_dates.append(row[1])
file1.close()

## Attributes

### Categories

In [52]:
for category in categories:
    for i,name in enumerate(category):
        category[i] = name.strip().replace(' ','_').replace('-','_')

### Time

In [53]:
for i,fdate in enumerate(founding_dates):
    fdate = fdate.strip()
    founding_dates[i] = fdate

### Popularity

In [54]:
for i,article in enumerate(article_cnt):
    article = article.strip()
    article = article.replace(",","")
    article_cnt[i] = article

### Investment amount

In [55]:
for i,money in enumerate(investments):
    money = money.replace('$','')
    money = money.replace(',','')
    investments[i] = money

## Data preprocessing 1

### Categories

In [56]:
def generate_corpus(categories):
    corpus = []
    for category in categories:
        doc = ""
        for name in category:
            doc = doc + name + " "
        corpus.append(doc.strip())
    return corpus

SMOOTHING = 25
def cosine(X):
    return cosine_similarity(X)

def cosine_smooth(X):
    similarity_matrix = cosine_similarity(X)
    i = 0
    while(i<X.shape[0]):
        j = 0
        while(j<X.shape[0]):
            overlap = np.dot(binarize(X.todense()[i]), binarize(X.todense()[j]).T)[0, 0]
            similarity_matrix[i][j] *= (overlap/(overlap + SMOOTHING))
            j+=1
        i+=1
    return similarity_matrix

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = generate_corpus(categories)
vectorizer = TfidfVectorizer(norm=None)
X = vectorizer.fit_transform(corpus)

In [62]:
similarity_matrix = cosine(X)
similarity_matrix = similarity_matrix.tolist()

i=0
for sim in similarity_matrix:
    del sim[i]
    i+=1
similarity_matrix = np.array(similarity_matrix)

similarity_index = []
for sim in similarity_matrix:
    similarity_index.append(statistics.mean(sim))

### Time

In [70]:
month_map = {'jan':1,
            'feb':2,
            'mar':3,
            'apr':4,
            'may':5,
            'jun':6,
            'jul':7,
            'aug':8,
            'sep':9,
            'oct':10,
            'nov':11,
            'dec':12}

In [71]:
now = datetime.datetime.now()
duration = []
for sdate in founding_dates:
    day = 1
    month = 1
    if len(sdate) > 4:
        month = month_map[sdate[:3].lower()]
        year = sdate[-4:]
    else:
        year = sdate
            
    startdate = datetime.datetime(int(year),int(month),int(day),0,0,0,0)
    difference = relativedelta(now,startdate)
    duration.append(difference.years) 

### Popularity

In [72]:
popularity_score = []
for count,years in zip(article_cnt,duration):
    rate = float(count)/float(years)
    popularity_score.append(rate*0.6 + float(count)*0.4)

### Investment Amount

In [73]:
number_map = {'K':1000,
             'M':1000000,
             'B':1000000000}
for i,money in enumerate(investments):
    money = float(money[:-1]) * number_map[money[-1:]]
    investments[i] = money

In [74]:
decay_rate = 0.1
i = 0
for money,years in zip(investments,duration):
    new_money = int(money)*math.exp(-decay_rate * years)
    investments[i] = new_money
    i+=1

## Data preprocessing 2 (Data scaling)

In [87]:
dataset = []
for i in range(len(names)):
    dataset.append([similarity_index[i],popularity_score[i],investments[i]])

In [88]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(1,10))
scaler.fit(dataset)
scaled_data = scaler.transform(dataset)

## Data modeling (Prediction)

In [275]:
result_list = []
for name,data in zip(names,scaled_data):
    score = (data[1]*data[2]) / data[0]
    result_list.append([name,score])

In [273]:
existing_unicorns = []
with open('D:\\cmpe256\\Team project\\unicorns.csv','r') as file1:
    reader1 = csv.reader(file1)
    reader1.__next__()
    for row in reader1:
        existing_unicorns.append(row[0])
file1.close()

In [280]:
actual_unicorns=[]
print(len(result_list))
for row in result_list:
    if row[0] in existing_unicorns:
        actual_unicorns.append(row[0])
print(len(result_list))

137
137


In [None]:
print(sorted(result_list,key=lambda l:l[1],reverse=True))
print(len(result_list))
"""for name,data in zip(names,scaled_data):
    print(name,data)"""

In [284]:
predicted_unicorns = []
sorted_list = sorted(result_list,key=lambda l:l[1],reverse=True)
temp = sorted_list[:9]
for row in temp:
    predicted_unicorns.append(row[0])
print(predicted_unicorns)

['Lyft', 'Spotify', 'Instacart', 'Oscar Health', 'STILT', 'Postmates', 'Coinbase', 'MaestroQA', 'Medium']


In [291]:
true_positive = 0
for name in actual_unicorns:
    if name in predicted_unicorns:
        true_positive+=1
false_positive = len(predicted_unicorns) - true_positive
true_negative = len(result_list) - len(predicted_unicorns) - false_positive
false_negative = false_positive
print("true_positive:",true_positive)
print("false_positive:",false_positive)
print("true_negative:",true_negative)
print("false_negative:",false_negative)

ppv = true_positive / (true_positive+false_positive)
npv = true_negative / (false_negative+true_negative)
print(ppv)
print(npv)

true_positive: 6
false_positive: 3
true_negative: 125
false_negative: 3
0.6666666666666666
0.9765625
