# Load Packages/Libraries


In [172]:
# Load Library
import time
import numpy as np
import pandas as pd
import pycountry as pc
from datetime import datetime, timedelta
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# Build today's date as a zero-padded DD/MM/YYYY string.
today = datetime.today()
# Single-digit values become a '0'-prefixed string; two-digit values stay ints.
day = today.day if today.day >= 10 else f'0{today.day}'
month = today.month if today.month >= 10 else f'0{today.month}'
today_str = f'{day}/{month}/{today.year}'

# Fetch data: read the unicorn-companies CSV from the notebook's input folder.
csv_path = "../input/the-complete-list-of-unicorn-companies/unicorn.csv"
df = pd.read_csv(csv_path)
df

# Initial Dataset Check

In [173]:
# Remove the "Unnamed: 0" column (presumably the row index saved into the CSV - verify).
# errors='ignore' keeps this cell re-runnable: once the column is gone, a plain
# drop would raise KeyError on the second execution.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

In [174]:
# Column overview: dtypes and non-null counts.
df.info()
# Null check: draw the missing-value mask as a heatmap, then count nulls per column.
import seaborn as sns
missing_mask = df.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis')
missing_mask.sum()

In [175]:
# Duplicate rows: absolute count and share of the whole dataset.
n_duplicates = df.duplicated().sum()
print(f'Duplicates in the dataset: {n_duplicates}')
# A result of 0.0% means the dataset contains no duplicated rows.
print(f'Percentage of duplicates: {n_duplicates/len(df)*100}%')

In [176]:
# Cardinality: number of distinct values in each column.
df.nunique() # Shows which columns vary the most and which vary the least

In [177]:
# Data types: the dtype pandas assigned to each column.
df.dtypes # Knowing the column types is needed before the conversion steps that follow


# Distribution of the Dataset over the Column with the Fewest Distinct Values


In [178]:
# Pie chart of the share of companies per industry, drawn on an explicit 30x30 figure.
fig, ax = plt.subplots(figsize=(30,30))
industry_share = df['Industry'].value_counts()
industry_share.plot.pie(ax=ax, autopct='%1.1f%%', textprops={'fontsize':12})
ax.set_title("Target distribution")


In [179]:
# Number of companies per industry label, most frequent first.
df['Industry'].value_counts()# This column is spread over many different labels.

In [180]:
# Data description: summary statistics, transposed so each column becomes a row.
df.describe().T.style.background_gradient() # The colour gradient highlights the larger values

In [181]:
# Convert "Valuation ($B)" from text to float by stripping the "$" sign.
# regex=False: "$" is also the regex end-of-string anchor, so the literal match is
# made explicit instead of relying on the default of the installed pandas version.
# The dtype guard makes the cell safe to re-run: once the column is numeric, the
# .str accessor would raise AttributeError.
valuation = df["Valuation ($B)"]
if not pd.api.types.is_numeric_dtype(valuation):
    df["Valuation ($B)"] = valuation.str.replace('$', '', regex=False).astype(float)


In [223]:
#df

# Renaming the common industries to a single form

In [224]:
# Merge variant / closely related industry labels into one canonical name.
# A single vectorised, exact-match replace is used instead of looping over every
# row and re-running a whole-column replace for each one.
# The "Internet ..." entry is listed in both spellings: the original loop tested
# for the lower-case form but replaced the title-case form, so it never matched.
industry_aliases = {
    "Internet software services": "Internet software & services",
    "Internet Software Services": "Internet software & services",
    "E-commerce & direct-to-consumer": "Consumer & retail",
    "Data management & analytics": "Artificial intelligence",
    "Auto & transportation": "Travel",
}
df["Industry"] = df["Industry"].replace(industry_aliases)
df

In [225]:
# Horizontal bar chart of the number of companies per industry.
industry_counts = df['Industry'].value_counts()
industry_counts.plot(kind='barh', figsize=(25,20))

In [227]:
# Horizontal bar chart of the number of companies per country, using the larger 'poster' fonts.
sns.set_context('poster', font_scale=0.9)
country_counts = df['Country'].value_counts()
country_counts.plot(kind='barh', figsize=(15,15))

# First, we'll visualise the data by the largest valuations.

In [189]:
# Bar chart of the 20 highest-valued companies whose valuation lies between 15 and 150
# (inclusive, in the unit of the "Valuation ($B)" column).
# nlargest() makes the "largest" selection explicit instead of depending on the row
# order of the file, and x='Company' labels each bar with the company name rather
# than the integer row index.
in_range = df['Valuation ($B)'].between(15, 150)
c = df.loc[in_range].nlargest(20, 'Valuation ($B)')
c.plot(kind='bar', x='Company', y='Valuation ($B)', figsize=(25,15))

# The 20 most common valuation amounts, and how many companies share each one.

In [190]:
# Bar chart of the 20 most frequent valuation values and how many companies share each.
valuation_counts = df['Valuation ($B)'].value_counts()
valuation_counts.head(20).plot(kind='bar', figsize=(25,15))

# To obtain information, NLP algorithms are applied to target columns.

In [191]:
# important Library for NLP
import warnings
warnings.filterwarnings('ignore')
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from sklearn.cluster import DBSCAN
import scipy
from scipy.cluster import hierarchy
from sklearn.cluster import AgglomerativeClustering


In [192]:
# Keep only fully populated rows. Dropping rows with a null in any column also
# removes every row whose 'Industry' is null, so one call is enough.
print("Non-null Value")
df = df.dropna(how='any')

# ['Industry'] is the target NLP column.

In [193]:
# Download the NLTK stop-word corpus, then build the two text-cleaning helpers used below.
nltk.download('stopwords')
# English Snowball stemmer.
sno = nltk.stem.SnowballStemmer('english')
# English stop words held in a set for membership tests.
stop=set(stopwords.words('english'))
def cleanpunc(sentence):
    """Strip or blank out punctuation in ``sentence`` and return the result.

    First pass deletes ``? ! ' " #``; second pass turns ``. , ) ( /`` into a
    single space. The ``|`` inside both character classes is a literal, so
    pipes are deleted by the first pass as well. Backslashes are left alone
    (``\\|`` in the second class is an escaped pipe, not a backslash).
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
# Show the stop-word set, a separator line, and a sample stem as a quick check.
print(stop)
print('************************************')
print(sno.stem('japan'))

In [194]:
# Normalise every 'Industry' label into a space-joined string of stemmed keywords.
# Each label is split on whitespace and passed through cleanpunc(); a token is kept
# only if it is purely alphabetic, longer than two characters and not an English
# stop word. Kept tokens are lower-cased and stemmed with `sno`.
# The result deliberately stays UTF-8 *bytes*: the next cell calls .str.decode()
# on `final_string`.
# Unused leftovers (counter, review word lists, scratch strings) were removed and
# the bitwise `&` on booleans was replaced with `and`.
final_string = []
for sent in df['Industry'].values:
    filtered_sentence = [
        sno.stem(token.lower()).encode('utf8')
        for w in sent.split()
        for token in cleanpunc(w).split()
        if token.isalpha() and len(token) > 2 and token.lower() not in stop
    ]
    final_string.append(b" ".join(filtered_sentence))  # one cleaned bytes string per row

In [195]:
# Store the cleaned labels as text: decode each UTF-8 bytes entry of final_string.
df['Industry2'] = [cleaned.decode("utf-8") for cleaned in final_string]
df['Industry2']

# 1)-KMeans using Bag of Words(BOW) clustering on Industry column

In [196]:
# Bag of Words: turn the cleaned industry strings into a term-count matrix.
count_vect = CountVectorizer()
industry_docs = df['Industry2'].values
bow = count_vect.fit_transform(industry_docs)
bow

In [197]:
# K-Means on the BoW matrix: 10 clusters, k-means++ initialisation, fixed random_state.
model = KMeans(n_clusters = 10,init='k-means++', random_state=99)
model.fit(bow)

In [198]:
# Cluster label assigned to each row, and the fitted cluster centres.
labels = model.labels_
cluster_center=model.cluster_centers_
cluster_center

* Silhouette score is used to evaluate the quality of clusters created using clustering algorithms such as K-Means in terms of how well samples are clustered with other samples that are similar to each other.

In [199]:
# Silhouette score of the BoW clustering, using Euclidean distance.
print("silhouette_score")  # label for the value displayed below
silhouette_score = metrics.silhouette_score(bow, labels, metric='euclidean')
silhouette_score

In [200]:
# Attach each row's BoW cluster label to the frame and preview two rows.
df['Bow Clus Label'] = model.labels_
df.head(2)

In [201]:
# Bar chart of how many rows fell into each BoW K-Means cluster.
print(" Number Of Clusters")
# x positions come from the cluster labels that actually occur, so the bars cannot
# get out of step with the counts the way a hard-coded range(10) could.
cluster_sizes = df.groupby('Bow Clus Label')['Industry2'].count()
plt.bar(cluster_sizes.index, cluster_sizes.values, alpha = 0.4)
plt.title('KMeans cluster points')
plt.xlabel("Cluster number")
plt.ylabel("Number of points")
plt.show()

# The top stemmed words of the Industry column in each cluster

In [202]:
# Print the ten highest-weighted vocabulary terms of every BoW K-Means centroid.
print("Top terms per cluster:")
# argsort is ascending, so each row is reversed to put the largest weights first.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(count_vect, 'get_feature_names_out'):
    terms = list(count_vect.get_feature_names_out())
else:
    terms = count_vect.get_feature_names()
for i in range(len(order_centroids)):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    # End the line once per cluster, not after every single term.
    print()

# Industry clusters grouped by the ['Select Investors'] column.

In [203]:
# Per (BoW cluster, investors) pair: concatenate the cleaned industry strings,
# then index the result by 'Select Investors'.
print("Country Base Clusters And Investrers ")
df_press_clus = (
    df.groupby(['Bow Clus Label','Select Investors'])['Industry2']
      .sum()
      .reset_index()
      .set_index('Select Investors')
)
df_press_clus


# Industry clusters grouped by country, and then by company name.

In [204]:
# Per (BoW cluster, country) pair: concatenate the cleaned industry strings,
# then index the result by 'Country'.
print("Name Base Clusters And country ")
df_key_clus = (
    df.groupby(['Bow Clus Label','Country'])['Industry2']
      .sum()
      .reset_index()
      .set_index('Country')
)
df_key_clus


In [205]:
# Per (BoW cluster, company) pair: concatenate the cleaned industry strings,
# then index the result by 'Company'.
print("Name Base Clusters And company ")
df_key_clus = (
    df.groupby(['Bow Clus Label','Company'])['Industry2']
      .sum()
      .reset_index()
      .set_index('Company')
)
df_key_clus


# 2)-KMeans using the TF-IDF

In [206]:
# TF-IDF: turn the cleaned industry strings into a weighted term matrix and show its shape.
tfidf_vect = TfidfVectorizer()
industry_docs = df['Industry2'].values
tfidf = tfidf_vect.fit_transform(industry_docs)
tfidf.shape

In [207]:
# K-Means on the TF-IDF matrix: 10 clusters, fixed random_state.
model_tf = KMeans(n_clusters = 10,random_state=99)
model_tf.fit(tfidf)

In [208]:
# Cluster label assigned to each row by the TF-IDF model, and its fitted cluster centres.
labels_tf = model_tf.labels_
cluster_center_tf=model_tf.cluster_centers_
cluster_center_tf

In [209]:
# Vocabulary of the TF-IDF vectorizer, kept as a list for the top-terms cell below.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    terms1 = list(tfidf_vect.get_feature_names_out())
else:
    terms1 = tfidf_vect.get_feature_names()
# Preview of entries 1-9 of the vocabulary.
terms1[1:10]

In [210]:
# Silhouette score of the TF-IDF clustering, using Euclidean distance.
silhouette_score_tf = metrics.silhouette_score(tfidf, labels_tf, metric='euclidean')
silhouette_score_tf

In [211]:
# NOTE: df1 is a second name for the same DataFrame object as df (no copy is made),
# so the column added here also appears on df.
df1 = df
df1['Tfidf Clus Label'] = model_tf.labels_
df1.head(5)

In [212]:
# Number of non-null 'Country' values in each TF-IDF cluster.
df1.groupby(['Tfidf Clus Label'])['Country'].count()

In [213]:
# Number of non-null 'Company' values in each TF-IDF cluster.
df1.groupby(['Tfidf Clus Label'])['Company'].count()

In [214]:
# Print the ten highest-weighted vocabulary terms of every TF-IDF K-Means centroid.
print("Top terms per cluster:")
# argsort is ascending, so each row is reversed to put the largest weights first.
order_centroids = model_tf.cluster_centers_.argsort()[:, ::-1]
for i in range(len(order_centroids)):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms1[ind], end='')
    # End the line once per cluster, not after every single term.
    print()

In [215]:
# Bar chart of how many rows (counted via 'Country') fell into each TF-IDF cluster.
print(" Number Of Clusters")
# x positions come from the cluster labels that actually occur, so the bars cannot
# get out of step with the counts the way a hard-coded range(10) could.
cluster_sizes = df1.groupby('Tfidf Clus Label')['Country'].count()
plt.bar(cluster_sizes.index, cluster_sizes.values, alpha = 0.4)
plt.title('KMeans cluster points')
plt.xlabel("Cluster number")
plt.ylabel("Number of points")
plt.show()

In [216]:
# Bar chart of how many rows (counted via 'Company') fell into each TF-IDF cluster.
print(" Number Of Clusters")
# x positions come from the cluster labels that actually occur, so the bars cannot
# get out of step with the counts the way a hard-coded range(10) could.
cluster_sizes = df1.groupby('Tfidf Clus Label')['Company'].count()
plt.bar(cluster_sizes.index, cluster_sizes.values, alpha = 0.4)
plt.title('KMeans cluster points')
plt.xlabel("Cluster number")
plt.ylabel("Number of points")
plt.show()

# Column clusters by industry and company

In [217]:
# Print three sample companies (members 5, 10 and 20, zero-based) of every TF-IDF cluster.
print('Review Assigned To  Cluster ')
# .groups maps each cluster to the *index labels* of its rows, so rows are fetched
# with .loc. Using .iloc (positions) picks the wrong rows once rows have been
# dropped without resetting the index. The grouping is computed once, not per print.
members_by_cluster = df1.groupby('Tfidf Clus Label').groups
for i in sorted(members_by_cluster):
    member_labels = members_by_cluster[i]
    print("Assigned to cluster ", i)
    print("-" * 70)
    for pos in (5, 10, 20):
        if pos < len(member_labels):
            print(df1.loc[member_labels[pos], 'Company'])
        else:
            # Small clusters have no member at this position; report instead of raising IndexError.
            print('(cluster has fewer than %d members)' % (pos + 1))
        print('\n')
    print("_" * 70)

# Clusters of columns by industry and country

In [218]:
# Print the country of three sample rows (members 5, 10 and 20, zero-based) of every TF-IDF cluster.
print('Review Assigned To  Cluster ')
# .groups maps each cluster to the *index labels* of its rows, so rows are fetched
# with .loc. Using .iloc (positions) picks the wrong rows once rows have been
# dropped without resetting the index. The grouping is computed once, not per print.
members_by_cluster = df1.groupby('Tfidf Clus Label').groups
for i in sorted(members_by_cluster):
    member_labels = members_by_cluster[i]
    print("Assigned to cluster ", i)
    print("-" * 70)
    for pos in (5, 10, 20):
        if pos < len(member_labels):
            print(df1.loc[member_labels[pos], 'Country'])
        else:
            # Small clusters have no member at this position; report instead of raising IndexError.
            print('(cluster has fewer than %d members)' % (pos + 1))
        print('\n')
    print("_" * 70)

# Industry and Select Investors column clusters

In [219]:
# Print the investors of three sample rows (members 5, 10 and 20, zero-based) of every TF-IDF cluster.
print('Review Assigned To  Cluster ')
# .groups maps each cluster to the *index labels* of its rows, so rows are fetched
# with .loc. Using .iloc (positions) picks the wrong rows once rows have been
# dropped without resetting the index. The grouping is computed once, not per print.
members_by_cluster = df1.groupby('Tfidf Clus Label').groups
for i in sorted(members_by_cluster):
    member_labels = members_by_cluster[i]
    print("Assigned to cluster ", i)
    print("-" * 70)
    for pos in (5, 10, 20):
        if pos < len(member_labels):
            print(df1.loc[member_labels[pos], 'Select Investors'])
        else:
            # Small clusters have no member at this position; report instead of raising IndexError.
            print('(cluster has fewer than %d members)' % (pos + 1))
        print('\n')
    print("_" * 70)