In [4]:
# # # # IMPORTS # # # #

import pandas as pd
import json
import pickle
import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np
import spacy



In [5]:

# Raw startup table: read through a text handle so bytes that are not valid
# UTF-8 are dropped instead of aborting the load.
with open(
    r'C:\Users\imran\DataspellProjects\WalidCase\data\raw\startup_dataset.csv',
    mode='r',
    encoding='utf-8',
    errors='ignore',
) as f:
    raw_startups = pd.read_csv(f)

# Industry table from the processed data folder (tab-separated).
raw_industries = pd.read_csv(
    r'C:\Users\imran\DataspellProjects\WalidCase\data\processed\industry_dataset_clean.csv',
    sep='\t',
)


## Preliminary data exploration

In this section, we explore the data to better understand both the dataset itself and the problem we are trying to solve.

In [4]:
# Rows that repeat an earlier row in every column (pandas' default keep='first').
duplicate_rows = raw_startups.loc[raw_startups.duplicated()]
print("number of duplicate rows: ", duplicate_rows.shape)

number of duplicate rows:  (0, 3)


In [6]:
from tqdm import tqdm_notebook as tqdm
class TextProcessing:
    """Clean the free-text column of a startup or an industry DataFrame with spaCy.

    One instance handles exactly one dataset. Startup frames are cleaned through
    their ``cb_description`` column, industry frames through their ``keywords``
    column. The frame handed in is copied, so the caller's DataFrame is never
    modified, and every cleaning step returns a new frame.

    Attributes:
        nlp: The spaCy ``en_core_web_sm`` pipeline used by every transformation.
        startups: Working startup frame, or ``None`` in industry mode.
        industries: Working industry frame, or ``None`` in startup mode.
        index: Index label of the row currently being processed.
        about_us: Text of the row currently being processed.
    """

    # Text column that is cleaned for each kind of dataset.
    _STARTUP_COLUMN = "cb_description"
    _INDUSTRY_COLUMN = "keywords"

    # spaCy token flags that mark a token as noise in ``remove_noisy_tokens``.
    _NOISE_FLAGS = (
        "is_stop", "is_punct", "is_space", "like_num", "is_digit",
        "is_currency", "is_bracket", "is_quote", "is_left_punct",
        "is_right_punct", "like_url", "like_email",
    )

    def __init__(self, df: pd.DataFrame = None, industry=False, startup=False):
        """Copy ``df`` and load the spaCy model.

        Args:
            df: The dataset to clean.
            industry: True when ``df`` is the industry dataset.
            startup: True when ``df`` is the startup dataset; wins over
                ``industry`` when both are set.

        Raises:
            ValueError: If neither flag is set, or if ``df`` is None.
        """
        # Validate before loading the model so bad calls fail fast and cheaply.
        if not (startup or industry):
            raise ValueError("Please specify if the data is for startups or industries")
        if df is None:
            raise ValueError("Please provide the DataFrame to process")

        self.nlp = spacy.load("en_core_web_sm")
        # The mode lives in an explicit flag: a DataFrame cannot be used in a
        # boolean test (pandas raises "truth value ... is ambiguous").
        self._is_industry = not startup
        self.startups = None
        self.industries = None
        self.index = None
        self.about_us = None
        self._set_frame(df.copy())

    # ------------------------------------------------------------------ helpers

    def _column(self):
        """Return the name of the text column for the active dataset."""
        return self._INDUSTRY_COLUMN if self._is_industry else self._STARTUP_COLUMN

    def _frame(self):
        """Return the working frame of the active dataset."""
        return self.industries if self._is_industry else self.startups

    def _set_frame(self, frame):
        """Store ``frame`` as the working frame of the active dataset."""
        if self._is_industry:
            self.industries = frame
        else:
            self.startups = frame

    def __iterate_rows(self):
        """Walk the active frame, exposing each row via ``self.index`` / ``self.about_us``.

        Yields ``self`` once per row, wrapped in a progress bar.
        """
        frame = self._frame()
        for index, text in tqdm(frame[self._column()].items(), total=len(frame)):
            self.index = index
            self.about_us = text
            yield self

    def _transform_column(self, data, transform):
        """Apply ``transform`` (spaCy Doc -> str) to every text of the active column.

        When ``data`` is given, a copy of it replaces the working frame first.
        Values that are not strings (e.g. NaN) are passed through unchanged.
        The results are assigned row by row, so duplicated texts are handled
        correctly. A new frame is stored and returned; frames returned by
        earlier steps are left untouched.
        """
        if data is not None:
            self._set_frame(data.copy())

        processed = []
        for _ in self.__iterate_rows():
            if isinstance(self.about_us, str):
                self.about_us = transform(self.nlp(self.about_us))
            processed.append(self.about_us)

        updated = self._frame().assign(**{self._column(): processed})
        self._set_frame(updated)
        return updated

    @staticmethod
    def _keep_english_alpha(doc):
        """Join the tokens that are alphabetic and tagged with language 'en'."""
        # NOTE(review): spaCy's ``token.lang_`` reports the language of the
        # pipeline vocabulary, not of the individual word, so with an English
        # model this test is presumably always true and only ``is_alpha``
        # filters anything. Real language detection needs an extra component.
        return " ".join(token.text for token in doc if token.lang_ == 'en' and token.is_alpha)

    @classmethod
    def _drop_noise(cls, doc):
        """Join the lower-cased tokens that carry none of the noise flags."""
        return " ".join(
            token.text.lower()
            for token in doc
            if not any(getattr(token, flag) for flag in cls._NOISE_FLAGS)
        )

    @staticmethod
    def _lemmatize(doc):
        """Join the lemma of every token."""
        return " ".join(token.lemma_ for token in doc)

    # --------------------------------------------------------------- public API

    def length_range(self, data=None, length_range=(30, 150)):
        """Map row ids to word counts for texts whose length lies inside a range.

        Args:
            data: Optional frame; a copy of it replaces the working frame.
            length_range: ``(low, high)`` pair; both bounds are exclusive.

        Returns:
            dict of ``{id: word_count}`` for every text with
            ``low < word_count < high``. Ids come from the ``id`` column, or
            from the frame index when there is no such column. Non-string
            texts (e.g. NaN) are skipped.
        """
        if data is not None:
            self._set_frame(data.copy())

        frame = self._frame()
        low, high = length_range
        ids = frame['id'] if 'id' in frame.columns else frame.index

        about_us_lengths = {}
        for row_id, text in zip(ids, frame[self._column()]):
            if not isinstance(text, str):
                continue
            word_count = len(text.split())
            if low < word_count < high:
                about_us_lengths[row_id] = word_count
        return about_us_lengths

    def remove_non_english_tokens(self, data=None):
        """Keep only alphabetic tokens tagged 'en' in every text; return the new frame."""
        return self._transform_column(data, self._keep_english_alpha)

    def remove_noisy_tokens(self, data=None):
        """Drop stop words, punctuation, numbers, URLs, e-mails and similar noise.

        The surviving tokens are lower-cased. Returns the new frame.
        """
        return self._transform_column(data, self._drop_noise)

    def lemma(self, data=None):
        """Replace every token by its lemma; return the new frame."""
        return self._transform_column(data, self._lemmatize)

    def delete_ents(self, text):
        """Return ``text`` with every named entity found by spaCy removed.

        All occurrences of an entity's text are deleted, but only where they
        stand on their own: an entity such as "US" no longer eats the middle
        of "BUSINESS". Longer entities are removed first so that an entity
        contained in a longer one does not leave a fragment behind.
        """
        import re  # local import: only this method needs it

        doc = self.nlp(text)
        entity_texts = sorted(
            {ent.text for ent in doc.ents if ent.text},
            key=lambda ent_text: (-len(ent_text), ent_text),
        )
        if not entity_texts:
            return text

        # Lookarounds instead of \b so entities that start or end with a
        # non-word character (e.g. "$5") are still matched.
        pattern = re.compile(
            "|".join(r"(?<!\w)" + re.escape(ent_text) + r"(?!\w)" for ent_text in entity_texts)
        )
        return pattern.sub("", text)





In [10]:
# Run the industry dataset through the three cleaning stages in order;
# each call works on the frame held by `preprocess`.
preprocess = TextProcessing(df=raw_industries, industry=True)
english_tokens = preprocess.remove_non_english_tokens(data=None)
noise_reduce = preprocess.remove_noisy_tokens(data=None)
industries = preprocess.lemma(data=None)


NameError: name 'spacy' is not defined

In [7]:
class EDA:
    """Empty placeholder class; its instances carry no state."""

    def __init__(self):
        """Create an instance without setting any attributes."""

# Load the processed startup table and discard every row that has a missing value.
startups = pd.read_csv(
    r'C:\Users\imran\DataspellProjects\WalidCase\data\processed\startups_clean_noents.csv'
).dropna()



To test my assumption that very short and very long descriptions are not useful, I will plot the distribution of the description lengths. It's important to find a range where the data is neither too sparse nor too dense. I will use a range of 30 to 150 words as a baseline and adjust the range from there.

It follows logically that the cleaned dataset is skewed towards shorter descriptions, because the cleaning process removes words that are not useful for the model. This is a good thing, as it helps the model focus on the important words in the description.

After experimenting with the range a little, I concluded that 15-60 words is the best range for the descriptions. It includes most of the descriptions in the cleaned dataset (roughly 2500), and after inspecting some entries with 15 words, I concluded that this is enough to understand the type of company (at least for a human). The distribution is also centered around the mean, which is a good thing.

In [None]:
# Only descriptions with strictly more than MIN_WORDS and strictly fewer than
# MAX_WORDS words are plotted.
MIN_WORDS, MAX_WORDS = 15, 60

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

datasets = {'raw': raw_startups, 'cleaned': startups}
for axis, (dataset_name, dataset) in zip(ax, datasets.items()):
    # Count the words of each description once. Entries that are not strings
    # (e.g. NaN) are skipped instead of crashing on `.split()`.
    word_counts = [len(text.split()) for text in dataset['cb_description'] if isinstance(text, str)]
    about_us_lengths = [count for count in word_counts if MIN_WORDS < count < MAX_WORDS]

    # Fit a normal distribution to the retained lengths and draw its density
    # over the normalised histogram.
    mean, std = norm.fit(about_us_lengths)
    axis.hist(about_us_lengths, bins=60, density=True, alpha=0.6, color='y')
    xmin, xmax = axis.get_xlim()
    x = np.linspace(xmin, xmax)
    p = norm.pdf(x, mean, std)
    axis.plot(x, p, 'k', linewidth=4)

    title = f"{dataset_name} dataset: mu = {mean.round()},  std = {std.round()} n= {len(about_us_lengths)}"
    axis.set_title(title)
    axis.set_xlabel("words per description")
    axis.set_ylabel("density")

plt.show()

In [8]:
# Build a startup-mode processor, then measure the cleaned `startups` frame:
# passing it as `data` makes it the frame whose word counts are taken.
preprocess = TextProcessing(df=raw_startups, startup=True)
about_us_lengths = preprocess.length_range(data=startups, length_range=(15, 60))

# Keep only the rows whose id is among the keys returned by the length filter.
ranged_startups = startups.loc[startups['id'].isin(about_us_lengths.keys())]

I now have a dataframe that contains the cleaned descriptions of 2512 startups whose descriptions are between 15 and 60 words long. Overly long or short ones were excluded, and for my TF-IDF model I will likely use the first 20-30 words as a proxy for the category. This limits complexity further because, looking at the data, the main concepts seem to be described in the first sentence or two.

This is not always the case, but it works fine for a v1 model.

In [9]:
# Persist the length-filtered startups; the DataFrame index is not written.
ranged_startups.to_csv(
    r'C:\Users\imran\DataspellProjects\WalidCase\data\processed\startup_dataset_clean_1560_range.csv',
    index=False,
)

In [10]:
# Bare last expression: the notebook renders the filtered frame as this cell's output.
ranged_startups

Unnamed: 0.1,Unnamed: 0,id,name,cb_description
0,0,1820,0xKYC,modular knowledge system identity credential m...
2,2,3640,10X-Genomics,create revolutionary dna sequence technology h...
3,3,9594,111Skin,commit positive luxury skincare push boundary ...
5,5,473,1stdibs,internet company offer marketplace rare desira...
6,6,7956,1v1Me,application allow user play match favorite vid...
...,...,...,...,...
3992,3992,2649,RoomLab,believe great interior design accessible avail...
3995,3995,6882,Rosaly,give ability manage advance payment request au...
3996,3996,4394,Roslin-Technologies,mission improve protein production disruptive ...
3997,3997,1036,Rossum,solve key step document base process receive d...
