# ML Script for LEL to Analyze a Startup Using Web Scraping

## Section 0 - Download Necessary Packages

In [None]:
#Download Necessary Packages
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install nltk
!pip install requests
!pip install bs4
!pip install openpyxl
!pip install selenium
!pip install bs4
!pip install time
!pip install re
!pip install tqdm
!pip install contractions
!pip install sklearn
!pip install tensorflow

## Section 1 - Import Necessary Packages

In [None]:
#Load Necessary Packages
import pandas as pd
import numpy as np
import seaborn as sns
import requests
import openpyxl
import time
import re
import contractions
import sys
import json
from tqdm import tqdm
from bs4 import BeautifulSoup
from scipy.stats import norm
import statistics

import matplotlib.pyplot as plt
from matplotlib import rc
import matplotlib.ticker as ticker
from matplotlib.font_manager import FontProperties
import matplotlib.patheffects as path_effects

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier

# Apply matplotlib's 'ggplot' style sheet to every figure drawn after this point.
plt.style.use('ggplot')

## Section 2 - Defining Function which loads ML Model
### When called, the function prompts you for the details of a startup of your choice and then analyzes it. You must pass the filepath of the saved model yourself. Example filepath: \Users\akshay\Desktop\xgb_model.model

In [3]:
def analyze_a_startup(filepath, scaler=None):
    """Interactively score a startup with a saved XGBoost model and print the result.

    The function asks for the startup's name, tries to scrape its introduction
    and industry from its CB Insights page, asks for seven 0-5 ratings and free
    text comments, assembles the feature row and prints the class
    probabilities together with everything that was entered.

    It relies on three names that are not defined in this function and are
    expected to exist at module level: ``add_dashes`` (turns the name into the
    URL slug), ``tokenizer`` (provides ``texts_to_sequences``) and ``bold``
    (formats a string for printing).

    Args:
        filepath: Location of the saved XGBoost model file.
        scaler: Optional, already fitted scaler exposing ``transform``. When
            omitted, a fresh ``StandardScaler`` is fitted on the single input
            row, as before (see the review note in the body).

    Returns:
        None. All results are printed.

    Raises:
        SystemExit: If the browser fails to load the CB Insights page.
    """
    # Load the saved model
    loaded_model = xgb.Booster()
    loaded_model.load_model(f'{filepath}')

    name = input('What is the name of the startup being analyzed?')

    # `html` stays None whenever the page could not be requested; that forces
    # the fully manual path below instead of hitting an undefined `newname`.
    html = None
    try:
        newname = add_dashes(name)
    except Exception:
        print('Error Occured with function add_dashes, manual input required')
    else:
        url = "https://www.cbinsights.com/company/" + newname + "/financials"
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            html = driver.page_source
        except Exception:
            print('Error Occured: URL not found on CBInsights')
            sys.exit(1)
        finally:
            # The page source is all that is needed; always release the browser.
            driver.quit()

    manual = html is None
    if manual:
        print('Error: Manual input will be required for all categories')
    else:
        ask = input("Would you like to manually input the introduction and industry of " + name + " (alternative is it being done automatically by the system)? Type Y or N")
        manual = ask == 'Y'

    introduction = None
    industry = None
    if not manual:
        introduction = _extract_between(html, 'png","description":"', '","url"')
        scraped_industry = _extract_between(html, ',"subindustry":"', '","idSector')
        if scraped_industry is not None:
            # Decode literal \uXXXX escapes left over from the embedded JSON.
            scraped_industry = re.sub(r'\\u[0-9a-fA-F]{4}', lambda x: chr(int(x.group()[2:], 16)), scraped_industry)
            # Only accept plausible industry labels; anything else is asked for.
            if 5 < len(scraped_industry) < 40:
                industry = scraped_industry

    # Anything that was not scraped successfully is requested from the user.
    if not introduction:
        introduction = input('Provide a brief introduction for ' + name)
    if not industry:
        industry = input('Provide the industry of ' + name)

    scores = [
        _prompt_score(stem + name + ' on a scale of 0-5')
        for stem in _SCORE_PROMPTS
    ]
    school, corporate, startup, tech, productstage, clientstage, revenue = scores
    # Each rating is truncated to an integer before summing.
    SUM = sum(int(float(score)) for score in scores)
    comments = input('Provide comments/additional notes for ' + name)

    # Column order: school, corporate, startup, tech, product stage,
    # user/client stage, revenue, SUM.
    numeric_features = np.array(
        [float(score) for score in scores] + [float(SUM)]
    ).reshape(1, -1)

    if scaler is None:
        # NOTE(review): fitting a StandardScaler on this one row makes the mean
        # equal to the row itself, so every scaled value becomes 0.0. Pass the
        # scaler that was fitted on the training data via `scaler` to give the
        # model meaningful numeric features.
        scaler = StandardScaler()
        scaler.fit(numeric_features)
    numeric_features = scaler.transform(numeric_features)

    # Preprocess textual features: each text becomes one padded sequence of
    # length 40, and the two sequences are placed side by side.
    text_sequences_list = [
        pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=40)
        for text in (introduction, industry)
    ]
    text_sequences_combined = np.concatenate(text_sequences_list, axis=1)

    # Preprocess additional features. input() always returns a string, so an
    # empty (or whitespace-only) answer is what "no comments" looks like here.
    # NOTE(review): confirm how missing comments were encoded during training.
    has_comments = 1 if comments.strip() else 0
    comments_length = len(comments)
    additional_features = np.array([[has_comments, comments_length]])

    # Combine all the features into X
    X = np.concatenate([text_sequences_combined, numeric_features, additional_features], axis=1)

    dmatrix = xgb.DMatrix(X)

    # Make predictions using the loaded model
    predictions = loaded_model.predict(dmatrix)

    # Print the predictions
    print(bold(name))
    print('Probability of startup not being a promising investment (0): ' + str(predictions[0][0] * 100) + "%")
    print('Probability of startup being a potentially promising investment (1): ' + str(predictions[0][1] * 100) + "%")
    print('Probability of startup being a very promising investment (2): ' + str(predictions[0][2] * 100) + "%")

    ranking = int(np.argmax(predictions[0]))
    verdicts = {
        0: 'The model does not consider this startup to be a promising investment.',
        1: 'The model considers this startup to be a potentially promising investment.',
        2: 'The model considers this startup to be a very promising investment.',
    }
    if ranking in verdicts:
        print(bold(f'Ranking given: {ranking}.'))
        print(verdicts[ranking])
    else:
        # str() is required: the ranking is a number, not text.
        print(bold('Ranking given: ') + str(ranking))

    print('\n' + bold('Introduction: ') + introduction)
    print(bold('Industry: ') + industry)
    print('\n' + bold('School (0-5): ') + school)
    print(bold('Corporate Experience, Top Organization (0-5): ') + corporate)
    print(bold('Startup Experience (0-5): ') + startup)
    print(bold('Tech Background (0-5): ') + tech)
    print(bold('Product Stage (0-5): ') + productstage)
    print(bold('Client Stage (0-5): ') + clientstage)
    print(bold('Revenue (0-5): ') + revenue)
    print(bold('\n' + 'SUM: ') + str(SUM))
    print(bold('Comments/Additional Notes: ') + comments)


# Prompt stems for the seven 0-5 ratings, in the order the numeric feature
# columns are assembled in analyze_a_startup.
_SCORE_PROMPTS = (
    'Rank the school of the team of ',
    'Rank the corporate experience of the team of ',
    'Rank the startup experience of the team of ',
    'Rank the tech experience of the team of ',
    'Rank the product stage of ',
    'Rank the user/client stage of ',
    'Rank the revenue of ',
)


def _extract_between(text, start_marker, end_marker):
    """Return the text between the two markers, or None if either is missing."""
    start = text.find(start_marker)
    if start == -1:
        return None
    start += len(start_marker)
    # Search for the end marker only after the start marker.
    end = text.find(end_marker, start)
    if end == -1:
        return None
    return text[start:end]


def _prompt_score(prompt):
    """Ask until the answer is a number from 0 to 5; return the answer as typed."""
    while True:
        answer = input(prompt)
        try:
            value = float(answer)
        except ValueError:
            value = None
        # The range test also rejects NaN, which compares False to everything.
        if value is not None and 0 <= value <= 5:
            return answer
        print('Please enter a number between 0 and 5.')

## Section 3 - Call Function below