In [None]:
# Mount Google Drive at /content/gdrive so the dataset folder referenced by PATH can be read.
# The google.colab package is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Dependencies

In [None]:
### Data handling libraries ###

import numpy as np
import pandas as pd
import os

### Plotting Libraries ### 

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,8)

### Date Time ###

import datetime
import time
import pytz

### Warnings ###
import warnings
warnings.filterwarnings('ignore')

### Progress Bar ###
from tqdm import tqdm

### Model Building, Model Evaluation, Model Preprocessing ###

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

# Models Imbalance # 

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# ML MODELS #

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

# Scoring Dependencies #

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
# from sklearn.metrics import average_precision_score,make_scorer
from sklearn.model_selection import cross_val_score, cross_validate, KFold
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix

# Models Saving #

import pickle

# Other #
from collections import Counter
from sklearn.utils import shuffle

# Constants

In [None]:
# Folder on the mounted Google Drive that holds the project data.
PATH = r'/content/gdrive/My Drive/MLPipeLine'
# Name of the CSV file inside PATH that is loaded into MAINDF below.
READFILE = 'investments_VC.csv'
# Placeholders: ROWS and COLUMNS are overwritten with MAINDF.shape once the file is read.
ROWS = 0
COLUMNS = 0
# Not used in the part of the notebook visible here.
SEARCHFIT = 0
# Placeholder: replaced by the column names of MAINDF below.
LISTCOLUMNNAME= []
# Names of the per-funding-type columns that are summed, plotted and counted later in the notebook.
LISTFUNDINGCOL = ['seed', 'venture',
       'equity_crowdfunding', 'undisclosed', 'convertible_note',
       'debt_financing', 'grant', 'private_equity', 'post_ipo_equity',
       'post_ipo_debt', 'secondary_market', 'product_crowdfunding', 'round_A',
       'round_B', 'round_C', 'round_D', 'round_E', 'round_F', 'round_G',
       'round_H']

# Raw dataset, decoded as latin1. Later cells work on a copy (dfProcessed) of this frame.
MAINDF = pd.read_csv(os.path.join(PATH, READFILE), encoding='latin1')
LISTCOLUMNNAME = list(MAINDF.columns)
ROWS, COLUMNS = MAINDF.shape

# Functions

In [None]:
# NOTE(review): the next cell defines funcCustomCVScore a second time; the later definition is the one in effect.
def funcCustomCVScore(fncp_X_train, fncp_y_train, fncpKFold,fncpBaseModel, fncpBaseModelParam=None, fncpRandomState=123, fncpScoreAverage='weighted'):

  '''
  Cross-validates a model class with K-fold splitting and collects recall, precision and F1 for every fold.

  A new model instance is created and fitted for each fold. Every score is a percentage rounded to two
  decimals. After the last fold the mean of each metric is printed together with two standard deviations
  (labelled as the 95% confidence interval of the estimate).

  input:
     fncp_X_train - Training features (pandas DataFrame/Series or an indexable array)
     fncp_y_train - Training target (pandas DataFrame/Series or an indexable array)
     fncpKFold - No of folds
     fncpBaseModel - Model class (not an instance). Ex: RandomForestClassifier
     (Optional) (dict) fncpBaseModelParam - Keyword arguments used to build the model in every fold
     (Optional) fncpRandomState - Seed for shuffling the folds; None disables shuffling
     (Optional) fncpScoreAverage - 'average' argument passed to recall_score / precision_score / f1_score
  output:
    recallScores - list with one recall percentage per fold
    precisionScores - list with one precision percentage per fold
    f1Scores - list with one F1 percentage per fold

  '''

  def takeRows(data, positions):
    # KFold.split yields row POSITIONS, so pandas objects must be sliced with iloc.
    # Matching positions against index labels would select the wrong rows whenever
    # the index is not 0..n-1 (e.g. after train_test_split).
    return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

  # random_state is only meaningful when shuffling; scikit-learn rejects a
  # random_state combined with shuffle=False.
  kfold = KFold(n_splits=fncpKFold, shuffle=fncpRandomState is not None, random_state=fncpRandomState)
  modelParams = {} if fncpBaseModelParam is None else fncpBaseModelParam

  recallScores = []
  precisionScores = []
  f1Scores = []
  for train_index, test_index in tqdm(kfold.split(fncp_X_train), total=fncpKFold):
    cv_X_train = takeRows(fncp_X_train, train_index)
    cv_X_test = takeRows(fncp_X_train, test_index)

    cv_y_train = takeRows(fncp_y_train, train_index)
    cv_y_test = takeRows(fncp_y_train, test_index)

    model = fncpBaseModel(**modelParams)
    model.fit(cv_X_train,cv_y_train)

    # Predict once per fold and reuse the predictions for all three metrics.
    cvPredicted = model.predict(cv_X_test)

    recallScores.append(round(recall_score(cv_y_test, cvPredicted, average=fncpScoreAverage)*100,2))
    precisionScores.append(round(precision_score(cv_y_test, cvPredicted, average=fncpScoreAverage)*100,2))
    f1Scores.append(round(f1_score(cv_y_test, cvPredicted, average=fncpScoreAverage)*100,2))
  print('\n')
  print(f'The mean score and the 95% confidence interval of the score estimate are')
  print("Recall: %0.2f (+/- %0.2f)" % (np.array(recallScores).mean(), np.array(recallScores).std() * 2))
  print("Precision: %0.2f (+/- %0.2f)" % (np.array(precisionScores).mean(), np.array(precisionScores).std() * 2))
  print("F1-Score: %0.2f (+/- %0.2f)" % (np.array(f1Scores).mean(), np.array(f1Scores).std() * 2))
  return recallScores, precisionScores, f1Scores

In [None]:
# NOTE(review): funcCustomCVScore is also defined in the previous cell; this later definition is the one in effect.
def funcCustomCVScore(fncp_X_train, fncp_y_train, fncpKFold,fncpBaseModel, fncpBaseModelParam=None, fncpRandomState=123, fncpScoreAverage='weighted'):

  '''
  Cross-validates a model class with K-fold splitting and collects recall, precision and F1 for every fold.

  A new model instance is created and fitted for each fold. Every score is a percentage rounded to two
  decimals. After the last fold the mean of each metric is printed together with two standard deviations
  (labelled as the 95% confidence interval of the estimate).

  input:
     fncp_X_train - Training features (pandas DataFrame/Series or an indexable array)
     fncp_y_train - Training target (pandas DataFrame/Series or an indexable array)
     fncpKFold - No of folds
     fncpBaseModel - Model class (not an instance). Ex: RandomForestClassifier
     (Optional) (dict) fncpBaseModelParam - Keyword arguments used to build the model in every fold
     (Optional) fncpRandomState - Seed for shuffling the folds; None disables shuffling
     (Optional) fncpScoreAverage - 'average' argument passed to recall_score / precision_score / f1_score
  output:
    recallScores - list with one recall percentage per fold
    precisionScores - list with one precision percentage per fold
    f1Scores - list with one F1 percentage per fold

  '''

  def takeRows(data, positions):
    # KFold.split yields row POSITIONS, so pandas objects must be sliced with iloc.
    # Matching positions against index labels would select the wrong rows whenever
    # the index is not 0..n-1 (e.g. after train_test_split).
    return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

  # random_state is only meaningful when shuffling; scikit-learn rejects a
  # random_state combined with shuffle=False.
  kfold = KFold(n_splits=fncpKFold, shuffle=fncpRandomState is not None, random_state=fncpRandomState)
  modelParams = {} if fncpBaseModelParam is None else fncpBaseModelParam

  recallScores = []
  precisionScores = []
  f1Scores = []
  for train_index, test_index in tqdm(kfold.split(fncp_X_train), total=fncpKFold):
    cv_X_train = takeRows(fncp_X_train, train_index)
    cv_X_test = takeRows(fncp_X_train, test_index)

    cv_y_train = takeRows(fncp_y_train, train_index)
    cv_y_test = takeRows(fncp_y_train, test_index)

    model = fncpBaseModel(**modelParams)
    model.fit(cv_X_train,cv_y_train)

    # Predict once per fold and reuse the predictions for all three metrics.
    cvPredicted = model.predict(cv_X_test)

    recallScores.append(round(recall_score(cv_y_test, cvPredicted, average=fncpScoreAverage)*100,2))
    precisionScores.append(round(precision_score(cv_y_test, cvPredicted, average=fncpScoreAverage)*100,2))
    f1Scores.append(round(f1_score(cv_y_test, cvPredicted, average=fncpScoreAverage)*100,2))
  print('\n')
  print(f'The mean score and the 95% confidence interval of the score estimate are')
  print("Recall: %0.2f (+/- %0.2f)" % (np.array(recallScores).mean(), np.array(recallScores).std() * 2))
  print("Precision: %0.2f (+/- %0.2f)" % (np.array(precisionScores).mean(), np.array(precisionScores).std() * 2))
  print("F1-Score: %0.2f (+/- %0.2f)" % (np.array(f1Scores).mean(), np.array(f1Scores).std() * 2))
  return recallScores, precisionScores, f1Scores

In [None]:
def funcPreprocessing(fncpDF):
    
    '''
    Strips white space from a dataframe in place.

    1) Removes leading and trailing white spaces in the column names.
    2) Removes leading and trailing white spaces in the string values of object datatype columns.

    Non-string column names and non-string values (numbers, None/NaN, ...) are left untouched.
    
    input
        Dataframe (modified in place)
    output
        None
    '''

    print('Column preprocessing...')
    dictCol = {col: col.strip() for col in fncpDF.columns if isinstance(col, str)}
    fncpDF.rename(columns=dictCol, inplace=True) 
    print('Object datatype preprocessing...\n')
    for col in fncpDF.columns:
        if fncpDF[col].dtype == 'object':
            # Series.str.strip() would turn every non-string entry of a mixed
            # object column into NaN, so only real strings are stripped here.
            fncpDF[col] = fncpDF[col].map(lambda value: value.strip() if isinstance(value, str) else value)
    print('Sucessfully preprocessed the dataframe!')   

In [None]:
def fncpModelEvaluvate(fncpActual, fncpPredicted, fncpBoolHeatMap=False, fncpMultiClass=True,fncpAverageType='weighted'):
  '''
  Prints recall, precision and F1 (as percentages rounded to two decimals) for a set of predictions
  and, on request, draws the confusion matrix first.
  input:
    fncpActual - Actual Values
    fncpPredicted - Predicted Values
    (optional) (bool) fncpBoolHeatMap - To display or not display confusion matrix
    (optional) (bool) fncpMultiClass - Is it a multiclass problem or binary class problem
    (optional) (str) fncpAverageType - Average type for multiclass problem
  output:
    None
  '''

  # Confusion matrix plot. make_confusion_matrix is not defined in the part of the
  # notebook visible here - presumably provided by another cell; TODO confirm.
  if fncpBoolHeatMap == True:
    make_confusion_matrix(confusion_matrix(fncpActual, fncpPredicted), figsize=(8,6), cbar=True, cmap='BrBG')
    print('\n\n')

  print('Evaluation Metrics\n')

  # Multiclass scoring forwards the averaging strategy; binary scoring relies on the scorer defaults.
  averageArgs = {'average': fncpAverageType} if fncpMultiClass == True else {}
  for metricName, metricFunc in (('Recall', recall_score), ('Precision', precision_score), ('F1', f1_score)):
    metricPercent = round(metricFunc(fncpActual, fncpPredicted, **averageArgs)*100,2)
    print(f'{metricName} Score :{metricPercent}%')


In [None]:
def funcFeatureImportance(fncpModel, fncpTrainSet, fncpCV=True):

  '''
  Builds a one-column ('importance') dataframe of feature importances indexed by the training set's
  column names, sorted from most to least important, prints its first and last 20 rows and returns it.
  input:
    (model) fncpModel - fitted model; when fncpCV is True its best_estimator_ attribute is used
    (dataframe) fncpTrainSet - frame whose columns name the features
    (optional) (bool) fncpCV - whether fncpModel is a search object exposing best_estimator_
  ouput:
    dataframe
  '''
  # A search wrapper keeps the fitted model in best_estimator_; otherwise use the model directly.
  importanceSource = fncpModel.best_estimator_ if fncpCV == True else fncpModel

  unsortedImportances = pd.DataFrame(importanceSource.feature_importances_,
                                     index=fncpTrainSet.columns,
                                     columns=['importance'])
  feature_importances = unsortedImportances.sort_values(by='importance', ascending=False)

  print("Top 20 Important Feature\n")
  print(feature_importances.head(20))
  print('\n')
  print("Bottom 20 Important Feature\n")
  print(feature_importances.tail(20))
  return feature_importances

In [None]:
def funcDescription(fncpDF, fncpPrnt=False):
    
    '''
    Summarises every column of a dataframe and optionally prints the summary.

    For object columns the number of unique values is recorded (max/min/mean are reported as 0);
    for all other columns max, min and mean are recorded (unique count is reported as 0).
    The NaN percentage is computed against the row count of fncpDF itself.

    input: 
        DataFrame
        (bool) (optional) To print or not
    output: 
        List of tuples (column name, unique values, max, min, NaN percentage, type label),
        sorted by NaN percentage in descending order. The type label is 'object' for object
        columns and 'float' for every other dtype.
    '''
    
    # Use the frame's own size; a module-level row count may describe a different dataframe.
    totalRows = fncpDF.shape[0]
    lstTemp = []
    dictColMean = {}  # mean per reported column name; kept aside so the returned tuple layout is unchanged
    for col in tqdm(fncpDF.columns):
        nanPercent = round((fncpDF[col].isna().sum()/totalRows)*100, 2) if totalRows else 0.0
        if fncpDF[col].dtype == 'object':
            colName = col.strip()
            colUniqueValue = fncpDF[col].nunique()
            colMax = 0
            colMin = 0
            colMean = 0
            colType = 'object'
        else:
            colName = col
            colUniqueValue = 0
            colMax = fncpDF[col].max()
            colMin = fncpDF[col].min()
            colMean = round(fncpDF[col].mean(),2)
            colType = 'float'
        dictColMean[colName] = colMean
        lstTemp.append((colName, colUniqueValue, colMax, colMin, nanPercent, colType))
    lstTemp = sorted(lstTemp, key=lambda x: x[4], reverse=True)
    if fncpPrnt:
        print(f'Total no of rows : {fncpDF.shape[0]} \nTotal no of columns : {fncpDF.shape[1]}')
        totalNaNRows = fncpDF[fncpDF.isna().any(axis=1)].shape[0]
        print(f'Rows with atleast one of the columns with a NaN value : {totalNaNRows}')
        print('\n')
        for item in lstTemp:
            print(f'------------------- {item[0]}-------------------')
            print(f'Unique values in column : {item[1]}')
            print(f'Max value in column : {item[2]}')
            print(f'Min value in column : {item[3]}')
            # item[4] is the NaN percentage, so the mean is looked up separately.
            print(f'Mean Value in column : {dictColMean[item[0]]}')
            print(f'NaN percentage in column : {item[4]}')
            print('\n')
    return lstTemp

In [None]:
### Date preprocessing ###

def funcDateManipulation(fncpDF, fncpColName, funcStartsWith='00'):
    '''
    The expected date format is yyyy-mm-dd but few of the dates are not in this format. This function
    rewrites, in place, every value of the column that starts with funcStartsWith (default '00').

    A matching value is rebuilt as '20' + <last two characters> + '-' + <characters 5-6> + '-' + <characters 2-3>,
    e.g. '0007-05-13' becomes '2013-05-07'. Values that do not start with the prefix, including NaN,
    are left untouched.
    
    input: 
        DataFrame (modified in place)
        (str) fncpColName, 
        (str) (optional) funcStartsWith
    output: 
        None
    '''
    print(f'Processing column {fncpColName}....')
    # One exact mask for all affected rows. Looking each bad value up again with
    # str.contains would treat it as a regular expression and match substrings.
    maskBadDate = fncpDF[fncpColName].str.startswith(funcStartsWith, na=False)
    lstTemp = fncpDF.loc[maskBadDate, [fncpColName]].values.tolist()
    print(lstTemp)
    if maskBadDate.any():
        badDates = fncpDF.loc[maskBadDate, fncpColName]
        year = badDates.str[-2:]
        month = badDates.str[5:7]
        day = badDates.str[2:4]
        fncpDF.loc[maskBadDate, fncpColName] = ('20' + year + '-' + month + '-' + day).values
    print(f'Sucessfully changed the date for {fncpColName} variable')

In [None]:
def funcCustomOneHotEncode(fncpDF, fncpColName,fncpTop=5, fncpBottom=5, fncpMiddle=10, fncpDropOriginal=True):
    
    '''
    One hot encodes only a selection of the categories of a feature: the fncpTop most frequent, the
    fncpBottom least frequent and fncpMiddle categories starting at the middle of the frequency ranking.
    A column named f_<feature>_<category> (1 where the row has that category, else 0) is added to the
    dataframe in place for every selected category. The imputed category 'Unknown' is never selected;
    when it falls inside a selection the next category in the ranking takes its place.

    input: 
        DataFrame (modified in place), 
        (str) Variable/Feature,
        (int) (optional) fncpTop n categories, 
        (int) (optional) fncpBottom n categories, 
        (int) (optional) fncpMiddle n categories, 
        (bool) (optional) To drop original variable
    output: 
        list of the selected categories, without repeats (Success) 
        -1 when the feature is not a column of the dataframe (Failure)
    '''
    
    if fncpColName not in fncpDF.columns:
        return -1
    
    uniqueMarketsCount = len(fncpDF[fncpColName].unique())
    print(f'There are in total {uniqueMarketsCount} unique categories in {fncpColName} feature \n')

    # Frequency rankings, computed once and shared by the three selections.
    categoryCounts = fncpDF[fncpColName].value_counts()
    listMostFrequentFirst = categoryCounts.sort_values(ascending = False).index.tolist()
    listLeastFrequentFirst = categoryCounts.sort_values(ascending = True).index.tolist()

    def pickWithoutUnknown(listRanked, startPos, endPos):
        # Take listRanked[startPos:endPos]; if the imputed 'Unknown' category is inside,
        # widen the window by one and drop it so the requested number of real categories remains.
        listPicked = listRanked[startPos:endPos]
        if 'Unknown' in listPicked:
            listPicked = listRanked[startPos:endPos + 1]
            listPicked.remove('Unknown')
        return listPicked

    # Top X
    listfncpTopXCategories = pickWithoutUnknown(listMostFrequentFirst, 0, fncpTop)
    print(f'fncpTop {len(listfncpTopXCategories)} markets are:\n ')
    print({*listfncpTopXCategories}, sep = ", ")

    # Mid X
    startPos = int(uniqueMarketsCount/2)
    endPos = startPos + fncpMiddle
    listfncpMiddleXCategories = pickWithoutUnknown(listLeastFrequentFirst, startPos, endPos)
    print(f'\nfncpMiddle {len(listfncpMiddleXCategories)} markets are:\n ')
    print({*listfncpMiddleXCategories}, sep = ", ")

    # Bottom X ('Unknown' is checked against the bottom selection itself, not the top one)
    listfncpBottomXCategories = pickWithoutUnknown(listLeastFrequentFirst, 0, fncpBottom)
    print(f'\nfncpBottom {len(listfncpBottomXCategories)} markets are:\n ')
    print({*listfncpBottomXCategories}, sep = ", ")

    # With few categories the three selections can overlap; keep the first occurrence of each label.
    listFinalXCategories = list(dict.fromkeys(listfncpTopXCategories + listfncpMiddleXCategories + listfncpBottomXCategories))
    for label in listFinalXCategories:
      fncpDF[f'f_{fncpColName}_{label}'] = np.where(fncpDF[fncpColName] == label,1,0)
    
    if fncpDropOriginal == True:
        fncpDF.drop(fncpColName, axis=1, inplace=True)
    print(f'Sucessfully implemented custom one hot encoding for {fncpColName} variable...')
    return listFinalXCategories

In [None]:
def funcColumnsToDrop(fncpDF, fncpLstColToDrop = []):
    
    '''
    Drops the listed column(s) from the dataframe in place, but only when the list is non-empty and
    every listed column exists in the dataframe.
    input: Dataframe, list of columns to drop
    output: remaining columns of the dataframe (Success) / -1 (Failure, nothing is dropped)
    '''
    
    # Guard clause: refuse an empty request or one that names a missing column.
    nothingRequested = len(fncpLstColToDrop) == 0
    if nothingRequested or not set(fncpLstColToDrop).issubset(fncpDF.columns):
        print('No columns to drop or one of the columns present in the list is not available in the dataframe')
        return -1

    fncpDF.drop(columns=fncpLstColToDrop, inplace=True)
    return fncpDF.columns

In [None]:
def funcHeatMap(funcpX, funcpY, funcpSize):
  '''
  Draws a custom heatmap as a scatter plot of green squares on a new figure.
  The sorted unique values of funcpX and funcpY become the axis categories, and each point's square
  area is funcpSize multiplied by a fixed scale.
  Please refer https://towardsdatascience.com/better-heatmaps-and-correlation-matrix-plots-in-python-41445d0f2bec for in depth explanation of this function.
  '''
  squareScale = 270

  _, ax = plt.subplots()

  # Category label -> integer grid coordinate, in sorted label order
  xCategories = sorted(funcpX.unique())
  yCategories = sorted(funcpY.unique())
  xCoordinate = {label: position for position, label in enumerate(xCategories)}
  yCoordinate = {label: position for position, label in enumerate(yCategories)}

  # One square per (x, y) pair, sized by funcpSize
  ax.scatter(x=funcpX.map(xCoordinate),
             y=funcpY.map(yCoordinate),
             s=funcpSize * squareScale,
             c='green',
             alpha=0.8,
             marker='s')

  # Major ticks sit on the categories; labels show spaces instead of underscores
  ax.set_xticks([xCoordinate[label] for label in xCategories])
  ax.set_xticklabels([label.replace('_', ' ') for label in xCategories], rotation=90, horizontalalignment='right')
  ax.set_yticks([yCoordinate[label] for label in yCategories])
  ax.set_yticklabels([label.replace('_', ' ') for label in yCategories])
  ax.tick_params(axis='x', colors='white',labelsize=12)
  ax.tick_params(axis='y', colors='white')

  # Grid lines are drawn on minor ticks placed half a step after each major tick
  ax.grid(False, 'major')
  ax.grid(True, 'minor')
  ax.set_xticks([tick + 0.5 for tick in ax.get_xticks()], minor=True)
  ax.set_yticks([tick + 0.5 for tick in ax.get_yticks()], minor=True)

  # Half a cell of padding around the outermost categories
  ax.set_xlim([-0.5, max(xCoordinate.values()) + 0.5])
  ax.set_ylim([-0.5, max(yCoordinate.values()) + 0.5])

# Data Preprocessing

#### Stripping white spaces

In [None]:
### Renaming the columns to remove the white spaces ###
# Work on a copy so MAINDF keeps the raw data; funcPreprocessing strips the copy in place.
dfProcessed = MAINDF.copy()
funcPreprocessing(dfProcessed)
# Refresh the column list with the stripped names (a pandas Index here, not a list).
LISTCOLUMNNAME = dfProcessed.columns

#### Dropping Rows and Columns ###


In [None]:
### Checking and deleting rows where all the values are NaN ###

# Number of rows in which every column is NaN (reported only).
allNAValues = dfProcessed[dfProcessed.isna().all(axis=1)].shape[0]
print(f'There are {allNAValues} rows that have only NaN as values')
dfProcessed = dfProcessed.dropna(how='all')
# Renumber the rows 0..n-1 after the drop.
dfProcessed = dfProcessed.reset_index(drop=True)

In [None]:
### Checking duplicate records in a dataframe ###

# duplicated() flags every repeat of an earlier identical row, so the first occurrence is kept.
duplicateRecords = dfProcessed[dfProcessed.duplicated()].shape[0]
if duplicateRecords == 0:
  print(f'There are {duplicateRecords} duplicate records in the dataframe')
else:
  print(f'{duplicateRecords} duplicate records are dropped from the dataframe')
  # The index is not reset here, so row labels are no longer contiguous after this step.
  dfProcessed = dfProcessed[~dfProcessed.duplicated()]

In [None]:
### Dropping rows where target variable 'status' is NaN ### 

# Count of rows without a target value (reported only).
targetNAValues = dfProcessed.loc[dfProcessed['status'].isna() == True,].shape[0]
print(f'There are {targetNAValues} rows for which target variable has NaN as values')
# Keep only the rows that have a target value.
dfProcessed = dfProcessed.loc[dfProcessed['status'].isna() == False,]

In [None]:
### Removing the rows for which we dont have total funding and the breakup of the funding is null as well ###

print('The list of variables associated with funding are \n', LISTFUNDINGCOL)
# Row-wise sum of the individual funding columns.
# NOTE(review): f_SumCol is not dropped in the visible part of the notebook, so it stays in the
# frame as a feature - confirm that this is intended.
dfProcessed['f_SumCol'] = dfProcessed[LISTFUNDINGCOL].sum(axis = 1)
# Rows whose total funding is the placeholder string '-'.
rowsTotalSumCheck = dfProcessed.loc[(dfProcessed['funding_total_usd'] == '-'),['funding_total_usd']].shape[0]
print(f'There are {rowsTotalSumCheck} rows whose total funding is none ("-")')
# Of those, rows where the individual funding columns still add up to more than zero.
rowsTotalSumCheck = dfProcessed.loc[(dfProcessed['funding_total_usd'] == '-') & (dfProcessed['f_SumCol'] > 0),].shape[0]
print(f'There are {rowsTotalSumCheck} rows whose total funding is none but sum of the total funding is more than zero')

### Selecting only the rows for which the variable funding_total_usd is not equal to '-' ###
dfProcessed = dfProcessed.loc[~(dfProcessed['funding_total_usd'] == '-'),]

In [None]:
### Dropping rows in which at least 80% of the columns are NaN ###

# Percentage of missing columns per row, computed for the whole frame at once
# instead of one Python-level iloc lookup per row.
rowNanPercent = (dfProcessed.isnull().sum(axis=1)/dfProcessed.shape[1]*100).round(2)
highNanMask = rowNanPercent >= 80
# Row positions of the dropped rows, kept under the original name.
lstHighNanRow = np.flatnonzero(highNanMask.to_numpy()).tolist()
print(f'\nNumber of rows deleted is {len(lstHighNanRow)}')
dfProcessed = dfProcessed.loc[~highNanMask]

In [None]:
### Removing permalink and name since this would not contribute to the model ###

lstDropColumns = ['permalink', 'name']
# funcColumnsToDrop drops in place and returns the remaining columns, or -1 when nothing was dropped.
LISTCOLUMNNAME = funcColumnsToDrop(dfProcessed, lstDropColumns)

In [None]:
### Dropping region based columns ###

# Replacing Nan with 'Unknown' where all the region based columns have NaN values 
lstcolumns = ['country_code', 'state_code','region', 'city']
dfProcessed.loc[(dfProcessed[lstcolumns].isna().all(axis=1) == True), lstcolumns] = 'Unknown'

# Report the remaining NaN count of each region based column.
txt = 'Total Null values is '
print('country_code')
print(txt, dfProcessed[ 'country_code'].isnull().sum())
print('\n')
print('state_code')
print(txt, dfProcessed[ 'state_code'].isnull().sum())
print('\n')
print('region')
print(txt, dfProcessed[ 'region'].isnull().sum())
print('\n')
print('city')
print(txt, dfProcessed[ 'city'].isnull().sum())
print('\n')

# Country distribution of the rows that do have a state_code.
print(dfProcessed.loc[(dfProcessed['state_code'].isnull() == False), lstcolumns]['country_code'].value_counts())

# Since the state_code has the maximum number of null values and remaing non-null values belong to US and Canada,
# we will be removing the state_code variable

lstDropColumns = ['state_code']
LISTCOLUMNNAME = funcColumnsToDrop(dfProcessed, lstDropColumns)

In [None]:
### Dropping City column ###

# Number of distinct city values (unique() also counts NaN as a value).
uniqueCityCount = len(dfProcessed['city'].unique())
# Distinct cities as a percentage of the number of rows.
uniqueRatioPercent = round(uniqueCityCount/len(dfProcessed['city'])*100,2)
print(f'There are {uniqueCityCount} unique cities and hence dropping the column since uniquness ratio is low ({uniqueRatioPercent})')
lstDropColumns = ['city']
LISTCOLUMNNAME = funcColumnsToDrop(dfProcessed, lstDropColumns)

In [None]:
### Removing funding_total_usd since this is just an addition of all the funding columns###

lstDropColumns = ['funding_total_usd'] #, 'f_SumCol']
LISTCOLUMNNAME = funcColumnsToDrop(dfProcessed, lstDropColumns)

### Combining Seed investment and Angel Investment into one since both are the same.

# fill_value=0 keeps the known amount when only one of the two columns is NaN
# (a plain '+' would turn the combined value into NaN); NaN in both stays NaN.
dfProcessed['seed'] = dfProcessed['seed'].add(dfProcessed['angel'], fill_value=0)
lstDropColumns = ['angel']
LISTCOLUMNNAME = funcColumnsToDrop(dfProcessed, lstDropColumns)

# Exploratory Data Analysis



In [None]:
# Per-column summary of the raw dataframe (MAINDF, not the cleaned dfProcessed); nothing is printed.
listColCharacterstics = funcDescription(MAINDF, fncpPrnt=False)

In [None]:
### Imbalanced Dataset ###
# Bar chart of the number of rows per 'status' value (NaN included) on a black background.


plt.figure(figsize = (10,5), facecolor = 'black', dpi=70) 
# sns.set changes the global seaborn/matplotlib style, so later figures inherit the black background.
sns.set(rc={'axes.facecolor':'black', 
            'figure.facecolor':'black', 
            'axes.spines.top': False

            })
ax = dfProcessed['status'].value_counts(dropna = False).plot(kind='bar', color = 'green', alpha = 0.8)
ax.set_xlabel('')
ax.set_ylabel('CATEGORYWISE COUNT')
ax.set()
# White labels and ticks so they stay readable on the black background.
ax.yaxis.label.set_color('white')
ax.xaxis.label.set_color('white')
ax.tick_params(axis='x', colors='white',labelsize=12)
ax.tick_params(axis='y', colors='white',labelsize=12)
ax.tick_params(axis='y', colors='white')
sns.despine()
plt.xticks(rotation=45)
plt.show()

In [None]:
### Status VS Fundings ###
# One bar chart per funding column showing its mean value for every status.

print(LISTFUNDINGCOL)

fig = plt.figure(figsize = (21,21), facecolor = 'black', dpi=70) 

# Seeded generator so the bar colours are the same on every run.
colourRng = np.random.default_rng(42)
# Three plots per row; the number of rows follows the number of funding columns.
plotsPerRow = 3
gridRows = -(-len(LISTFUNDINGCOL) // plotsPerRow)

for num, c in enumerate(LISTFUNDINGCOL, start=1):
  ax = fig.add_subplot(gridRows, plotsPerRow, num)
  # Draw on the subplot explicitly instead of relying on the current-axes state.
  ax = dfProcessed.groupby('status')[c].agg('mean').plot(kind='bar', alpha = .8, color=tuple(colourRng.random(3)), ax=ax)

  ax.set_xlabel('')
  # The plotted aggregate is the mean, so the axis label says so.
  temp = 'Mean "' + c.replace('_',' ') + '" value'
  ax.set_ylabel(temp)
  ax.set_title('')
  ax.xaxis.label.set_color('white')
  ax.yaxis.label.set_color('white')
  ax.tick_params(axis='x', colors='white',labelsize=12)
  ax.tick_params(axis='y', colors='white')
  sns.despine()
  plt.xticks(rotation=45)

plt.tight_layout()
plt.subplots_adjust(hspace=0.7, wspace = 0.5)

In [None]:
### Correlation HeatMap ###

# Correlate the funding columns together with funding_rounds.
listTempCol = LISTFUNDINGCOL.copy()
listTempCol.append('funding_rounds')
corr = dfProcessed[listTempCol].corr()
corr = pd.melt(corr.reset_index(), id_vars='index') # Unpivot the dataframe, so we can get pair of arrays for x and y
corr.columns = ['x', 'y', 'value']
# Square size follows the absolute correlation, so the sign of the correlation is not shown.
funcHeatMap(
    funcpX=corr['x'],
    funcpY=corr['y'],
    funcpSize=corr['value'].abs()
)

# Feature Engineering

#### Date variable


In [None]:
### Found out while converting the object type to date type ###
# Rows whose founded_at contains '1636' are removed; rows with a NaN founded_at are kept.
# NOTE(review): str.contains matches '1636' anywhere in the value, not only in the year part.

wrongDateCount = dfProcessed.loc[dfProcessed['founded_at'].str.contains('1636', na=False),].shape[0]
print(f'There are {wrongDateCount} record(s) whose year is 1636')
dfProcessed = dfProcessed.loc[~dfProcessed['founded_at'].str.contains('1636', na=False),]

In [None]:
### List of years starting with '00'. This was found while trying to convert the column datatype to datetime ###
# Each column is shown first, then rewritten in place by funcDateManipulation.

print(dfProcessed.loc[dfProcessed['first_funding_at'].str.startswith('00', na=False), ['first_funding_at']])
print('\n')
funcDateManipulation(dfProcessed, 'first_funding_at')
print('\n')
print(dfProcessed.loc[dfProcessed['last_funding_at'].str.startswith('00', na=False), ['last_funding_at']])
print('\n')
funcDateManipulation(dfProcessed, 'last_funding_at')

In [None]:
### Found out while converting the object type to date type ###
# Values containing the truncated date '201-01-01' are replaced with '2010-01-01'.
# NOTE(review): str.contains matches the text anywhere in the value - confirm no valid date contains it.

dfProcessed.loc[dfProcessed['first_funding_at'].str.contains('201-01-01', na=False),['first_funding_at']] = '2010-01-01'
dfProcessed.loc[dfProcessed['last_funding_at'].str.contains('201-01-01', na=False),['last_funding_at']] = '2010-01-01'
print('Date sucessfully changed')

In [None]:
### Changing object type to date type ###
# The three date columns are converted to datetime64[ns] after the string level fixes above.

dfProcessed['founded_at'] = dfProcessed['founded_at'].astype('datetime64[ns]')
dfProcessed['first_funding_at'] = dfProcessed['first_funding_at'].astype('datetime64[ns]')
dfProcessed['last_funding_at'] = dfProcessed['last_funding_at'].astype('datetime64[ns]')

#### Market variable

In [None]:
### Market variable ###
 
# Strip surrounding white space from the market names.
dfProcessed['market'] = dfProcessed['market'].str.strip() 
marketNAValues = dfProcessed.loc[(dfProcessed['market'].isna() == True),].shape[0]
print(f'There are in total {marketNAValues} rows for which the market has NaN value')
# 'Unknown' is the imputed label that funcCustomOneHotEncode excludes from its selections.
dfProcessed.loc[(dfProcessed['market'].isna() == True),['market']] = 'Unknown' # Replacing NaN as UnKnown

In [None]:
### Market variable selecting only top 5, bottom 5 and mid 10 and one-hot encoding###
# The call adds the f_market_* columns to dfProcessed in place and drops 'market';
# the returned value is the list of encoded categories.

market_variableOneHotCol = funcCustomOneHotEncode(dfProcessed, 'market')

#### Region Variables

In [None]:
### country_code variable selecting only top 5, bottom 5 and mid 10 and one-hot encoding###
# The call adds the f_country_code_* columns to dfProcessed in place and drops 'country_code'.

country_code_variableOneHotCol = funcCustomOneHotEncode(dfProcessed, 'country_code')

In [None]:
### region variable selecting only top 5, bottom 5 and mid 10 and one-hot encoding###
# The call adds the f_region_* columns to dfProcessed in place and drops 'region'.

region_variableOneHotCol = funcCustomOneHotEncode(dfProcessed, 'region')

#### Funding Rounds

In [None]:
### Correcting total funding_rounds ###

# Same column names as LISTFUNDINGCOL, repeated here as a local list.
lstCol = ['seed', 'venture', 'equity_crowdfunding', 'undisclosed', 
       'convertible_note', 'debt_financing', 'grant', 'private_equity', 'post_ipo_equity', 
       'post_ipo_debt', 'secondary_market', 'product_crowdfunding', 'round_A', 'round_B', 
       'round_C', 'round_D', 'round_E', 'round_F', 'round_G', 'round_H']
    
print('The list of variables associated with funding are \n', LISTFUNDINGCOL)
dfProcessed['r_Count']=dfProcessed.loc[:,lstCol].ge(1).sum(axis=1) # Per row: number of funding columns whose value is greater than or equal to 1
incorrectcount = dfProcessed.loc[dfProcessed['r_Count'] != dfProcessed['funding_rounds'], ['r_Count', 'funding_rounds']].shape[0]
print(f'\nThere are {incorrectcount} rows whose to funding rounds do not match with the actual funding rounds')
# funding_rounds is overwritten with the recomputed count; the helper column is then dropped.
dfProcessed['funding_rounds'] = dfProcessed['r_Count']
lstDropColumns = ['r_Count']
LISTCOLUMNNAME = funcColumnsToDrop(dfProcessed, lstDropColumns)

#### Handling Outliers

In [None]:
### Handling outliers if any exist in int and float datatypes ###
# Every float64/int64 column with more than two distinct values is capped at its 25th and 75th percentile.
# dictOutlierTuple maps column -> (lower cap, upper cap, rows below the lower cap, rows above the upper cap).
# NOTE(review): capping at the quartiles themselves changes about half of the values of a column and
# flattens columns that are mostly zero - confirm that this is intended rather than IQR based fences.

dictOutlierTuple = {}

for col in dfProcessed.columns:
    if (((dfProcessed[col].dtype)=='float64') | ((dfProcessed[col].dtype)=='int64')):
        # 0/1 indicator columns (e.g. the one-hot encoded f_* features) have no outliers;
        # capping them at their quartiles would turn every rare category into a constant column.
        if dfProcessed[col].nunique(dropna=True) <= 2:
            continue
        lowerQuantileValue, upperQuantileValue = dfProcessed[col].quantile([.25, .75]).values
        # Count the affected rows BEFORE capping; counting afterwards always gives zero.
        lowCount = int((dfProcessed[col] < lowerQuantileValue).sum())
        highCount = int((dfProcessed[col] > upperQuantileValue).sum())
        dfProcessed[col] = dfProcessed[col].clip(lower=lowerQuantileValue, upper=upperQuantileValue)
        dictOutlierTuple[col] = (lowerQuantileValue, upperQuantileValue, lowCount, highCount)
print(f'Rows affected (LowerQuantile Value, UpperQuantlie Value, LowerQuantlie Rows, Upperquantile Rows)\n\n {dictOutlierTuple}')

#### Category List

In [None]:
### Replaces NaN with single pipe and then checks the number of pipes in the string. If greater than two multi category 
### is marked as 1 and if not marked as 0.
# presumably category_list values look like '|A|B|', so more than two pipes means several categories - TODO confirm.

dfProcessed.loc[dfProcessed['category_list'].isna() == True,['category_list']] = '|'
dfProcessed['f_Multi_Category'] = [1 if x.count('|') > 2 else 0 for x in dfProcessed['category_list']] 
lstDropColumns = ['category_list'] 
LISTCOLUMNNAME = funcColumnsToDrop(dfProcessed, lstDropColumns)

#### Homepage_URL

In [None]:
# Missing URLs become the 3-character placeholder 'NAN', so f_URL is 1 only for values longer
# than 3 characters; the original homepage_url column is dropped afterwards.
dfProcessed.loc[dfProcessed['homepage_url'].isna() == True,['homepage_url']] = 'NAN'
dfProcessed['f_URL'] = [1 if len(x) > 3 else 0 for x in dfProcessed['homepage_url']]
lstDropColumns = ['homepage_url'] 
LISTCOLUMNNAME = funcColumnsToDrop(dfProcessed, lstDropColumns)

#### Year Related Columns

In [None]:
### Checking to impute founded year using the other columns
# Each print lists the rows where founded_at is missing although the related founded_* column has a value.

print(dfProcessed.loc[((dfProcessed['founded_month'].isna() == False) & (dfProcessed['founded_at'].isna() == True)) ,['founded_at','founded_month']])
print('\n')
print(dfProcessed.loc[((dfProcessed['founded_quarter'].isna() == False) & (dfProcessed['founded_at'].isna() == True)) ,['founded_at', 'founded_quarter']])
print('\n')
print(dfProcessed.loc[((dfProcessed['founded_year'].isna() == False) & (dfProcessed['founded_at'].isna() == True)) ,['founded_at', 'founded_year']])

In [None]:
### Calculating the age of the company and removing all the companies whose age is greater than 30 years

# Year parts of the three date columns (r_ prefixed helper columns).
dfProcessed['r_first_funding_year'] = pd.DatetimeIndex(dfProcessed['first_funding_at']).year
dfProcessed['r_last_funding_year'] = pd.DatetimeIndex(dfProcessed['last_funding_at']).year
dfProcessed['r_founded_year'] = pd.DatetimeIndex(dfProcessed['founded_at']).year

# Missing founded years are imputed as: first funding year minus the (rounded) mean gap
# between founding and first funding over the rows where both are known.
dfProcessed['r_diff'] = dfProcessed['r_first_funding_year'] - dfProcessed['r_founded_year']
dfProcessed['r_diff_mean'] = round(dfProcessed['r_diff'].mean(skipna=True),0)
dfProcessed.loc[dfProcessed['r_founded_year'].isna() == True,['r_founded_year']] = dfProcessed['r_first_funding_year'] - dfProcessed['r_diff_mean']

# Age is measured against the hard-coded year 2020.
dfProcessed['f_age'] = 2020 - dfProcessed['r_founded_year']
ageCondRows = dfProcessed.loc[((dfProcessed['f_age'] > 30) | (dfProcessed['f_age'] < 0)),].shape[0]
print(f'There are {ageCondRows} whose age is greater than 30 or less than zero')
# Rows whose f_age is NaN fail both comparisons and are removed as well.
dfProcessed = dfProcessed.loc[((dfProcessed['f_age'] <= 30) & (dfProcessed['f_age'] >= 0)),]

In [None]:
### Changing the 'first funding year' whose value is less than 'founded year' to be the same as 'founded year' ###
# After the correction the feature f_yearstoFirstFunding cannot be negative.

dfProcessed.loc[dfProcessed['r_first_funding_year'] < dfProcessed['r_founded_year'],['r_first_funding_year']] = dfProcessed['r_founded_year']
dfProcessed['f_yearstoFirstFunding'] = dfProcessed['r_first_funding_year'] - dfProcessed['r_founded_year']

In [None]:
### Changing the 'last funding year' whose value is less than 'first funding year' to be the same as 'first funding year' ###
# After the correction the feature f_FirstFundingToLastFunding cannot be negative.

dfProcessed.loc[dfProcessed['r_last_funding_year'] < dfProcessed['r_first_funding_year'] ,['r_last_funding_year']] = dfProcessed['r_first_funding_year']
dfProcessed['f_FirstFundingToLastFunding'] = dfProcessed['r_last_funding_year'] - dfProcessed['r_first_funding_year']

In [None]:
### Dropping columns related to dates ###

# Each column is listed exactly once (the original list named 'founded_at' twice).
lstDropColumns = [
    # intermediate helper columns created while engineering the 'f_*' features
    'r_first_funding_year', 'r_last_funding_year', 'r_founded_year', 'r_diff', 'r_diff_mean',
    # raw date columns
    'founded_at', 'founded_month', 'founded_quarter', 'founded_year',
    'first_funding_at', 'last_funding_at',
]
# funcColumnsToDrop is defined elsewhere in the notebook; its return value is kept in LISTCOLUMNNAME.
LISTCOLUMNNAME = funcColumnsToDrop(dfProcessed, lstDropColumns)

#### Target Variable

In [None]:
### Label Encoding Target Variable ### 

# The fitted encoder is kept in `label_encoder` because the deployment cell at the
# end of the notebook uses it to map predicted integer codes back to status labels.
label_encoder = preprocessing.LabelEncoder() 
# Replace the 'status' column in place with its integer class codes.
dfProcessed['status']= label_encoder.fit_transform(dfProcessed['status']) 

In [None]:
# Record the final dataset dimensions in the notebook-level constants and report them.
ROWS, COLUMNS = dfProcessed.shape
print(f'Total number of rows in the final dataset : {ROWS} \nTotal number of columns in the final dataset : {COLUMNS}')

In [None]:
### TRAIN TEST SPLIT ###
# Features are every column except the target; the target stays a one-column DataFrame
# because later cells access y_train['status'] and y_train.columns.
X = dfProcessed.drop(columns=['status'])
y = dfProcessed[['status']]
# 75 % train / 25 % test with a fixed seed. NOTE(review): the split is not stratified on 'status'.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)
# The unsplit frames are no longer needed.
del X, y

# Model Building

## Imbalanced dataset

### Dummy Classifier ###


In [None]:
# Baseline that always predicts the most frequent class seen during fit.
dummy_clf = DummyClassifier(strategy="most_frequent")

#### Model Training

In [None]:
# Fit the baseline on the original (un-resampled) training split.
dummy_clf.fit(X_train, y_train)

#### Model Prediction

In [None]:
# Baseline predictions for the held-out test split; `predictedValues` is reused
# (overwritten) by the later model sections.
predictedValues = dummy_clf.predict(X_test)
print('Sucessfully predicted the values')

#### Cross Validation score

In [None]:
# Cross-validate the most-frequent baseline on the original training split
# (fncpKFold=10, weighted score averaging); the helper returns three score collections.
recallScores, precisionScores, f1Scores = funcCustomCVScore(
    fncp_X_train=X_train,
    fncp_y_train=y_train,
    fncpKFold=10,
    fncpBaseModel=DummyClassifier,
    fncpBaseModelParam={'strategy': 'most_frequent'},
    fncpRandomState=123,
    fncpScoreAverage='weighted',
)

#### Test Model Evaluation

In [None]:
# Score the baseline's test-set predictions (multi-class, weighted averaging, heat map disabled).
fncpModelEvaluvate(
    fncpActual=y_test,
    fncpPredicted=predictedValues,
    fncpBoolHeatMap=False,
    fncpMultiClass=True,
    fncpAverageType='weighted',
)

### Vanilla Randomforest Classifier


In [None]:
# Random forest with default hyper-parameters. The seed is fixed (same value as the
# train/test split) so the fitted forest and its test metrics are reproducible on
# a fresh kernel run.
classiRandomForest = RandomForestClassifier(random_state=42)

In [None]:
# NOTE(review): this cell fits and predicts in one go; the "Model Training" and
# "Model Prediction" cells that follow repeat exactly these two steps, so the
# forest is trained twice. One of the two copies looks redundant.
classiRandomForest.fit(X_train, y_train)
predictedValues = classiRandomForest.predict(X_test)
print('Sucessfully predicted the values')

##### Model Training

In [None]:
# Fit the default random forest on the original (imbalanced) training split.
classiRandomForest.fit(X_train, y_train)

##### Model Prediction

In [None]:
# Test-set predictions of the default random forest (overwrites the baseline's `predictedValues`).
predictedValues = classiRandomForest.predict(X_test)
print('Sucessfully predicted the values')

##### Cross Validation Score

In [None]:
# Cross-validate a default-parameter random forest on the original training split
# (fncpKFold=10, weighted score averaging; no extra model parameters are passed).
recallScores, precisionScores, f1Scores = funcCustomCVScore(
    fncp_X_train=X_train,
    fncp_y_train=y_train,
    fncpKFold=10,
    fncpBaseModel=RandomForestClassifier,
    fncpBaseModelParam=None,
    fncpRandomState=123,
    fncpScoreAverage='weighted',
)

##### Test Model Evaluation

In [None]:
# Score the default random forest's test-set predictions (multi-class, weighted averaging).
fncpModelEvaluvate(
    fncpActual=y_test,
    fncpPredicted=predictedValues,
    fncpBoolHeatMap=False,
    fncpMultiClass=True,
    fncpAverageType='weighted',
)

### Random forest with optimum parameters

###### Gridsearch Parameters

In [None]:
# Search space sampled by RandomizedSearchCV in the training cells below.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'sqrt' replaces the former 'auto' option: for classifiers the two are the same
# setting, but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid (keys are RandomForestClassifier constructor arguments)
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(f'The hyperparameters are \n{random_grid}')

###### Random Search CV Training

In [None]:
# Randomised hyper-parameter search over `random_grid` (100 sampled candidates,
# 3-fold CV, all cores) on the original training split, timed end-to-end and
# pickled under a timestamped file name below PATH/Model.
start_time = time.time()
model = RandomForestClassifier()
current_time = datetime.datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%d%m%Y_%H%M%S")
modelFileName = PATH+'/Model/cv_model_'+ current_time +'.sav'
# Make sure the target folder exists *before* the long search starts, so the
# result cannot be lost to a missing directory at save time.
os.makedirs(os.path.dirname(modelFileName), exist_ok=True)
print(f'Model training start time : {current_time}\n')
rf_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=100, cv=3, verbose=3, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

# Saving Model #
# The context manager flushes and closes the file even if pickling raises
# (the previous bare open() left the handle dangling).
with open(modelFileName, 'wb') as modelFile:
    pickle.dump(rf_random, modelFile)

print(f'Minutes taken to complete training : {(time.time() - start_time)/60}')

In [None]:
### Loading saved model ###
# NOTE(review): this path is hard-coded and sits under a different Drive folder
# than PATH; confirm it points at the search result you intend to analyse.
chosenFilePath = r'/content/gdrive/My Drive/AIML/DIY_Programs/MLPipeLine/Model/cv_model_24082020_110100.sav'
# pickle.load can execute arbitrary code: only load files you created yourself.
# The context manager closes the file handle once the model is deserialised.
with open(chosenFilePath, 'rb') as modelFile:
    loaded_model = pickle.load(modelFile)
print('Sucessfully loaded the model!!!')

In [None]:
# Report every tuned hyper-parameter selected by the search, one per line.
for paramName, paramValue in loaded_model.best_params_.items():
    print(f'The best "{paramName}" hyperparameter is: {paramValue} ')

In [None]:
### Feature Importance ###
# funcFeatureImportance is defined elsewhere in the notebook; fncpCV=True is
# presumably what tells it the model is a fitted search object — TODO confirm.
dfImportantFeature = funcFeatureImportance(loaded_model, X_train, fncpCV=True)

###### Cross Validation Score

In [None]:
# Cross-validate a random forest built with the search's best hyper-parameters
# on the original training split (fncpKFold=10, weighted score averaging).
recallScores, precisionScores, f1Scores = funcCustomCVScore(
    fncp_X_train=X_train,
    fncp_y_train=y_train,
    fncpKFold=10,
    fncpBaseModel=RandomForestClassifier,
    fncpBaseModelParam=loaded_model.best_params_,
    fncpRandomState=123,
    fncpScoreAverage='weighted',
)

###### Model Prediction

In [None]:
# Test-set predictions from the best estimator stored inside the loaded search object.
predictedValues = loaded_model.best_estimator_.predict(X_test)
print('Sucessfully predicted the values')

###### Test Model Evaluation


In [None]:
# Test Dataset
# Score the tuned forest's test-set predictions (multi-class, weighted averaging).

fncpModelEvaluvate(
    y_test,
    predictedValues,
    fncpBoolHeatMap=False,
    fncpMultiClass=True,
    fncpAverageType='weighted',
)

## Balanced Dataset

In [None]:
# Rebalance the training split: SMOTE over-samples classes 0 and 1 to 12,000 rows
# each, then class 2 is randomly under-sampled to 20,000 rows. Every random step is
# seeded so the resampled data set is reproducible on a fresh kernel run.
print('Before under and over SMOTE')
counter = Counter(y_train['status'].array)
print(counter)

# define pipeline
dictOver = {0: 12000, 1:12000}
over = SMOTE(sampling_strategy=dictOver, random_state=42)
dictUnder = {2: 20000}
under = RandomUnderSampler(sampling_strategy=dictUnder, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X_new, y_new = pipeline.fit_resample(X_train, y_train)
X_new, y_new = shuffle(X_new, y_new, random_state=42) # Shuffles X and y together (rows stay aligned)

print('\n')
print('After under and over SMOTE')
# np.ravel flattens y_new to its label values whether fit_resample returned a
# NumPy array or a one-column DataFrame; iterating a DataFrame directly would
# count its column names instead of the class labels.
counter = Counter(np.ravel(y_new))
print(counter)

# Ensure both outputs are DataFrames carrying the training column names
# (needed when the resampler returned plain arrays).
X_new = pd.DataFrame(data=X_new, columns=X_train.columns.to_list())
y_new = pd.DataFrame(data=y_new, columns=y_train.columns.to_list())
print('\nSMOTE dataframe sucessfully created')

### Dummy Classifier

In [None]:
# Fresh most-frequent baseline for the resampled data (replaces the earlier `dummy_clf`).
dummy_clf = DummyClassifier(strategy="most_frequent")

#### Model Training

In [None]:
# Fit the baseline on the SMOTE / under-sampled training data.
dummy_clf.fit(X_new, y_new)

#### Model Prediction

In [None]:
# Predictions are made on the untouched test split (only the training data was resampled).
predictedValues = dummy_clf.predict(X_test)
print('Sucessfully predicted the values')

#### Cross Validation Score

In [None]:
# Cross-validate the most-frequent baseline on the resampled training data
# (fncpKFold=10, weighted score averaging).
recallScores, precisionScores, f1Scores = funcCustomCVScore(
    fncp_X_train=X_new,
    fncp_y_train=y_new,
    fncpKFold=10,
    fncpBaseModel=DummyClassifier,
    fncpBaseModelParam={'strategy': 'most_frequent'},
    fncpRandomState=123,
    fncpScoreAverage='weighted',
)

#### Test Model Evaluation

In [None]:
# Score the resample-trained baseline on the test split (multi-class, weighted averaging).
fncpModelEvaluvate(
    fncpActual=y_test,
    fncpPredicted=predictedValues,
    fncpBoolHeatMap=False,
    fncpMultiClass=True,
    fncpAverageType='weighted',
)

### Vanilla Randomforest Classifier

In [None]:
# Default-parameter random forest for the resampled data (replaces the earlier
# `classiRandomForest`). The seed is fixed so the fitted forest and its test
# metrics are reproducible on a fresh kernel run.
classiRandomForest = RandomForestClassifier(random_state=42)

#### Model Training

In [None]:
# Fit the default random forest on the SMOTE / under-sampled training data.
classiRandomForest.fit(X_new, y_new)

#### Model Prediction 

In [None]:
# Predictions are made on the untouched test split (only the training data was resampled).
predictedValues = classiRandomForest.predict(X_test)
print('Sucessfully predicted the values')

#### Cross Validation Score

In [None]:
# Cross-validate a default-parameter random forest on the resampled training data
# (fncpKFold=10, weighted score averaging; no extra model parameters are passed).
recallScores, precisionScores, f1Scores = funcCustomCVScore(
    fncp_X_train=X_new,
    fncp_y_train=y_new,
    fncpKFold=10,
    fncpBaseModel=RandomForestClassifier,
    fncpBaseModelParam=None,
    fncpRandomState=123,
    fncpScoreAverage='weighted',
)

#### Test Model Evaluation

In [None]:
# Score the resample-trained random forest on the test split (multi-class, weighted averaging).
fncpModelEvaluvate(
    fncpActual=y_test,
    fncpPredicted=predictedValues,
    fncpBoolHeatMap=False,
    fncpMultiClass=True,
    fncpAverageType='weighted',
)

### Random forest with optimum parameters

#### Grid Search Parameters

In [None]:
# The balanced-data search reuses the `random_grid` built in the imbalanced section.
print(f'The hyperparameters are \n{random_grid}')

#### Random Search CV Training

In [None]:
# Randomised hyper-parameter search over `random_grid` (100 sampled candidates,
# 3-fold CV, all cores) on the resampled training data, timed end-to-end and
# pickled under a timestamped file name below PATH/Model.
start_time = time.time()
# A fresh default estimator is used instead of borrowing `classiRandomForest`
# from an earlier cell. The search clones its estimator anyway, so this keeps
# the same default-parameter starting point while making the cell independent
# of earlier cell state, and it matches the imbalanced-data search cell.
model = RandomForestClassifier()
current_time = datetime.datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%d%m%Y_%H%M%S")
modelFileName = PATH+'/Model/cv_SMOTE_model_'+ current_time +'.sav'
# Make sure the target folder exists *before* the long search starts, so the
# result cannot be lost to a missing directory at save time.
os.makedirs(os.path.dirname(modelFileName), exist_ok=True)
print(f'Model training start time : {current_time}\n')
rf_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=100, cv=3, verbose=3, random_state=42, n_jobs=-1)
rf_random.fit(X_new, y_new)

# Saving Model #
# Context managers flush and close the file handles even if (un)pickling raises.
with open(modelFileName, 'wb') as modelFile:
    pickle.dump(rf_random, modelFile)
# Read the file straight back so `smote_loaded_model` is the persisted copy.
with open(modelFileName, 'rb') as modelFile:
    smote_loaded_model = pickle.load(modelFile)
print(f'Minutes taken to complete training : {(time.time() - start_time)/60}')

In [None]:
### Loading Model ###
# NOTE(review): this path is hard-coded and sits under a different Drive folder
# than PATH; loading it replaces any `smote_loaded_model` produced by the
# training cell, so confirm it is the search result you intend to analyse.
fileName = r'/content/gdrive/My Drive/AIML/DIY_Programs/MLPipeLine/Model/cv_SMOTE_model_04092020_140033.sav'
# pickle.load can execute arbitrary code: only load files you created yourself.
# The context manager closes the file handle once the model is deserialised.
with open(fileName, 'rb') as modelFile:
    smote_loaded_model = pickle.load(modelFile)
print('Sucessfully loaded the model!!!')

In [None]:
# Report every tuned hyper-parameter selected by the balanced-data search, one per line.
for paramName, paramValue in smote_loaded_model.best_params_.items():
    print(f'The best "{paramName}" hyperparameter is: {paramValue} ')

In [None]:
### Feature Importance ###
# NOTE(review): X_train (not the resampled X_new) is passed here; presumably the
# helper only needs the feature column names, which are the same — confirm
# against funcFeatureImportance.
dfSMOTEImportantFeature = funcFeatureImportance(smote_loaded_model, X_train, fncpCV=True)

#### Cross Validation Score

In [None]:
# Cross-validate a random forest built with the balanced-data search's best
# hyper-parameters on the resampled training data (fncpKFold=10, weighted averaging).
recallScores, precisionScores, f1Scores = funcCustomCVScore(
    fncp_X_train=X_new,
    fncp_y_train=y_new,
    fncpKFold=10,
    fncpBaseModel=RandomForestClassifier,
    fncpBaseModelParam=smote_loaded_model.best_params_,
    fncpRandomState=123,
    fncpScoreAverage='weighted',
)

#### Model Prediction

In [None]:
# Test-set predictions from the best estimator of the balanced-data search,
# kept under its own name rather than overwriting `predictedValues`.
smotePredictedValues = smote_loaded_model.best_estimator_.predict(X_test)
print('Sucessfully predicted the values')

#### Test Model Evaluation

In [None]:
# Score the tuned, resample-trained forest on the test split (multi-class, weighted averaging).
fncpModelEvaluvate(
    y_test,
    smotePredictedValues,
    fncpBoolHeatMap=False,
    fncpMultiClass=True,
    fncpAverageType='weighted',
)

# Model Deployment

In [None]:
# One hand-written sample (a single row of 87 numeric values) used to demo the
# deployed model. NOTE(review): the values presumably follow the column order of
# X_train — confirm before changing any feature engineering above.
UserInput = [[1.0e+00, 0.0e+00, 7.0e+06, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        1.4e+07, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        1.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00,
        0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 1.0e+00,
        2.4e+01, 1.7e+01, 0.0e+00]]

In [None]:
# Predict the encoded class for the single user-supplied row with the tuned,
# resample-trained forest.
predictedValue = smote_loaded_model.best_estimator_.predict(UserInput)
# Map the integer code back to the original 'status' label using the encoder
# fitted in the "Target Variable" section.
prdictedClass = label_encoder.inverse_transform(predictedValue)
# .item() unwraps the one-element result into a plain scalar for display.
print(f'The predicted value is "{prdictedClass.item()}"')