# Baseline Model Pipeline

**Can we improve on the baseline scores with added features, imputation completed, and MinMax scaling?**

**`p1_tag` ~  `rank` + `employee_count` (ordinal) +  `total_funding_usd` + `age` + `continent` (nominal) + `industry` (nominal)**

In [1]:
# Add 'graph' environment to PATH
import sys
sys.path.append('/home/ski/anaconda3/envs/graph/lib/python3.8/site-packages')

# User defined functions
import base_methods
from importlib import reload
from base_methods import load_the_csvs

# Import data analysis packages
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# ML
import category_encoders as ce
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model  import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

## Loading data

In [2]:
# Store path to notebook
PWD = !pwd
PWD = PWD[0]

# Set global paths to data folders
!mkdir {PWD}/files/output/
print()
INPUT = PWD + '/files/csv/'
OUTPUT = PWD + '/files/output/'

# Load
df = load_the_csvs(loc=OUTPUT, data=['baseline_impute_complete'], verbose=True)

# Remove columns not used in model
df_simple = df.drop(['country_code','continent_code','category_groups_list','uuid'], axis=1)

print('\nEnding Dataframe Columns:\n\n{}'.format(df_simple.columns.to_list()))
print('\nDataframe shape:', df_simple.shape)

mkdir: cannot create directory ‘/home/ski/Desktop/crunchbase-p1-machine-learning/files/output/’: File exists

/HOME/SKI/DESKTOP/CRUNCHBASE-P1-MACHINE-LEARNING/FILES/OUTPUT/BASELINE_IMPUTE_COMPLETE.CSV
BASELINE_IMPUTE_COMPLETE shape: (1131325, 61)
BASELINE_IMPUTE_COMPLETE columns: ['uuid', 'p1_tag', 'country_code', 'category_groups_list', 'continent_code', 'employee_count', 'total_funding_usd', 'rank', 'age', 'ind_1', 'ind_2', 'ind_3', 'ind_4', 'ind_5', 'ind_6', 'ind_7', 'ind_8', 'ind_9', 'ind_10', 'ind_11', 'ind_12', 'ind_13', 'ind_14', 'ind_15', 'ind_16', 'ind_17', 'ind_18', 'ind_19', 'ind_20', 'ind_21', 'ind_22', 'ind_23', 'ind_24', 'ind_25', 'ind_26', 'ind_27', 'ind_28', 'ind_29', 'ind_30', 'ind_31', 'ind_32', 'ind_33', 'ind_34', 'ind_35', 'ind_36', 'ind_37', 'ind_38', 'ind_39', 'ind_40', 'ind_41', 'ind_42', 'ind_43', 'ind_44', 'ind_45', 'ind_46', 'cont_AF', 'cont_AS', 'cont_EU', 'cont_NA', 'cont_OC', 'cont_SA']


Ending Dataframe Columns:

['p1_tag', 'employee_count', 'total_fundin

In [3]:
# Select equal sample of non-Pledge 1% organizations
df_p1 = df_simple[df_simple['p1_tag']==1]
df_notp1 = df_simple[df_simple['p1_tag']==0].sample(n=df_p1.shape[0], replace=False)
df_model = pd.concat([df_p1, df_notp1]).reset_index(drop=True)

# Create variable for each feature type: categorical and numerical
numeric_features = df_model.select_dtypes(include=['int8', 'int16', 'int32', 'int64', 'float16', 'float32','float64']).drop(['p1_tag'], axis=1).columns
categorical_features = df_model.select_dtypes(include=['object']).columns
print('Numeric features:', numeric_features.to_list())
print('\nCategorical features:', categorical_features.to_list())

X = df_model.drop('p1_tag', axis=1)
y = df_model['p1_tag']
y = preprocessing.LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print('Training data shape:', X_train.shape)
print('Train label shape:', y_train.shape)
print('Test data shape:',  X_test.shape)
print('Test label shape:', y_test.shape)

Numeric features: ['employee_count', 'total_funding_usd', 'rank', 'age', 'ind_1', 'ind_2', 'ind_3', 'ind_4', 'ind_5', 'ind_6', 'ind_7', 'ind_8', 'ind_9', 'ind_10', 'ind_11', 'ind_12', 'ind_13', 'ind_14', 'ind_15', 'ind_16', 'ind_17', 'ind_18', 'ind_19', 'ind_20', 'ind_21', 'ind_22', 'ind_23', 'ind_24', 'ind_25', 'ind_26', 'ind_27', 'ind_28', 'ind_29', 'ind_30', 'ind_31', 'ind_32', 'ind_33', 'ind_34', 'ind_35', 'ind_36', 'ind_37', 'ind_38', 'ind_39', 'ind_40', 'ind_41', 'ind_42', 'ind_43', 'ind_44', 'ind_45', 'ind_46', 'cont_AF', 'cont_AS', 'cont_EU', 'cont_NA', 'cont_OC', 'cont_SA']

Categorical features: []
Training data shape: (10964, 56)
Train label shape: (10964,)
Test data shape: (4700, 56)
Test label shape: (4700,)


#### Run through pipeline to determine best categorical feature encoder

From: <a href='https://towardsdatascience.com/an-easier-way-to-encode-categorical-features-d840ff6b3900'>An Easier Way to Encode Categorical Features</a>

In [4]:
results = {}
classifier_list = []
LRR = LogisticRegression()
KNN = KNeighborsClassifier()
BNB = BernoulliNB()
classifier_list.append(('LogisticRegression', LRR, {'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000]}))
classifier_list.append(('KNeighborsClassifier', KNN, {'classifier__n_neighbors': np.arange(1,29,2)}))
classifier_list.append(('BernoulliNB', BNB, {'classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]}))

encoder_list = [ce.backward_difference.BackwardDifferenceEncoder]#, 
#                 ce.basen.BaseNEncoder,
#                 ce.binary.BinaryEncoder,
#                 ce.cat_boost.CatBoostEncoder,
#                 ce.hashing.HashingEncoder,
#                 ce.helmert.HelmertEncoder,
#                 ce.james_stein.JamesSteinEncoder,
#                 ce.one_hot.OneHotEncoder,
#                 ce.leave_one_out.LeaveOneOutEncoder,
#                 ce.m_estimate.MEstimateEncoder,
#                 ce.ordinal.OrdinalEncoder,
#                 ce.polynomial.PolynomialEncoder,
#                 ce.sum_coding.SumEncoder,
#                 ce.target_encoder.TargetEncoder,
#                 ce.woe.WOEEncoder]

for label, classifier, params in classifier_list:
    results[label] = {}
    print('{}'.format(label))

    numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
#     categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median', fill_value='missing')),
#                                               ('woe', encoder())])

    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])
                                                   #('cat', categorical_transformer, categorical_features)])
    
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])
    
    if params != {}:
        try:
            search = GridSearchCV(pipe, params, n_jobs=-1)
            search.fit(X_train, y_train)
            print('Best parameter (CV score={:.3f}): {}'.format(search.best_score_, search.best_params_))
            model = search.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = f1_score(y_test, y_pred)
            print('Best score: {:.4f}\n'.format(score))
            results[label]['score'] = score
            results[label]['best_params'] = search.best_params_
        except:
            print('Something went wrong w/ GridSearch or pipeline fitting.')
    else:
        try:
            model = pipe.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = f1_score(y_test, y_pred)
            print('Score: {:.4f}\n'.format(score))
            results[label]['score'] = score
        except:
            print('Something went wrong with pipeline fitting')

LogisticRegression
Best parameter (CV score=0.729): {'classifier__C': 1000}
Best score: 0.7376

KNeighborsClassifier
Best parameter (CV score=0.702): {'classifier__n_neighbors': 21}
Best score: 0.6988

BernoulliNB
Best parameter (CV score=0.691): {'classifier__alpha': 10.0}
Best score: 0.6892



Our imputed, scaled, and modified model makes minimal improvement over the baseline, but considering we were ignoring missing values previously, this is still a good sign that we are headed in the right direction.

Baseline scores:

    Averaged Logistic Regression f1 Score: 0.7263
    Averaged K-Nearest Neighbour f1 score: 0.6997
    Averaged Naive Bayes f1 score: 0.6854