# Visit Count Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import math
import pickle
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
df_raw = pd.read_csv("df_raw.csv")
df_monthly = pd.read_csv("df_monthly.csv")

# Profit and Unit Forecast per Procedure

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from xgboost import XGBRegressor
import xgboost as xgb
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.utils import timeseries_dataset_from_array
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.utils.vis_utils import plot_model
import sklearn as skl

In [None]:
df_for_ts_units = df_monthly.groupby(['Month_since_Jan2020', 'Month_in_year', 'Procedure'])['Units'].sum().unstack().fillna(0)

In [None]:
df_for_ts_units = df_for_ts_units.reset_index().set_index('Month_since_Jan2020')

In [None]:
df_for_ts_units

In [None]:
ts_train = timeseries_dataset_from_array(df_for_ts_units, df_for_ts_units.drop(columns = ['Month_in_year']), sequence_length=12, batch_size=1,
                                            start_index=0, end_index=24)

In [None]:
model=Sequential()
#model.add(Conv1D(filters=70, kernel_size=1, padding='same', activation='tanh',input_shape=(1, 70)))
#model.add(MaxPooling1D(pool_size=1, padding='same'))
model.add(LSTM(70, activation='tanh', input_shape=(12,70)))
model.add(Dense(69))
model.compile(optimizer='adam', loss='mse')
model.fit(ts_train, epochs=25,verbose=1)
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
test = timeseries_dataset_from_array(df_for_ts_units, None, sequence_length = 12, 
                                          start_index = 12, end_index = None)
df_results = pd.DataFrame(model.predict(test))

In [None]:
df_results.columns = df_for_ts_units.drop(columns = ['Month_in_year']).columns

In [None]:
df_results.set_index(df_for_ts_units['Month_in_year'].loc[23:])
df_results = df_results.apply(abs).apply(round)

In [None]:
df_results

# Patient Profit and Visit Prediction

### Preprocessing

In [4]:
df_raw = df_raw.drop(columns = ['Unnamed: 0'])

In [5]:
df_patient = df_raw.groupby(['Patient_Id', 'Insurance', 'DOB', 'Ethnicity', 'Sex', 'Zip'], as_index=False)['Profit', 'Units'].sum()

  df_patient = df_raw.groupby(['Patient_Id', 'Insurance', 'DOB', 'Ethnicity', 'Sex', 'Zip'], as_index=False)['Profit', 'Units'].sum()


In [6]:
# set Date of Birth of patient equal to number of months after Sept 1st 2002 (results in earliest DoB equal to 1)
df_patient['DOB'] = ((pd.to_datetime(df_patient['DOB'], format='%m/%d/%Y') - pd.to_datetime('9/1/2002'))/np.timedelta64(1, 'M')).round().astype(int)

In [None]:
df_patient

In [7]:
# drop duplicate Patient_Id, as this represents patients who have multiple insurances over the selected time period
# switching insurances is too complex
# df_patient.drop_duplicates(subset=['Patient_Id'], keep=False)
check = dict()
for patient in df_patient['Patient_Id']:
    check[patient] = check.get(patient, 0) + 1
df_patient['Switched_Ins'] = df_patient['Patient_Id'].map(check)
df_patient['Switched_Ins'] = df_patient['Switched_Ins'].map({1:False, 2:True, 3:True, 4:True, 5:True})
df_patient

Unnamed: 0,Patient_Id,Insurance,DOB,Ethnicity,Sex,Zip,Profit,Units,Switched_Ins
0,50041,Molina Healthcare Medicaid,4,Not Hispanic or Latino,Male,29720.0,425.68,9,False
1,50085,Molina Healthcare Medicaid,5,Not Hispanic or Latino,Female,29706.0,174.64,4,False
2,50212,BCBS - PPO Plans,5,Not Hispanic or Latino,Female,29732.0,425.91,6,False
3,50231,Molina Healthcare Medicaid,6,Not Hispanic or Latino,Male,29730.0,303.33,3,False
4,50276,United Healthcare,7,Not Hispanic or Latino,Male,29714.0,162.00,3,False
...,...,...,...,...,...,...,...,...,...
3079,73776,Cigna,233,Unknown,Female,29726.0,416.00,3,False
3080,73786,Healthy Blue Medicaid,235,Unknown,Female,29730.0,336.98,3,False
3081,73788,BCBS SC SHP,235,Unknown,Male,29732.0,161.00,1,False
3082,73791,BCBS - PPO Plans,235,Unknown,Male,29732.0,296.00,2,False


In [None]:
first_ins = df_patient.loc[df_patient['Switched_Ins'] == True].drop_duplicates(subset=['Patient_Id'], keep='first')
second_ins = df_patient.loc[df_patient['Switched_Ins']==True].drop_duplicates(subset=['Patient_Id'], keep='last')

first_ins.describe()

In [None]:
second_ins.describe()

In [None]:
df_patient['Switched_Ins'] = df_patient['Switched_Ins'].map({False:0, True:1})

In [8]:
df_pt_final_total = df_patient.drop_duplicates(subset=['Patient_Id'], keep=False)
df_pt_final_total

Unnamed: 0,Patient_Id,Insurance,DOB,Ethnicity,Sex,Zip,Profit,Units,Switched_Ins
0,50041,Molina Healthcare Medicaid,4,Not Hispanic or Latino,Male,29720.0,425.68,9,False
1,50085,Molina Healthcare Medicaid,5,Not Hispanic or Latino,Female,29706.0,174.64,4,False
2,50212,BCBS - PPO Plans,5,Not Hispanic or Latino,Female,29732.0,425.91,6,False
3,50231,Molina Healthcare Medicaid,6,Not Hispanic or Latino,Male,29730.0,303.33,3,False
4,50276,United Healthcare,7,Not Hispanic or Latino,Male,29714.0,162.00,3,False
...,...,...,...,...,...,...,...,...,...
3079,73776,Cigna,233,Unknown,Female,29726.0,416.00,3,False
3080,73786,Healthy Blue Medicaid,235,Unknown,Female,29730.0,336.98,3,False
3081,73788,BCBS SC SHP,235,Unknown,Male,29732.0,161.00,1,False
3082,73791,BCBS - PPO Plans,235,Unknown,Male,29732.0,296.00,2,False


In [9]:
# reserve small portion of df_pt_final only to be used as final 'unseen' test set, separate from validation 
df_pt_final, df_test = train_test_split(df_pt_final_total, test_size=0.08) 

## Unit Prediction by Patient Demographics

In [None]:
reg_units = XGBRegressor()
train = df_pt_final.drop(columns = ['Patient_Id', 'Profit', 'Switched_Ins'])

#### Mean Encoding Insurance Labels, using Cross-Val to limit data leakage during encoding

In [1]:
y_train = train.Units.values
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)
train_new = train
train_new['Insurance_mean_enc'] = np.zeros(shape=(len(train_new), 1))
train_new['Zip_mean_enc'] = np.zeros(shape=(len(train_new), 1))
for i, j in kf.split(y_train):
    x_train, x_test = train.iloc[i], train.iloc[j]
    means = x_test['Insurance'].map(x_train.groupby('Insurance').Units.mean())
    means2 = x_test['Zip'].map(x_train.groupby('Zip').Units.mean())
    x_test['Insurance_mean_enc'] = means
    x_test['Zip_mean_enc'] = means2
    train_new['Insurance_mean_enc'].iloc[j] = x_test['Insurance_mean_enc']
    train_new['Zip_mean_enc'].iloc[j] = x_test['Zip_mean_enc']
prior = train['Units'].mean()
train_new.fillna(prior, inplace=True)
train_new

NameError: name 'train' is not defined

#### Split finalized train data

In [None]:
x = train_new.drop(columns = ['Insurance', 'Units', 'Zip'])
y = train_new['Units']
dtrain = xgb.DMatrix(x, y)

In [None]:
x_trn, x_tsn, y_trn, y_tsn = train_test_split(x, y, test_size = 0.2, random_state = 42)

#### Perform Random Search over certain hyperparameters to aid model optimization

In [None]:
params = {
    'learning_rate'    : [0.01, 0.05, 0.1, 0.15],
    'max_depth'        : [7, 10, 20, 35, 50, 75, 100],
    'min_child_weight' : [1, 3, 5, 10, 25],
    'gamma'            : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'colsample_bytree': [0.2, 0.3, 0.5, 0.7]
    
}

In [None]:
rand_search = RandomizedSearchCV(reg_units, param_distributions=params, n_iter=5, scoring='neg_mean_squared_error', n_jobs=-1, cv=5, verbose=3)

In [None]:
rand_search.fit(x, y)

#### Evaluate results of Random Search

In [None]:
print(rand_search.best_estimator_)

In [None]:
rand_search.cv_results_

In [None]:
#opt_params = rand_search.best_params_
opt_params = {'colsample_bytree': 0.5,
 'gamma': 10,
 'max_depth': 20,
 'min_child_weight': 15,
 'n_estimators': 90}

#### Train model using optimal parameters

In [None]:
model_cv = xgb.cv(opt_params, dtrain, nfold=5, metrics='rmse', as_pandas=True, early_stopping_rounds=3, shuffle=True)

In [None]:
model_cv

In [None]:
trn = xgb.DMatrix(x_trn, label=y_trn)
tsn = xgb.DMatrix(x_tsn, label=y_tsn)

In [None]:
trainer = xgb.train(opt_params, dtrain, evals=[(trn, 'trn'), (tsn, 'tsn')], early_stopping_rounds=3)

In [None]:
with open('unit_model.pickle', 'wb') as file:
    pickle.dump(trainer, file)

In [None]:
with open('unit_model.pickle', 'rb') as file:
    trainer = pickle.load(file)

In [None]:
preds = trainer.predict(tsn)

#### Evaluate Model Performance

In [None]:
r2 = r2_score(preds, y_tsn)

print(r2)

In [None]:
print(math.sqrt(mean_squared_error(preds, y_tsn)))

In [None]:
xgb.plot_importance(trainer)

## Profit Prediction by Patient Demographics

In [10]:
reg_pro = XGBRegressor()
trainp = df_pt_final.drop(columns = ['Patient_Id', 'Units', 'Switched_Ins'])

#### Mean Encoding Insurance Labels, using Cross-Val to limit data leakage during encoding

In [11]:
y_trainp = trainp.Profit.values
kfp = KFold(n_splits = 5, shuffle = True, random_state = 123)
train_newp = trainp
train_newp['Insurance_mean_enc'] = np.zeros(shape=(len(train_newp), 1))
train_newp['Zip_mean_enc'] = np.zeros(shape=(len(train_newp), 1))
#train_newp['Eth_mean_enc'] = np.zeros(shape=(len(train_newp), 1))
#train_newp['Sex_mean_enc'] = np.zeros(shape=(len(train_newp), 1))
for i, j in kfp.split(y_trainp):
    x_trainp, x_testp = trainp.iloc[i], trainp.iloc[j]
    means = x_testp['Insurance'].map(x_trainp.groupby('Insurance').Profit.mean())
    means2 = x_testp['Zip'].map(x_trainp.groupby('Zip').Profit.mean())
    #means3 = x_testp['Ethnicity'].map(x_trainp.groupby('Ethnicity').Profit.mean())
    #means4 = x_testp['Sex'].map(x_trainp.groupby('Sex').Profit.mean())
    x_testp['Insurance_mean_enc'] = means
    x_testp['Zip_mean_enc'] = means2
    #x_testp['Eth_mean_enc'] = means3
    #x_testp['Sex_mean_enc'] = means4
    train_newp['Insurance_mean_enc'].iloc[j] = x_testp['Insurance_mean_enc']
    train_newp['Zip_mean_enc'].iloc[j] = x_testp['Zip_mean_enc']
    #train_newp['Eth_mean_enc'].iloc[j] = x_testp['Eth_mean_enc']
    #train_newp['Sex_mean_enc'].iloc[j] = x_testp['Sex_mean_enc']
    
prior = trainp['Profit'].mean()
train_newp.fillna(prior, inplace=True)
train_newp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_testp['Insurance_mean_enc'] = means
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_testp['Zip_mean_enc'] = means2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_newp['Insurance_mean_enc'].iloc[j] = x_testp['Insurance_mean_enc']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://panda

Unnamed: 0,Insurance,DOB,Ethnicity,Sex,Zip,Profit,Insurance_mean_enc,Zip_mean_enc
2687,Healthy Blue Medicaid,218,Not Hispanic or Latino,Male,29745.0,1373.03,823.826383,792.180194
1006,BCBS SC SHP,96,Not Hispanic or Latino,Male,29732.0,936.21,880.201086,731.647742
574,Medicaid of SC,66,Not Hispanic or Latino,Male,29715.0,246.02,344.353333,771.344118
1196,Molina Healthcare Medicaid,110,Not Hispanic or Latino,Male,29745.0,565.12,541.799421,776.645043
91,BCBS - PPO Plans,19,Not Hispanic or Latino,Female,29732.0,311.00,780.282827,759.001160
...,...,...,...,...,...,...,...,...
691,Absolute Total Care Medicaid,77,Not Hispanic or Latino,Male,29730.0,147.97,554.345217,651.736043
1533,BCBS - PPO Plans,136,Not Hispanic or Latino,Male,29732.0,490.22,824.487493,731.647742
3051,BCBS Federal Employee Program,233,Not Hispanic or Latino,Male,29732.0,306.00,728.962000,723.058076
2911,BCBS SC SHP,227,Not Hispanic or Latino,Female,29706.0,2357.26,880.201086,518.147447


In [12]:
le = LabelEncoder()
train_newp['Sex'] = le.fit_transform(train_newp['Sex'])
train_newp['Ethnicity'] = le.fit_transform(train_newp['Ethnicity'])

#### Split finalized train data

In [13]:
xp = train_newp.drop(columns = ['Insurance', 'Profit', 'Zip'])
yp = train_newp['Profit']
dtrainp = xgb.DMatrix(xp, yp)

#### Perform Grid Search over certain hyperparameters to aid model optimization

In [14]:
params = {
    'max_depth'        : range(15, 30, 5),
    'min_child_weight' : range(15, 30, 3),
    'gamma'            : [0.0, 0.5, 1, 10],
    'colsample_bytree' : [0.3, 0.5, 0.7],
    'n_estimators'     : range(50, 130, 20)
    
}

In [15]:
gridp = GridSearchCV(reg_pro, param_grid=params, n_jobs=-1, scoring='neg_mean_squared_error', cv=5, verbose=3)

In [16]:
gridp.fit(xp, yp)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, m...
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=100,
                                    n_jobs=None, num_parallel_tree=None,
                                    predictor=None, random_state=N

#### Evaluate results of Random Search

In [17]:
print(gridp.best_estimator_)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.7, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=10, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=20, max_leaves=None,
             min_child_weight=27, missing=nan, monotone_constraints=None,
             n_estimators=50, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)


In [18]:
pd.DataFrame(gridp.cv_results_).sort_values(by=['rank_test_score'], ascending = True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_max_depth,param_min_child_weight,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
696,0.240218,0.010605,0.005149,0.003842,0.7,10,20,27,50,"{'colsample_bytree': 0.7, 'gamma': 10, 'max_de...",-185978.727237,-154419.782204,-144684.044422,-181113.701362,-159023.573091,-165043.965663,15875.424035,1
576,0.245640,0.010978,0.007437,0.001107,0.7,0.5,20,27,50,"{'colsample_bytree': 0.7, 'gamma': 0.5, 'max_d...",-185978.727237,-154419.782204,-144684.044422,-181113.701362,-159024.244291,-165044.099903,15875.373129,2
516,0.270052,0.008336,0.004740,0.003319,0.7,0.0,20,27,50,"{'colsample_bytree': 0.7, 'gamma': 0.0, 'max_d...",-185978.727237,-154419.782204,-144684.044422,-181113.701362,-159024.244291,-165044.099903,15875.373129,2
636,0.240250,0.009413,0.009042,0.000608,0.7,1,20,27,50,"{'colsample_bytree': 0.7, 'gamma': 1, 'max_dep...",-185978.727237,-154419.782204,-144684.044422,-181113.701362,-159024.244291,-165044.099903,15875.373129,2
676,0.221884,0.007940,0.006331,0.003199,0.7,10,15,27,50,"{'colsample_bytree': 0.7, 'gamma': 10, 'max_de...",-189720.901011,-150664.509738,-147289.414565,-185313.167855,-156965.420660,-165990.682766,17902.882763,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.238469,0.007039,0.007883,0.000663,0.3,10,20,15,110,"{'colsample_bytree': 0.3, 'gamma': 10, 'max_de...",-223913.929911,-210458.554908,-171948.498631,-203420.999607,-195953.627382,-201139.122088,17261.126949,716
103,0.181315,0.009364,0.006383,0.000489,0.3,0.5,25,15,110,"{'colsample_bytree': 0.3, 'gamma': 0.5, 'max_d...",-224192.619795,-210682.373307,-172569.217701,-203499.152317,-196086.206489,-201405.913922,17143.643990,717
163,0.243945,0.007944,0.009974,0.003512,0.3,1,25,15,110,"{'colsample_bytree': 0.3, 'gamma': 1, 'max_dep...",-224192.619795,-210682.373307,-172569.217701,-203499.152317,-196086.206489,-201405.913922,17143.643990,717
43,0.173336,0.005691,0.005785,0.000399,0.3,0.0,25,15,110,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'max_d...",-224192.619795,-210682.718209,-172569.217701,-203499.152317,-196086.206489,-201405.982902,17143.681316,719


In [19]:
opt_paramsp = gridp.best_params_
opt_paramsp

{'colsample_bytree': 0.7,
 'gamma': 10,
 'max_depth': 20,
 'min_child_weight': 27,
 'n_estimators': 50}

#### Train model using optimal parameters

In [20]:
model_cvp = xgb.cv(opt_paramsp, dtrainp, nfold=5, num_boost_round = 90, metrics='rmse', as_pandas=True, early_stopping_rounds=2, shuffle=True)

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.



In [21]:
model_cvp

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,750.613515,3.995605,756.35405,25.049534
1,611.513859,20.32726,624.022838,30.302176
2,533.319947,13.968037,550.481248,18.090321
3,489.188552,19.536933,510.267842,28.334035
4,441.107492,11.534585,469.22777,19.829365
5,423.079587,14.116994,458.550883,21.426751
6,400.354197,7.061645,440.481623,14.377627
7,389.875292,7.608931,433.09618,10.112787
8,380.71615,5.798573,426.634023,10.047662
9,371.882045,2.928229,423.337938,10.240984


In [22]:
trnp = xgb.DMatrix(xp, label=yp)

In [23]:
testp = df_test.drop(columns = ['Patient_Id', 'Units', 'Switched_Ins'])
y_testp = testp.Profit.values
kfpt = KFold(n_splits = 5, shuffle = True, random_state = 123)
test_newp = testp
test_newp['Insurance_mean_enc'] = np.zeros(shape=(len(test_newp), 1))
test_newp['Zip_mean_enc'] = np.zeros(shape=(len(test_newp), 1))
#test_newp['Eth_mean_enc'] = np.zeros(shape=(len(test_newp), 1))
#test_newp['Sex_mean_enc'] = np.zeros(shape=(len(test_newp), 1))
for i, j in kfpt.split(y_testp):
    x_trainp, x_testp = testp.iloc[i], testp.iloc[j]
    means = x_testp['Insurance'].map(x_trainp.groupby('Insurance').Profit.mean())
    means2 = x_testp['Zip'].map(x_trainp.groupby('Zip').Profit.mean())
    #means3 = x_testp['Ethnicity'].map(x_trainp.groupby('Ethnicity').Profit.mean())
    #means4 = x_testp['Sex'].map(x_trainp.groupby('Sex').Profit.mean())
    x_testp['Insurance_mean_enc'] = means
    x_testp['Zip_mean_enc'] = means2
    #x_testp['Eth_mean_enc'] = means3
    #x_testp['Sex_mean_enc'] = means4
    test_newp['Insurance_mean_enc'].iloc[j] = x_testp['Insurance_mean_enc']
    test_newp['Zip_mean_enc'].iloc[j] = x_testp['Zip_mean_enc']
    #test_newp['Eth_mean_enc'].iloc[j] = x_testp['Eth_mean_enc']
    #test_newp['Sex_mean_enc'].iloc[j] = x_testp['Sex_mean_enc']
    
prior = testp['Profit'].mean()
test_newp.fillna(prior, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_testp['Insurance_mean_enc'] = means
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_testp['Zip_mean_enc'] = means2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_newp['Insurance_mean_enc'].iloc[j] = x_testp['Insurance_mean_enc']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas

In [24]:
let = LabelEncoder()
test_newp['Sex'] = let.fit_transform(test_newp['Sex'])
test_newp['Ethnicity'] = let.fit_transform(test_newp['Ethnicity'])

In [25]:
test_newp

Unnamed: 0,Insurance,DOB,Ethnicity,Sex,Zip,Profit,Insurance_mean_enc,Zip_mean_enc
195,Molina Healthcare Medicaid,34,0,0,29745.0,140.20,564.898600,755.808750
1838,Molina Healthcare Medicaid,148,2,1,29732.0,546.77,532.059565,824.479608
2894,BCBS - PPO Plans,226,2,1,28625.0,2164.44,885.366061,681.916158
1718,Molina Healthcare Medicaid,151,2,1,29730.0,226.34,579.955577,720.656667
137,BCBS - PPO Plans,26,2,0,29704.0,1201.96,755.669474,361.656667
...,...,...,...,...,...,...,...,...
215,BCBS SC SHP,36,2,0,29732.0,468.41,1039.749444,801.253585
2457,BCBS SC SHP,206,2,1,29732.0,3654.51,702.408667,731.494118
2793,Absolute Total Care Medicaid,120,2,1,29732.0,226.87,701.387778,798.092727
2705,BCBS - PPO Plans,220,2,1,29710.0,1416.77,885.366061,543.899091


In [26]:
tsnp = xgb.DMatrix(test_newp.drop(columns=['Insurance', 'Zip', 'Profit']), label=test_newp['Profit'])

In [27]:
trainerp = xgb.train(opt_paramsp, dtrainp, evals=[(trnp, 'trnp'), (tsnp, 'tsnp')], early_stopping_rounds=3)

Parameters: { "n_estimators" } are not used.

[0]	trnp-rmse:749.38122	tsnp-rmse:751.23394
[1]	trnp-rmse:600.00059	tsnp-rmse:629.14480
[2]	trnp-rmse:505.53558	tsnp-rmse:553.75414
[3]	trnp-rmse:464.98221	tsnp-rmse:531.00532
[4]	trnp-rmse:422.40235	tsnp-rmse:513.93748
[5]	trnp-rmse:396.80901	tsnp-rmse:499.48163
[6]	trnp-rmse:382.07576	tsnp-rmse:489.60494
[7]	trnp-rmse:373.83310	tsnp-rmse:487.50758
[8]	trnp-rmse:362.16158	tsnp-rmse:483.71852
[9]	trnp-rmse:359.24262	tsnp-rmse:482.49519


In [28]:
with open('profit_model.pickle', 'wb') as file:
    pickle.dump(trainerp, file)

In [None]:
with open('profit_model.pickle', 'rb') as file:
    trainerp = pickle.load(file)

In [29]:
predsp = trainerp.predict(tsnp)

#### Evaluate Model Performance

In [32]:
r2 = r2_score(predsp, test_newp['Profit'])

print(r2)

-0.12931218807845735


In [35]:
print(math.sqrt(mean_squared_error(predsp, test_newp['Profit'])))

482.495185683436


In [None]:
xgb.plot_importance(trainerp)