In [1]:
#import dependencies
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
import psycopg2
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
#create engine and connect to AWS RDS
db_string = f"postgresql://postgres:{db_password}@capstone-db.cutxgn80t57o.us-west-1.rds.amazonaws.com"
engine = create_engine(db_string)
# read and check merged death table
df = pd.read_sql('deaths_merged_full', con = engine)
df

Unnamed: 0,FIPS,AREA_SQMI,E_TOTPOP,E_HU,E_HH,E_POV,E_UNEMP,E_PCI,E_NOHSDP,E_AGE65,...,Gender Equality,Hopefulness,Income Per Capita,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic
0,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,...,77.063492,91.163142,26168.0,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952
1,1009,644.830460,57645,24222,20600,8220,909,22656,7861,10233,...,64.585114,79.492703,21033.0,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000
2,1013,776.838201,20025,10026,6708,4640,567,20430,2141,3806,...,64.769089,83.523765,19011.0,78.563680,76.109761,76.623924,69.058104,79.956648,67.920284,72.773953
3,1015,605.867251,115098,53682,45033,20819,4628,24706,12620,19386,...,69.015332,83.365608,22231.0,79.439032,79.955121,77.918741,54.063568,76.745724,67.456150,68.292794
4,1017,596.560643,33826,16981,13516,5531,773,22827,4383,6409,...,69.433309,85.371517,21532.0,76.995358,78.156771,75.891100,67.343775,79.128558,66.397785,69.554441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053,48229,4570.523160,4098,1562,900,951,101,14190,1263,639,...,67.196038,55.568966,14776.0,76.720396,79.603081,73.986415,70.917126,79.605796,75.878105,71.008448
3054,48131,1793.476183,11355,5592,3511,2751,482,17864,2386,2025,...,66.171080,77.899678,19853.0,79.125428,78.895880,76.629575,60.576045,73.670302,64.571017,68.007770
3055,48505,998.411980,14369,6388,4405,5609,621,17228,3226,1999,...,67.037410,86.586509,16007.0,79.355639,79.572483,74.378252,77.443239,76.386871,74.001471,73.609838
3056,48507,1297.406535,12131,4344,3509,4150,421,13350,2719,1665,...,65.804541,88.785822,13393.0,78.392216,76.024682,75.848196,76.967659,77.303576,70.010162,71.121990


In [3]:
#set index to FIPS
df = df.set_index(df['FIPS'])
df= df.drop(columns = ['FIPS'])

In [4]:
# use regex to split svi columns into distinct categories
cols = df.columns.to_list()
col_series = pd.Series(cols)
pct_str = r'^[ERS]P+.'
pct_form = col_series.str.contains(pct_str)
pct_col = col_series[pct_form].to_list()
flag_str = r'^F+.'
flag_form = col_series.str.contains(flag_str)
flag_col = col_series[flag_form].to_list()
val_str = r'^E_+.'
val_form = col_series.str.contains(val_str)
val_col = col_series[val_form].to_list()
non_svi = col_series[~pct_form & ~flag_form & ~val_form].to_list()

In [5]:
# filter cases_df to only have svi estimated percentage columns
# deaths_df = df.drop(columns = flag_col)
deaths_df = df.drop(columns = val_col)
deaths_df.columns.to_list()
deaths_df = deaths_df.merge(df['E_TOTPOP'], how = 'left', left_index=True, right_index=True)


In [6]:
# drop counties with 0 cases 
## these seem to be errors- mostly in Utah, some counties with large populations
zeros = deaths_df.loc[deaths_df['first_yr_deaths']==0]
deaths_df = deaths_df.drop(index = zeros.index)
deaths_df

Unnamed: 0_level_0,AREA_SQMI,EP_POV,EP_UNEMP,EP_PCI,EP_NOHSDP,EP_AGE65,EP_AGE17,EP_DISABL,EP_SNGPNT,EP_MINRTY,...,Hopefulness,Income Per Capita,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,E_TOTPOP
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,594.443459,15.4,4.2,29372.0,11.3,14.6,24.2,19.3,7.5,25.0,...,91.163142,26168.0,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,55200
1009,644.830460,14.4,4.1,22656.0,19.8,17.8,23.4,14.2,7.0,12.9,...,79.492703,21033.0,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,57645
1013,776.838201,23.5,6.7,20430.0,15.4,19.0,22.8,17.7,10.5,48.1,...,83.523765,19011.0,78.563680,76.109761,76.623924,69.058104,79.956648,67.920284,72.773953,20025
1015,605.867251,18.6,8.8,24706.0,15.9,16.8,21.9,20.8,10.4,27.5,...,83.365608,22231.0,79.439032,79.955121,77.918741,54.063568,76.745724,67.456150,68.292794,115098
1017,596.560643,16.6,5.0,22827.0,18.6,18.9,20.7,16.7,9.7,44.2,...,85.371517,21532.0,76.995358,78.156771,75.891100,67.343775,79.128558,66.397785,69.554441,33826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48229,4570.523160,28.2,8.3,14190.0,46.1,15.6,23.9,27.2,8.0,82.4,...,55.568966,14776.0,76.720396,79.603081,73.986415,70.917126,79.605796,75.878105,71.008448,4098
48131,1793.476183,25.6,10.6,17864.0,32.9,17.8,26.1,26.2,15.8,90.9,...,77.899678,19853.0,79.125428,78.895880,76.629575,60.576045,73.670302,64.571017,68.007770,11355
48505,998.411980,39.5,11.0,17228.0,40.0,13.9,33.6,17.3,17.1,95.7,...,86.586509,16007.0,79.355639,79.572483,74.378252,77.443239,76.386871,74.001471,73.609838,14369
48507,1297.406535,34.8,8.4,13350.0,38.0,13.7,29.5,23.3,16.1,95.0,...,88.785822,13393.0,78.392216,76.024682,75.848196,76.967659,77.303576,70.010162,71.121990,12131


In [7]:
#create variable for case% for each counties population
deaths_df['death_pct'] = deaths_df['first_yr_deaths']/deaths_df['E_TOTPOP']*100
deaths_df['death_pct'].head()

FIPS
1001    0.179348
1009    0.227253
1013    0.329588
1015    0.264992
1017    0.345888
Name: death_pct, dtype: float64

In [8]:
deaths_df['death_pct'].describe()

count    3000.000000
mean        0.191211
std         0.106248
min         0.001401
25%         0.115764
50%         0.176377
75%         0.243764
max         0.788566
Name: death_pct, dtype: float64

In [9]:
# bin and cut the case_pct column into 2 classifications
q = deaths_df['death_pct'].quantile(.9)
bins = [0, q , 40]
labels = ['low','high']
deaths_df['death_class'] = pd.cut(deaths_df['death_pct'], bins, labels = labels)
deaths_df['death_class']

FIPS
1001      low
1009      low
1013     high
1015      low
1017     high
         ... 
48229     low
48131    high
48505     low
48507    high
48247     low
Name: death_class, Length: 3000, dtype: category
Categories (2, object): ['low' < 'high']

In [10]:
deaths_df['death_class'].value_counts()

low     2700
high     300
Name: death_class, dtype: int64

In [11]:
#drop unneeded columns
deaths_df = deaths_df.drop('death_pct', axis =1)
deaths_df

Unnamed: 0_level_0,AREA_SQMI,EP_POV,EP_UNEMP,EP_PCI,EP_NOHSDP,EP_AGE65,EP_AGE17,EP_DISABL,EP_SNGPNT,EP_MINRTY,...,Income Per Capita,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,E_TOTPOP,death_class
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,594.443459,15.4,4.2,29372.0,11.3,14.6,24.2,19.3,7.5,25.0,...,26168.0,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,55200,low
1009,644.830460,14.4,4.1,22656.0,19.8,17.8,23.4,14.2,7.0,12.9,...,21033.0,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,57645,low
1013,776.838201,23.5,6.7,20430.0,15.4,19.0,22.8,17.7,10.5,48.1,...,19011.0,78.563680,76.109761,76.623924,69.058104,79.956648,67.920284,72.773953,20025,high
1015,605.867251,18.6,8.8,24706.0,15.9,16.8,21.9,20.8,10.4,27.5,...,22231.0,79.439032,79.955121,77.918741,54.063568,76.745724,67.456150,68.292794,115098,low
1017,596.560643,16.6,5.0,22827.0,18.6,18.9,20.7,16.7,9.7,44.2,...,21532.0,76.995358,78.156771,75.891100,67.343775,79.128558,66.397785,69.554441,33826,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48229,4570.523160,28.2,8.3,14190.0,46.1,15.6,23.9,27.2,8.0,82.4,...,14776.0,76.720396,79.603081,73.986415,70.917126,79.605796,75.878105,71.008448,4098,low
48131,1793.476183,25.6,10.6,17864.0,32.9,17.8,26.1,26.2,15.8,90.9,...,19853.0,79.125428,78.895880,76.629575,60.576045,73.670302,64.571017,68.007770,11355,high
48505,998.411980,39.5,11.0,17228.0,40.0,13.9,33.6,17.3,17.1,95.7,...,16007.0,79.355639,79.572483,74.378252,77.443239,76.386871,74.001471,73.609838,14369,low
48507,1297.406535,34.8,8.4,13350.0,38.0,13.7,29.5,23.3,16.1,95.0,...,13393.0,78.392216,76.024682,75.848196,76.967659,77.303576,70.010162,71.121990,12131,high


In [12]:
#turn case % classifications into binary 
deaths_df = pd.get_dummies(deaths_df, columns = ['death_class'])
deaths_df

Unnamed: 0_level_0,AREA_SQMI,EP_POV,EP_UNEMP,EP_PCI,EP_NOHSDP,EP_AGE65,EP_AGE17,EP_DISABL,EP_SNGPNT,EP_MINRTY,...,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,E_TOTPOP,death_class_low,death_class_high
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,594.443459,15.4,4.2,29372.0,11.3,14.6,24.2,19.3,7.5,25.0,...,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,55200,1,0
1009,644.830460,14.4,4.1,22656.0,19.8,17.8,23.4,14.2,7.0,12.9,...,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,57645,1,0
1013,776.838201,23.5,6.7,20430.0,15.4,19.0,22.8,17.7,10.5,48.1,...,78.563680,76.109761,76.623924,69.058104,79.956648,67.920284,72.773953,20025,0,1
1015,605.867251,18.6,8.8,24706.0,15.9,16.8,21.9,20.8,10.4,27.5,...,79.439032,79.955121,77.918741,54.063568,76.745724,67.456150,68.292794,115098,1,0
1017,596.560643,16.6,5.0,22827.0,18.6,18.9,20.7,16.7,9.7,44.2,...,76.995358,78.156771,75.891100,67.343775,79.128558,66.397785,69.554441,33826,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48229,4570.523160,28.2,8.3,14190.0,46.1,15.6,23.9,27.2,8.0,82.4,...,76.720396,79.603081,73.986415,70.917126,79.605796,75.878105,71.008448,4098,1,0
48131,1793.476183,25.6,10.6,17864.0,32.9,17.8,26.1,26.2,15.8,90.9,...,79.125428,78.895880,76.629575,60.576045,73.670302,64.571017,68.007770,11355,0,1
48505,998.411980,39.5,11.0,17228.0,40.0,13.9,33.6,17.3,17.1,95.7,...,79.355639,79.572483,74.378252,77.443239,76.386871,74.001471,73.609838,14369,1,0
48507,1297.406535,34.8,8.4,13350.0,38.0,13.7,29.5,23.3,16.1,95.0,...,78.392216,76.024682,75.848196,76.967659,77.303576,70.010162,71.121990,12131,0,1


In [13]:
X = deaths_df.drop(columns= ['death_class_low','death_class_high','first_yr_deaths']).values
y= deaths_df['death_class_high'].values

In [14]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = .7, random_state=78)

In [15]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
rf_model = RandomForestClassifier(n_estimators=256, random_state=78) 

In [17]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [18]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [19]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [20]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,809,8
Actual 1,76,7


In [21]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
model = model.fit(X_train_scaled, y_train)

In [22]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [23]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,724,93
Actual 1,56,27


In [24]:
# We can sort the features by their importance.
X = deaths_df.drop(['death_class_low','death_class_high','first_yr_deaths'], axis=1).columns
importance_df = pd.DataFrame(sorted(zip(model.feature_importances_, X), reverse=True))
importance_df

Unnamed: 0,0,1
0,0.083465,E_TOTPOP
1,0.048931,SPL_THEMES
2,0.043139,Selflessness
3,0.038936,Conflict Awareness
4,0.038521,Religiosity
...,...,...
79,0.000000,EP_CROWD
80,0.000000,EP_AGE65
81,0.000000,EPL_MOBILE
82,0.000000,EPL_DISABL


In [25]:
impact = importance_df.loc[importance_df[0]>.01]
impact_col = impact[1].to_list()
impact

Unnamed: 0,0,1
0,0.083465,E_TOTPOP
1,0.048931,SPL_THEMES
2,0.043139,Selflessness
3,0.038936,Conflict Awareness
4,0.038521,Religiosity
5,0.038216,EPL_SNGPNT
6,0.037307,Agreeableness
7,0.033918,Collectivism
8,0.032345,EPL_AGE17
9,0.030923,Neuroticism


In [26]:
impact_df = deaths_df[impact_col]
impact_df.columns.to_list()

['E_TOTPOP',
 'SPL_THEMES',
 'Selflessness',
 'Conflict Awareness',
 'Religiosity',
 'EPL_SNGPNT',
 'Agreeableness',
 'Collectivism',
 'EPL_AGE17',
 'Neuroticism',
 'Income Per Capita',
 'EPL_GROUPQ',
 'dem_pct',
 'Work Ethic',
 'EPL_PCI',
 'Entrepreneurship',
 'EP_PCI',
 'Extraversion',
 'EP_AGE17',
 'SPL_THEME1',
 'F_THEME2',
 'EPL_MINRTY',
 'Risk Taking',
 'EP_UNINSUR',
 'RPL_THEMES',
 'EPL_AGE65',
 'SPL_THEME2',
 'RPL_THEME3',
 'Employment Rate',
 'SPL_THEME3',
 'EP_MOBILE',
 'Empathy',
 'Tolerance',
 'Conscientiousness',
 'EP_GROUPQ']

In [27]:
X = impact_df.values
y= deaths_df['death_class_high'].values

In [28]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = .7, random_state=78)
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [29]:
from sklearn.ensemble import GradientBoostingClassifier
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=128,
    learning_rate=learning_rate,
    max_features=5,
    max_depth=3,
    random_state=0)
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
           classifier.score(
           X_train_scaled,
           y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
           classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.933
Accuracy score (validation): 0.911
Learning rate:  0.1
Accuracy score (training): 0.951
Accuracy score (validation): 0.907
Learning rate:  0.25
Accuracy score (training): 0.987
Accuracy score (validation): 0.897
Learning rate:  0.5
Accuracy score (training): 1.000
Accuracy score (validation): 0.877
Learning rate:  0.75
Accuracy score (training): 1.000
Accuracy score (validation): 0.894
Learning rate:  1
Accuracy score (training): 1.000
Accuracy score (validation): 0.879


In [30]:
classifier = GradientBoostingClassifier(n_estimators=200,
   learning_rate=0.25, max_features='auto', max_depth=10, random_state=0)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [31]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.9077777777777778


In [32]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,803,14
Actual 1,69,14


In [33]:
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       817
           1       0.50      0.17      0.25        83

    accuracy                           0.91       900
   macro avg       0.71      0.58      0.60       900
weighted avg       0.88      0.91      0.89       900

