Calling Libraries

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datacleaner import autoclean

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [14]:
df = pd.read_csv("cancer_reg.csv")
df.head()

Unnamed: 0,avg_ann_count,avg_deaths_peryear,target_deathrate,incidence_rate,med_income,popest2015,poverty_percent,study_percap,binnedinc,median_age,...,pctprivatecoveragealone,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate
0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,"(61494.5, 125635]",39.3,...,,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,"(48021.6, 51046.4]",33.0,...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096
2,102.0,50,174.7,349.7,49348,21026,14.6,47.560164,"(48021.6, 51046.4]",45.0,...,43.5,34.9,42.1,21.1,90.92219,0.739673,0.465898,2.747358,54.444868,3.729488
3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,"(42724.4, 45201]",42.8,...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,"(48021.6, 51046.4]",48.3,...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.66583,0.492135,54.02746,6.796657


In [8]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avg_ann_count            3047 non-null   float64
 1   avg_deaths_peryear       3047 non-null   int64  
 2   target_deathrate         3047 non-null   float64
 3   incidence_rate           3047 non-null   float64
 4   med_income               3047 non-null   int64  
 5   popest2015               3047 non-null   int64  
 6   poverty_percent          3047 non-null   float64
 7   study_percap             3047 non-null   float64
 8   binnedinc                3047 non-null   object 
 9   median_age               3047 non-null   float64
 10  median_age_male          3047 non-null   float64
 11  median_age_female        3047 non-null   float64
 12  geography                3047 non-null   object 
 13  percent_married          3047 non-null   float64
 14  pctnohs18_24            

Unnamed: 0,avg_ann_count,avg_deaths_peryear,target_deathrate,incidence_rate,med_income,popest2015,poverty_percent,study_percap,binnedinc,median_age,...,pctprivatecoveragealone,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate
0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,"(61494.5, 125635]",39.3,...,,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,"(48021.6, 51046.4]",33.0,...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096
2,102.0,50,174.7,349.7,49348,21026,14.6,47.560164,"(48021.6, 51046.4]",45.0,...,43.5,34.9,42.1,21.1,90.92219,0.739673,0.465898,2.747358,54.444868,3.729488
3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,"(42724.4, 45201]",42.8,...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,"(48021.6, 51046.4]",48.3,...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.66583,0.492135,54.02746,6.796657


Using a New Library for Data Preprocessing

In [12]:
df.isnull().sum()

avg_ann_count                 0
avg_deaths_peryear            0
target_deathrate              0
incidence_rate                0
med_income                    0
popest2015                    0
poverty_percent               0
study_percap                  0
binnedinc                     0
median_age                    0
median_age_male               0
median_age_female             0
geography                     0
percent_married               0
pctnohs18_24                  0
pcths18_24                    0
pctsomecol18_24            2285
pctbachdeg18_24               0
pcths25_over                  0
pctbachdeg25_over             0
pctemployed16_over          152
pctunemployed16_over          0
pctprivatecoverage            0
pctprivatecoveragealone     609
pctempprivcoverage            0
pctpubliccoverage             0
pctpubliccoveragealone        0
pctwhite                      0
pctblack                      0
pctasian                      0
pctotherrace                  0
pctmarri

In [15]:
for col in df.columns:
  if df[col].dtypes == 'object':
    df[col].fillna(df[col].mode()[0], inplace=True)
  else:
    df[col].fillna(df[col].mean(), inplace=True)

In [16]:
df.isnull().sum()

avg_ann_count              0
avg_deaths_peryear         0
target_deathrate           0
incidence_rate             0
med_income                 0
popest2015                 0
poverty_percent            0
study_percap               0
binnedinc                  0
median_age                 0
median_age_male            0
median_age_female          0
geography                  0
percent_married            0
pctnohs18_24               0
pcths18_24                 0
pctsomecol18_24            0
pctbachdeg18_24            0
pcths25_over               0
pctbachdeg25_over          0
pctemployed16_over         0
pctunemployed16_over       0
pctprivatecoverage         0
pctprivatecoveragealone    0
pctempprivcoverage         0
pctpubliccoverage          0
pctpubliccoveragealone     0
pctwhite                   0
pctblack                   0
pctasian                   0
pctotherrace               0
pctmarriedhouseholds       0
birthrate                  0
dtype: int64

In [17]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avg_ann_count            3047 non-null   float64
 1   avg_deaths_peryear       3047 non-null   int64  
 2   target_deathrate         3047 non-null   float64
 3   incidence_rate           3047 non-null   float64
 4   med_income               3047 non-null   int64  
 5   popest2015               3047 non-null   int64  
 6   poverty_percent          3047 non-null   float64
 7   study_percap             3047 non-null   float64
 8   binnedinc                3047 non-null   object 
 9   median_age               3047 non-null   float64
 10  median_age_male          3047 non-null   float64
 11  median_age_female        3047 non-null   float64
 12  geography                3047 non-null   object 
 13  percent_married          3047 non-null   float64
 14  pctnohs18_24            

Unnamed: 0,avg_ann_count,avg_deaths_peryear,target_deathrate,incidence_rate,med_income,popest2015,poverty_percent,study_percap,binnedinc,median_age,...,pctprivatecoveragealone,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate
0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,"(61494.5, 125635]",39.3,...,48.453774,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,"(48021.6, 51046.4]",33.0,...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096
2,102.0,50,174.7,349.7,49348,21026,14.6,47.560164,"(48021.6, 51046.4]",45.0,...,43.5,34.9,42.1,21.1,90.92219,0.739673,0.465898,2.747358,54.444868,3.729488
3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,"(42724.4, 45201]",42.8,...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,"(48021.6, 51046.4]",48.3,...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.66583,0.492135,54.02746,6.796657


In [19]:
df.nunique()
df.value_counts()

avg_ann_count  avg_deaths_peryear  target_deathrate  incidence_rate  med_income  popest2015  poverty_percent  study_percap  binnedinc           median_age  median_age_male  median_age_female  geography                       percent_married  pctnohs18_24  pcths18_24  pctsomecol18_24  pctbachdeg18_24  pcths25_over  pctbachdeg25_over  pctemployed16_over  pctunemployed16_over  pctprivatecoverage  pctprivatecoveragealone  pctempprivcoverage  pctpubliccoverage  pctpubliccoveragealone  pctwhite    pctblack   pctasian   pctotherrace  pctmarriedhouseholds  birthrate
6.0            3                   214.4             373.3           43859       1130        11.7             0.000000      (42724.4, 45201]    51.2        53.0             49.6               Wibaux County, Montana          56.5             18.7          28.0        53.300000        0.0              37.4          16.4               54.7                3.0                   68.1                45.200000                36.2           

In [20]:
cat_col = df.select_dtypes(include='object')
le = LabelEncoder()

for col in cat_col:
  df[col] = le.fit_transform(df[col])


Scaling

In [21]:
scaler =  StandardScaler()
num_col = df.select_dtypes(include='number').columns
df[num_col] = scaler.fit_transform(df[num_col])
df.head()

Unnamed: 0,avg_ann_count,avg_deaths_peryear,target_deathrate,incidence_rate,med_income,popest2015,poverty_percent,study_percap,binnedinc,median_age,...,pctprivatecoveragealone,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate
0,0.558328,0.561518,-0.496057,0.761321,1.232312,0.478696,-0.886102,0.650277,1.218383,-0.131848,...,0.0,0.042734,-0.427608,-0.857336,-0.113862,-0.448196,1.367088,-0.039818,0.245324,0.241011
1,-0.306003,-0.230067,-0.625801,-0.672179,0.088363,-0.180448,0.268698,-0.249816,0.173418,-0.27093,...,0.592879,0.254461,-0.657187,-0.644641,0.340912,-0.56006,0.380201,0.49979,-0.893428,-0.658382
2,-0.35614,-0.269746,-0.142865,-1.806881,0.18979,-0.248055,-0.355519,-0.203646,0.173418,-0.006012,...,-0.549357,-0.66655,0.745793,0.304306,0.444328,-0.575848,-0.301959,0.217176,0.487085,-0.962391
3,-0.12664,0.031811,0.581539,-0.327553,-0.234279,-0.081322,0.034617,0.353585,-0.523226,-0.05458,...,-0.904226,-0.655964,1.153933,0.942391,0.494549,-0.572892,-0.035483,-0.17653,-0.033835,-0.52202
4,-0.387917,-0.31736,-1.234876,-1.799549,0.240214,-0.280592,-0.683232,-0.29346,0.173418,0.06684,...,-0.504998,-0.645377,0.988126,0.566084,0.63861,-0.608154,-0.225352,-0.424035,0.42357,0.582401


In [22]:
x = df.drop(columns=['target_deathrate'])
y = df['target_deathrate']

x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

# Linear Regression

In [23]:
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_val)

In [25]:
mse_lr = mean_squared_error(y_val, y_pred)
r2_lr = r2_score(y_val, y_pred)
print(f'MSE: {mse_lr}, R2: {r2_lr}')

MSE: 0.5192434833903561, R2: 0.5217746550908351


# Random Forest

In [28]:
rf = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=5, random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_val)

In [30]:
mse_rf = mean_squared_error(y_val, y_pred)
r2_rf = r2_score(y_val, y_pred)
print(f'MSE: {mse_rf}, R2: {r2_rf}')

MSE: 0.47452832808182493, R2: 0.5629574936899153


# Decision Tree

In [31]:
dt = DecisionTreeRegressor(max_depth=5, min_samples_split=5, random_state=42)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_val)

In [32]:
mse_dt = mean_squared_error(y_val, y_pred)
r2_dt = r2_score(y_val, y_pred)
print(f'MSE: {mse_dt}, R2: {r2_dt}')

MSE: 0.6799037878646977, R2: 0.37380586592320497


# LGBM 

In [36]:
lgb = LGBMRegressor(n_estimators=100, max_depth=10, random_state=42)
lgb.fit(x_train, y_train)
y_pred = lgb.predict(x_val)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7593
[LightGBM] [Info] Number of data points in the train set: 2437, number of used features: 32
[LightGBM] [Info] Start training from score -0.002001


In [37]:
mse_lgb = mean_squared_error(y_val, y_pred)
r2_lgb = r2_score(y_val, y_pred)
print(f'MSE: {mse_lgb}, R2: {r2_lgb}')

MSE: 0.3331770336219763, R2: 0.6931426066643606


In [42]:
data = [
  ['LinearRegression', '', mse_lr, r2_lr],
  ['RandomForestRegressor', '', mse_rf, r2_rf],
  ['DecisionTreeRegressor', '', mse_dt, r2_dt],
  ['LGBMRegressor', '', mse_lgb, r2_lgb]
]

headers = ['Model', 'MSE', 'R2-Score']
table = tabulate(data, headers=headers, tablefmt='grid')
print(table)

+-----------------------+---------+----------+------------+
|                       | Model   |      MSE |   R2-Score |
| LinearRegression      |         | 0.519243 |   0.521775 |
+-----------------------+---------+----------+------------+
| RandomForestRegressor |         | 0.474528 |   0.562957 |
+-----------------------+---------+----------+------------+
| DecisionTreeRegressor |         | 0.679904 |   0.373806 |
+-----------------------+---------+----------+------------+
| LGBMRegressor         |         | 0.333177 |   0.693143 |
+-----------------------+---------+----------+------------+
