In [1]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
import numpy as np

In [2]:
#read in incidence csv from github

mortality_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/main/Resources/Age_Mortality_Cancer_DB.csv")                         
#get rid of commas in columns
mortality_df = mortality_df.replace(',','', regex=True)
mortality_df.head()     

Unnamed: 0,average_annual_count,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,county_state,index_of_medical_underservice_score,popestimate2015,popestimate2016,popestimate2017,popestimate2018,popestimate2019,abbrv,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019
0,4,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,38.8,Marengo County AL,54.0,19766,19525,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0
1,48,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,50+,606.5,Marengo County AL,54.0,19766,19525,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0
2,18,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<65,84.7,Marengo County AL,54.0,19766,19525,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0
3,35,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,65+,963.0,Marengo County AL,54.0,19766,19525,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0
4,9,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,30.2,Jackson County AL,53.0,52195,51988,51828,51621,51626,AL,40201.0,41407.0,42658.0,41929.0,44322.0


In [3]:
#check dtypes
mortality_df.dtypes

average_annual_count                     int64
primary_state_name                      object
cancer                                  object
race_ethnicity                          object
sex                                     object
age                                     object
age_adjusted_rate                       object
county_state                            object
index_of_medical_underservice_score    float64
popestimate2015                          int64
popestimate2016                          int64
popestimate2017                          int64
popestimate2018                          int64
popestimate2019                          int64
abbrv                                   object
median_household_income_2015           float64
median_household_income_2016           float64
median_household_income_2017           float64
median_household_income_2018           float64
median_household_income_2019           float64
dtype: object

In [4]:
#get count of rows
mortality_df.count()

average_annual_count                   8170
primary_state_name                     8170
cancer                                 8170
race_ethnicity                         8170
sex                                    8170
age                                    8170
age_adjusted_rate                      8170
county_state                           8170
index_of_medical_underservice_score    7091
popestimate2015                        8170
popestimate2016                        8170
popestimate2017                        8170
popestimate2018                        8170
popestimate2019                        8170
abbrv                                  8152
median_household_income_2015           8152
median_household_income_2016           8152
median_household_income_2017           8152
median_household_income_2018           8152
median_household_income_2019           8152
dtype: int64

In [5]:
#get count of values for age
mortality_df["age"].value_counts()

50+    2420
65+    2406
<65    2198
<50    1146
Name: age, dtype: int64

In [6]:
#get count of NA values for MUA
mortality_df["index_of_medical_underservice_score"].isna().sum()

1079

In [7]:
#drop rows where MUA index is null
mortality_df = mortality_df[mortality_df["index_of_medical_underservice_score"].isna() == False]

In [8]:
#get count of rows
mortality_df.count()

average_annual_count                   7091
primary_state_name                     7091
cancer                                 7091
race_ethnicity                         7091
sex                                    7091
age                                    7091
age_adjusted_rate                      7091
county_state                           7091
index_of_medical_underservice_score    7091
popestimate2015                        7091
popestimate2016                        7091
popestimate2017                        7091
popestimate2018                        7091
popestimate2019                        7091
abbrv                                  7085
median_household_income_2015           7085
median_household_income_2016           7085
median_household_income_2017           7085
median_household_income_2018           7085
median_household_income_2019           7085
dtype: int64

In [9]:
#keep only columns with 65+ and <65
mortality_df_65 = mortality_df[mortality_df["age"] != "50+"] 
mortality_df_65 = mortality_df_65[mortality_df_65["age"] != "<50"] 
mortality_df_65["age"].value_counts()

65+    2086
<65    1912
Name: age, dtype: int64

In [10]:
mortality_df_65.count()

average_annual_count                   3998
primary_state_name                     3998
cancer                                 3998
race_ethnicity                         3998
sex                                    3998
age                                    3998
age_adjusted_rate                      3998
county_state                           3998
index_of_medical_underservice_score    3998
popestimate2015                        3998
popestimate2016                        3998
popestimate2017                        3998
popestimate2018                        3998
popestimate2019                        3998
abbrv                                  3995
median_household_income_2015           3995
median_household_income_2016           3995
median_household_income_2017           3995
median_household_income_2018           3995
median_household_income_2019           3995
dtype: int64

In [11]:
#create target 
y = mortality_df_65[['age_adjusted_rate']]

#change dtype to numeric
y['age_adjusted_rate'] = y['age_adjusted_rate'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,age_adjusted_rate
2,84.7
3,963.0
6,80.3
7,986.9
10,69.2


In [12]:
#create feautures variable holding ages a d mua score
X = mortality_df_65[["age", "index_of_medical_underservice_score"]]
X.head()

Unnamed: 0,age,index_of_medical_underservice_score
2,<65,54.0
3,65+,54.0
6,<65,53.0
7,65+,53.0
10,<65,61.0


In [13]:
#create dummy variables for age
X = pd.get_dummies(X, columns=["age"])
X.head()

Unnamed: 0,index_of_medical_underservice_score,age_65+,age_<65
2,54.0,0,1
3,54.0,1,0
6,53.0,0,1
7,53.0,1,0
10,61.0,0,1


In [14]:
#get shape of target array
y.shape

(3998, 1)

In [15]:
#get shape of feautures array
X.shape

(3998, 3)

In [16]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [17]:
#create linear regression model using sklearn
model = LinearRegression()

In [18]:
#fit the model with training data
regression_age_MUA_mortality = model.fit(X_train, y_train)

In [19]:
#Create predictions array using linear regression
y_pred = regression_age_MUA_mortality.predict(X_test)
y_pred

array([[926.75081297],
       [926.31179532],
       [ 57.00468088],
       [ 56.41932401],
       [ 59.93146524],
       [ 56.85834167],
       [ 65.3460163 ],
       [929.1410202 ],
       [927.66543309],
       [930.99465029],
       [ 57.1510201 ],
       [ 56.56566323],
       [926.89715219],
       [ 57.59003776],
       [926.01911689],
       [ 57.88271619],
       [ 60.07780446],
       [ 57.1510201 ],
       [ 55.10227105],
       [ 57.00468088],
       [ 56.71200245],
       [ 65.3460163 ],
       [926.82398258],
       [926.53130415],
       [ 65.3460163 ],
       [ 56.66322271],
       [ 56.49249362],
       [926.60447376],
       [ 62.36069626],
       [ 58.90709072],
       [926.89715219],
       [926.26301558],
       [926.60447376],
       [ 56.85834167],
       [926.1654561 ],
       [ 57.1510201 ],
       [929.82393655],
       [930.40929342],
       [ 56.85834167],
       [ 57.1510201 ],
       [927.7751875 ],
       [ 56.56566323],
       [ 56.41932401],
       [ 57

In [20]:
#get R2 value from linear regression
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9412026496519436

In [21]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [22]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [23]:
#view model summary with statsmodels
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      age_adjusted_rate   R-squared:                       0.946
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                 3.531e+04
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        20:42:13   Log-Likelihood:                -24213.
No. Observations:                3998   AIC:                         4.843e+04
Df Residuals:                    3995   BIC:                         4.845e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
co

In [24]:
#fit the model with training data
#decision tree regression
regressor_DT = DecisionTreeRegressor()
regressor_DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [25]:
#predict
y_pred = regressor_DT.predict(X_test)
y_pred

array([ 956.07157895,  934.78762887,   58.51764706,   57.85942029,
         66.75714286,   58.975     ,   47.20277778, 1042.7       ,
       1009.5       ,  950.56666667,   58.85769231,   56.90092593,
        952.69701493,   59.53703704,  942.006875  ,   62.56875   ,
         64.4       ,   58.85769231,   49.15      ,   58.51764706,
         57.1576087 ,   47.20277778,  907.7       ,  922.62666667,
         47.20277778,   42.23333333,   52.5       ,  939.48736842,
         69.6       ,   64.22105263,  952.69701493,  893.4       ,
        939.48736842,   58.975     ,  925.55121951,   58.85769231,
        925.36      , 1016.02857143,   58.975     ,   58.85769231,
        930.46744186,   56.90092593,   57.85942029,   58.85769231,
        930.46744186,   58.975     ,  942.006875  ,   57.85942029,
        818.7       ,   51.56666667,   57.1576087 ,   57.85942029,
         57.85942029,   56.90092593,   65.3       ,   61.24411765,
         52.5       ,  934.78762887,   61.24411765,  934.78762

In [26]:
#calculate r2 score
r2_score(y_test, y_pred)

0.9381360798622791