In [23]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
import numpy as np

In [24]:
#read in incidence csv from github

incidence_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/main/Resources/Age_Incidence_Cancer_DB.csv")                         
#get rid of commas in columns
incidence_df = incidence_df.replace(',','', regex=True)
incidence_df.head()     

Unnamed: 0,average_annual_count,recent_trend,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,five_year_trend_rates,county_state,...,popestimate2016,popestimate2017,popestimate2018,popestimate2019,abbrv,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019
0,9,stable,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,167.8,2.3,Lowndes County AL,...,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
1,72,rising,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,50+,1738.4,2.0,Lowndes County AL,...,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
2,37,rising,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<65,339.4,2.4,Lowndes County AL,...,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
3,44,stable,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,65+,2416.0,1.8,Lowndes County AL,...,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
4,9,stable,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,145.2,2.0,Conecuh County AL,...,12500,12431,12292,12067,AL,29981.0,29758.0,30796.0,32613.0,41539.0


In [27]:
#check dtypes
incidence_df.dtypes

average_annual_count                     int64
recent_trend                            object
primary_state_name                      object
cancer                                  object
race_ethnicity                          object
sex                                     object
age                                     object
age_adjusted_rate                       object
five_year_trend_rates                   object
county_state                            object
index_of_medical_underservice_score    float64
popestimate2015                          int64
popestimate2016                          int64
popestimate2017                          int64
popestimate2018                          int64
popestimate2019                          int64
abbrv                                   object
median_household_income_2015           float64
median_household_income_2016           float64
median_household_income_2017           float64
median_household_income_2018           float64
median_househ

In [28]:
#get count of rows
incidence_df.count()

average_annual_count                   10229
recent_trend                           10229
primary_state_name                     10229
cancer                                 10229
race_ethnicity                         10229
sex                                    10229
age                                    10229
age_adjusted_rate                      10229
five_year_trend_rates                  10229
county_state                           10229
index_of_medical_underservice_score     8793
popestimate2015                        10229
popestimate2016                        10229
popestimate2017                        10229
popestimate2018                        10229
popestimate2019                        10229
abbrv                                  10210
median_household_income_2015           10210
median_household_income_2016           10210
median_household_income_2017           10210
median_household_income_2018           10210
median_household_income_2019           10210
dtype: int

In [29]:
#get count of values for age
incidence_df["age"].value_counts()

50+    2643
65+    2628
<65    2604
<50    2354
Name: age, dtype: int64

In [30]:
#create column for 5-year average median income 
incidence_df["5_year_avg_income"] = (incidence_df["median_household_income_2015"]+incidence_df["median_household_income_2016"]+incidence_df["median_household_income_2017"]+incidence_df["median_household_income_2018"]+incidence_df["median_household_income_2019"])/5
incidence_df.head()

Unnamed: 0,average_annual_count,recent_trend,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,five_year_trend_rates,county_state,...,popestimate2017,popestimate2018,popestimate2019,abbrv,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019,5_year_avg_income
0,9,stable,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,167.8,2.3,Lowndes County AL,...,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0,32832.2
1,72,rising,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,50+,1738.4,2.0,Lowndes County AL,...,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0,32832.2
2,37,rising,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<65,339.4,2.4,Lowndes County AL,...,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0,32832.2
3,44,stable,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,65+,2416.0,1.8,Lowndes County AL,...,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0,32832.2
4,9,stable,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,145.2,2.0,Conecuh County AL,...,12431,12292,12067,AL,29981.0,29758.0,30796.0,32613.0,41539.0,32937.4


In [31]:
#get count of NA values for avg income
incidence_df["5_year_avg_income"].isna().sum()

19

In [32]:
#drop rows where avg income is null
incidence_df = incidence_df[incidence_df["5_year_avg_income"].isna() == False]

In [33]:
#get count of rows
incidence_df.count()

average_annual_count                   10210
recent_trend                           10210
primary_state_name                     10210
cancer                                 10210
race_ethnicity                         10210
sex                                    10210
age                                    10210
age_adjusted_rate                      10210
five_year_trend_rates                  10210
county_state                           10210
index_of_medical_underservice_score     8786
popestimate2015                        10210
popestimate2016                        10210
popestimate2017                        10210
popestimate2018                        10210
popestimate2019                        10210
abbrv                                  10210
median_household_income_2015           10210
median_household_income_2016           10210
median_household_income_2017           10210
median_household_income_2018           10210
median_household_income_2019           10210
5_year_avg

In [34]:
#keep only columns with 65+ and <65
incidence_df_65 = incidence_df[incidence_df["age"] != "50+"] 
incidence_df_65 = incidence_df_65[incidence_df_65["age"] != "<50"] 
incidence_df_65["age"].value_counts()

65+    2623
<65    2599
Name: age, dtype: int64

In [35]:
incidence_df_65.count()

average_annual_count                   5222
recent_trend                           5222
primary_state_name                     5222
cancer                                 5222
race_ethnicity                         5222
sex                                    5222
age                                    5222
age_adjusted_rate                      5222
five_year_trend_rates                  5222
county_state                           5222
index_of_medical_underservice_score    4498
popestimate2015                        5222
popestimate2016                        5222
popestimate2017                        5222
popestimate2018                        5222
popestimate2019                        5222
abbrv                                  5222
median_household_income_2015           5222
median_household_income_2016           5222
median_household_income_2017           5222
median_household_income_2018           5222
median_household_income_2019           5222
5_year_avg_income               

In [36]:
#create target 
y = incidence_df_65[['age_adjusted_rate']]

#change dtype to numeric
y['age_adjusted_rate'] = y['age_adjusted_rate'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,age_adjusted_rate
2,339.4
3,2416.0
6,276.9
7,1659.8
10,270.3


In [46]:
#create feautures variable holding ages and income 
X = incidence_df_65[["age", "5_year_avg_income"]]
X.head()

Unnamed: 0,age,5_year_avg_income
2,<65,32832.2
3,65+,32832.2
6,<65,32937.4
7,65+,32937.4
10,<65,27457.6


In [47]:
#create dummy variables for age
X = pd.get_dummies(X, columns=["age"])
X.head()

Unnamed: 0,5_year_avg_income,age_65+,age_<65
2,32832.2,0,1
3,32832.2,1,0
6,32937.4,0,1
7,32937.4,1,0
10,27457.6,0,1


In [48]:
#get shape of target array
y.shape

(5222, 1)

In [49]:
#get shape of feautures array
X.shape

(5222, 3)

In [50]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [51]:
#create linear regression model using sklearn
model = LinearRegression()

In [52]:
#fit the model with training data
regression_age_inc_incidence = model.fit(X_train, y_train)

In [53]:
#Create predictions array using linear regression
y_pred = regression_age_inc_incidence.predict(X_test)
y_pred

array([[1972.26562244],
       [ 250.10359769],
       [ 245.62819454],
       ...,
       [ 260.71867008],
       [ 241.83195265],
       [1966.33368155]])

In [54]:
#get R2 value from linear regression
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9532117568969551

In [55]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [56]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [58]:
#view model summary with statsmodels
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      age_adjusted_rate   R-squared:                       0.955
Model:                            OLS   Adj. R-squared:                  0.955
Method:                 Least Squares   F-statistic:                 5.512e+04
Date:                Thu, 02 Jun 2022   Prob (F-statistic):               0.00
Time:                        11:39:33   Log-Likelihood:                -34765.
No. Observations:                5222   AIC:                         6.954e+04
Df Residuals:                    5219   BIC:                         6.956e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               710.1237      6.90

In [59]:
#fit the model with training data
#decision tree regression
regressor_DT = DecisionTreeRegressor()
regressor_DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [60]:
#predict
y_pred = regressor_DT.predict(X_test)
y_pred

array([1883.5,  229.1,  197.8, ...,  221.3,  238.8, 2185.9])

In [61]:
#calculate r2 score
r2_score(y_test, y_pred)

0.9137466146317845