In [1]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
import numpy as np

In [2]:
#read in race incidence csv from github

incidence_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/main/Resources/Race_Incidence_Cancer_DB.csv")                         
#get rid of commas in columns
incidence_df = incidence_df.replace(',','', regex=True)
incidence_df.head()     

Unnamed: 0,average_annual_count,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,county_state,index_of_medical_underservice_score,popestimate2015,popestimate2016,popestimate2017,popestimate2018,popestimate2019,abbrv,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019
0,30,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,681.2,Lowndes County AL,54.0,10350,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
1,30,Alabama,All Cancer Sites,White Non-Hispanic,All Sexes,All Ages,694.0,Lowndes County AL,54.0,10350,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
2,51,Alabama,All Cancer Sites,Black (includes Hispanic),All Sexes,All Ages,570.1,Lowndes County AL,54.0,10350,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
3,80,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,558.3,Crenshaw County AL,58.0,13850,13916,13858,13825,13772,AL,36759.0,37374.0,38419.0,39057.0,43309.0
4,80,Alabama,All Cancer Sites,White Non-Hispanic,All Sexes,All Ages,565.3,Crenshaw County AL,58.0,13850,13916,13858,13825,13772,AL,36759.0,37374.0,38419.0,39057.0,43309.0


In [3]:
#get count of rows
incidence_df.count()

average_annual_count                   9160
primary_state_name                     9160
cancer                                 9160
race_ethnicity                         9160
sex                                    9160
age                                    9160
age_adjusted_rate                      9160
county_state                           9160
index_of_medical_underservice_score    7931
popestimate2015                        9160
popestimate2016                        9160
popestimate2017                        9160
popestimate2018                        9160
popestimate2019                        9160
abbrv                                  9146
median_household_income_2015           9146
median_household_income_2016           9146
median_household_income_2017           9146
median_household_income_2018           9146
median_household_income_2019           9146
dtype: int64

In [4]:
#get count of values for race
incidence_df["race_ethnicity"].value_counts()

White (includes Hispanic)                       2643
  White Non-Hispanic                            2335
Black (includes Hispanic)                       1308
Hispanic (any race)                             1009
  White Hispanic                                 912
Asian / Pacific Islander (includes Hispanic)     525
Amer. Indian / AK Native (includes Hispanic)     428
Name: race_ethnicity, dtype: int64

In [5]:
#clean race values
incidence_df['race_ethnicity'] = incidence_df['race_ethnicity'].str.strip()

In [6]:
#get count of values for race
incidence_df["race_ethnicity"].value_counts()

White (includes Hispanic)                       2643
White Non-Hispanic                              2335
Black (includes Hispanic)                       1308
Hispanic (any race)                             1009
White Hispanic                                   912
Asian / Pacific Islander (includes Hispanic)     525
Amer. Indian / AK Native (includes Hispanic)     428
Name: race_ethnicity, dtype: int64

In [7]:
#drop rows where race equals white non-hispanic and race equals white hispanic to avoid overlap
incidence_df = incidence_df[incidence_df["race_ethnicity"] != "White Non-Hispanic"]
incidence_df = incidence_df[incidence_df["race_ethnicity"] != "White Hispanic"]

In [8]:
#get count of values for race
incidence_df["race_ethnicity"].value_counts()

White (includes Hispanic)                       2643
Black (includes Hispanic)                       1308
Hispanic (any race)                             1009
Asian / Pacific Islander (includes Hispanic)     525
Amer. Indian / AK Native (includes Hispanic)     428
Name: race_ethnicity, dtype: int64

In [9]:
#create column for 5-year average median income 
incidence_df["5_year_avg_income"] = (incidence_df["median_household_income_2015"]+incidence_df["median_household_income_2016"]+incidence_df["median_household_income_2017"]+incidence_df["median_household_income_2018"]+incidence_df["median_household_income_2019"])/5
incidence_df.head()

Unnamed: 0,average_annual_count,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,county_state,index_of_medical_underservice_score,popestimate2015,...,popestimate2017,popestimate2018,popestimate2019,abbrv,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019,5_year_avg_income
0,30,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,681.2,Lowndes County AL,54.0,10350,...,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0,32832.2
2,51,Alabama,All Cancer Sites,Black (includes Hispanic),All Sexes,All Ages,570.1,Lowndes County AL,54.0,10350,...,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0,32832.2
3,80,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,558.3,Crenshaw County AL,58.0,13850,...,13858,13825,13772,AL,36759.0,37374.0,38419.0,39057.0,43309.0,38983.6
5,18,Alabama,All Cancer Sites,Black (includes Hispanic),All Sexes,All Ages,442.0,Crenshaw County AL,58.0,13850,...,13858,13825,13772,AL,36759.0,37374.0,38419.0,39057.0,43309.0,38983.6
6,15,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,542.5,Greene County AL,56.0,8508,...,8310,8209,8111,AL,25398.0,26559.0,28108.0,28524.0,28699.0,27457.6


In [10]:
#get count of NA values for avg income
incidence_df["5_year_avg_income"].isna().sum()

8

In [11]:
#drop rows where avg income is null
incidence_df = incidence_df[incidence_df["5_year_avg_income"].isna() == False]

In [12]:
#get count of rows
incidence_df.count()

average_annual_count                   5905
primary_state_name                     5905
cancer                                 5905
race_ethnicity                         5905
sex                                    5905
age                                    5905
age_adjusted_rate                      5905
county_state                           5905
index_of_medical_underservice_score    5162
popestimate2015                        5905
popestimate2016                        5905
popestimate2017                        5905
popestimate2018                        5905
popestimate2019                        5905
abbrv                                  5905
median_household_income_2015           5905
median_household_income_2016           5905
median_household_income_2017           5905
median_household_income_2018           5905
median_household_income_2019           5905
5_year_avg_income                      5905
dtype: int64

In [13]:
#create target 
y = incidence_df[['age_adjusted_rate']]

#change dtype to numeric
y['age_adjusted_rate'] = y['age_adjusted_rate'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,age_adjusted_rate
0,681.2
2,570.1
3,558.3
5,442.0
6,542.5


In [14]:
#create feautures variable holding ages
X = incidence_df[["race_ethnicity", "5_year_avg_income"]]
X.head()

Unnamed: 0,race_ethnicity,5_year_avg_income
0,White (includes Hispanic),32832.2
2,Black (includes Hispanic),32832.2
3,White (includes Hispanic),38983.6
5,Black (includes Hispanic),38983.6
6,White (includes Hispanic),27457.6


In [15]:
#create dummy variables for age
X = pd.get_dummies(X, columns=["race_ethnicity"])
X.head()

Unnamed: 0,5_year_avg_income,race_ethnicity_Amer. Indian / AK Native (includes Hispanic),race_ethnicity_Asian / Pacific Islander (includes Hispanic),race_ethnicity_Black (includes Hispanic),race_ethnicity_Hispanic (any race),race_ethnicity_White (includes Hispanic)
0,32832.2,0,0,0,0,1
2,32832.2,0,0,1,0,0
3,38983.6,0,0,0,0,1
5,38983.6,0,0,1,0,0
6,27457.6,0,0,0,0,1


In [16]:
#get shape of target array
y.shape

(5905, 1)

In [17]:
#get shape of feautures array
X.shape

(5905, 6)

In [18]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
#create linear regression model using sklearn
model = LinearRegression()

In [20]:
#fit the model
regression_race_inc_incidence = model.fit(X_train ,y_train)

In [21]:
#Create predictions array using linear regression
y_pred = regression_race_inc_incidence.predict(X_test)
y_pred

array([[442.47074568],
       [452.85833815],
       [448.7110197 ],
       ...,
       [448.85233471],
       [453.88593974],
       [275.71217102]])

In [22]:
#get R2 value from linear regression
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.2525197916534646

In [23]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [24]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [25]:
#view model summary with statsmodels
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      age_adjusted_rate   R-squared:                       0.214
Model:                            OLS   Adj. R-squared:                  0.213
Method:                 Least Squares   F-statistic:                     321.3
Date:                Thu, 02 Jun 2022   Prob (F-statistic):          3.29e-305
Time:                        18:26:55   Log-Likelihood:                -36186.
No. Observations:                5905   AIC:                         7.238e+04
Df Residuals:                    5899   BIC:                         7.242e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

In [26]:
##decision tree model
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [27]:
#fit the model with training data
regressor_DT = DecisionTreeRegressor()
regressor_DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [28]:
#predict
y_pred = regressor_DT.predict(X)
y_pred

array([623.3, 570.1, 558.3, ..., 381.9, 300.9, 269.1])

In [29]:

#calculate r2 score
r2_score(y, y_pred)

0.5294418592527776