In [1]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
import numpy as np

In [2]:
#read in incidence csv from github

mortality_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/main/Resources/Race_Mortality_Cancer_DB.csv")                         
#get rid of commas in columns
mortality_df = mortality_df.replace(',','', regex=True)
mortality_df.head()     

Unnamed: 0,average_annual_count,recent_trend,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,five_year_trend_rates,county_state,...,popestimate2015,popestimate2016,popestimate2017,popestimate2018,popestimate2019,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019
0,181,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,213.2,213.2,Walker County AL,...,64931,64533,63895,63669,63521,37305.0,39511.0,41925.0,45434.0,45991.0
1,180,stable,Alabama,All Cancer Sites,White Non-Hispanic,All Sexes,All Ages,213.9,213.9,Walker County AL,...,64931,64533,63895,63669,63521,37305.0,39511.0,41925.0,45434.0,45991.0
2,10,stable,Alabama,All Cancer Sites,Black (includes Hispanic),All Sexes,All Ages,281.1,281.1,Walker County AL,...,64931,64533,63895,63669,63521,37305.0,39511.0,41925.0,45434.0,45991.0
3,32,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,211.2,211.2,Crenshaw County AL,...,13850,13916,13858,13825,13772,36759.0,37374.0,38419.0,39057.0,43309.0
4,32,stable,Alabama,All Cancer Sites,White Non-Hispanic,All Sexes,All Ages,214.0,214.0,Crenshaw County AL,...,13850,13916,13858,13825,13772,36759.0,37374.0,38419.0,39057.0,43309.0


In [3]:
#check dtypes
mortality_df.dtypes

average_annual_count                     int64
recent_trend                            object
primary_state_name                      object
cancer                                  object
race_ethnicity                          object
sex                                     object
age                                     object
age_adjusted_rate                       object
five_year_trend_rates                   object
county_state                            object
index_of_medical_underservice_score    float64
popestimate2015                          int64
popestimate2016                          int64
popestimate2017                          int64
popestimate2018                          int64
popestimate2019                          int64
median_household_income_2015           float64
median_household_income_2016           float64
median_household_income_2017           float64
median_household_income_2018           float64
median_household_income_2019           float64
dtype: object

In [4]:
#get count of rows
mortality_df.count()

average_annual_count                   7341
recent_trend                           7341
primary_state_name                     7341
cancer                                 7341
race_ethnicity                         7341
sex                                    7341
age                                    7341
age_adjusted_rate                      7341
five_year_trend_rates                  7341
county_state                           7341
index_of_medical_underservice_score    6439
popestimate2015                        7341
popestimate2016                        7341
popestimate2017                        7341
popestimate2018                        7341
popestimate2019                        7341
median_household_income_2015           7328
median_household_income_2016           7328
median_household_income_2017           7328
median_household_income_2018           7328
median_household_income_2019           7328
dtype: int64

In [5]:
#get count of values for race
mortality_df["race_ethnicity"].value_counts()

White (includes Hispanic)                       2414
  White Non-Hispanic                            2311
Black (includes Hispanic)                       1012
Hispanic (any race)                              572
  White Hispanic                                 560
Asian / Pacific Islander (includes Hispanic)     286
Amer. Indian / AK Native (includes Hispanic)     186
Name: race_ethnicity, dtype: int64

In [6]:
#clean race values
mortality_df['race_ethnicity'] = mortality_df['race_ethnicity'].str.strip()

In [7]:
#get count of values for race
mortality_df["race_ethnicity"].value_counts()

White (includes Hispanic)                       2414
White Non-Hispanic                              2311
Black (includes Hispanic)                       1012
Hispanic (any race)                              572
White Hispanic                                   560
Asian / Pacific Islander (includes Hispanic)     286
Amer. Indian / AK Native (includes Hispanic)     186
Name: race_ethnicity, dtype: int64

In [8]:
#drop rows where race equals white non-hispanic and race equals white hispanic to avoid overlap
mortality_df = mortality_df[mortality_df["race_ethnicity"] != "White Non-Hispanic"]
mortality_df = mortality_df[mortality_df["race_ethnicity"] != "White Hispanic"]

In [9]:
#get count of values for race
mortality_df["race_ethnicity"].value_counts()

White (includes Hispanic)                       2414
Black (includes Hispanic)                       1012
Hispanic (any race)                              572
Asian / Pacific Islander (includes Hispanic)     286
Amer. Indian / AK Native (includes Hispanic)     186
Name: race_ethnicity, dtype: int64

In [10]:
#get count of NA values for MUA
mortality_df["index_of_medical_underservice_score"].isna().sum()

506

In [11]:
#drop rows where MUA index is null
mortality_df = mortality_df[mortality_df["index_of_medical_underservice_score"].isna() == False]

In [12]:
#get count of rows
mortality_df.count()

average_annual_count                   3964
recent_trend                           3964
primary_state_name                     3964
cancer                                 3964
race_ethnicity                         3964
sex                                    3964
age                                    3964
age_adjusted_rate                      3964
five_year_trend_rates                  3964
county_state                           3964
index_of_medical_underservice_score    3964
popestimate2015                        3964
popestimate2016                        3964
popestimate2017                        3964
popestimate2018                        3964
popestimate2019                        3964
median_household_income_2015           3962
median_household_income_2016           3962
median_household_income_2017           3962
median_household_income_2018           3962
median_household_income_2019           3962
dtype: int64

In [13]:
#create target 
y = mortality_df[['age_adjusted_rate']]

#change dtype to numeric
y['age_adjusted_rate'] = y['age_adjusted_rate'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,age_adjusted_rate
0,213.2
2,281.1
3,211.2
5,175.2
6,201.7


In [14]:
#create feautures variable holding ages and mua score
X = mortality_df[["race_ethnicity", "index_of_medical_underservice_score"]]
X.head()

Unnamed: 0,race_ethnicity,index_of_medical_underservice_score
0,White (includes Hispanic),61.0
2,Black (includes Hispanic),61.0
3,White (includes Hispanic),58.0
5,Black (includes Hispanic),58.0
6,White (includes Hispanic),46.0


In [15]:
#create dummy variables for race
X = pd.get_dummies(X, columns=["race_ethnicity"])
X.head()

Unnamed: 0,index_of_medical_underservice_score,race_ethnicity_Amer. Indian / AK Native (includes Hispanic),race_ethnicity_Asian / Pacific Islander (includes Hispanic),race_ethnicity_Black (includes Hispanic),race_ethnicity_Hispanic (any race),race_ethnicity_White (includes Hispanic)
0,61.0,0,0,0,0,1
2,61.0,0,0,1,0,0
3,58.0,0,0,0,0,1
5,58.0,0,0,1,0,0
6,46.0,0,0,0,0,1


In [16]:
#get shape of target array
y.shape

(3964, 1)

In [17]:
#get shape of feautures array
X.shape

(3964, 6)

In [18]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
#create linear regression model using sklearn
model = LinearRegression()

In [20]:
#fit the model with training data
regression_race_MUA_mortality = model.fit(X_train, y_train)

In [21]:
#Create predictions array using linear regression
y_pred = regression_race_MUA_mortality.predict(X_test)
y_pred

array([[183.86354931],
       [165.80910309],
       [168.24498049],
       [184.05038936],
       [110.44318052],
       [165.93730716],
       [110.82779274],
       [158.04035257],
       [110.74232336],
       [165.96579696],
       [184.03448808],
       [165.77705207],
       [ 90.65438759],
       [158.46769949],
       [166.13266575],
       [ 91.78990939],
       [110.61411929],
       [165.85183778],
       [166.10824593],
       [165.80910309],
       [166.32191938],
       [165.72363371],
       [ 90.95353043],
       [184.11995746],
       [110.40044583],
       [110.44318052],
       [184.17434707],
       [184.20542684],
       [110.91936708],
       [166.27918469],
       [165.89457247],
       [165.72363371],
       [184.20542684],
       [183.906284  ],
       [165.83047043],
       [184.07722277],
       [166.63530712],
       [165.68089901],
       [165.03987865],
       [165.63816432],
       [167.17661321],
       [184.07722277],
       [165.89457247],
       [158

In [22]:
#get R2 value from linear regression
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.38614192176001216

In [23]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [24]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [25]:
#view model summary with statsmodels
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      age_adjusted_rate   R-squared:                       0.384
Model:                            OLS   Adj. R-squared:                  0.383
Method:                 Least Squares   F-statistic:                     494.0
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        22:32:47   Log-Likelihood:                -19772.
No. Observations:                3964   AIC:                         3.956e+04
Df Residuals:                    3958   BIC:                         3.959e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

In [26]:
#fit the model with training data
#decision tree regression
regressor_DT = DecisionTreeRegressor()
regressor_DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [27]:
#predict
y_pred = regressor_DT.predict(X_test)
y_pred

array([191.92982456, 169.60957447, 151.62916667, 194.83333333,
       110.41153846, 161.08596491,  93.26666667, 147.94      ,
        99.5375    , 172.1       , 194.30588235, 160.05      ,
        95.05      , 192.1       , 199.        ,  69.        ,
       148.66      , 168.27391304, 161.93783784, 169.60957447,
       162.92083333, 169.08387097,  89.18      , 201.71428571,
       103.88333333, 110.41153846, 220.1       , 166.32      ,
       116.31538462, 173.34814815, 163.92142857, 169.08387097,
       166.32      , 172.91162791, 154.84444444, 178.35833333,
       171.24545455, 167.47372881, 168.65      , 168.82066667,
       183.55      , 178.35833333, 163.92142857, 117.4       ,
       160.73636364, 194.44285714, 169.88070175, 115.37      ,
       190.42      , 141.7       , 167.47372881, 166.77699115,
       188.06666667, 130.5       , 151.775     , 164.2       ,
       168.85384615, 146.15      , 120.9       , 170.57435897,
       182.44545455, 184.35454545,  82.54285714, 154.84

In [28]:
#calculate r2 score
r2_score(y_test, y_pred)

0.29783709079357734