In [7]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
import numpy as np

In [8]:
#read in incidence csv from github

mortality_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/main/Resources/Sex_Mortality_Cancer_DB.csv")                         
#get rid of commas in columns
mortality_df = mortality_df.replace(',','', regex=True)
mortality_df.head()     

Unnamed: 0,average_annual_count,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,county_state,index_of_medical_underservice_score,popestimate2015,popestimate2016,popestimate2017,popestimate2018,popestimate2019,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019
0,18,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,296.3,Wilcox County AL,51.0,10896,10844,10691,10599,10373,23014.0,24216.0,25700.0,25385.0,30998.0
1,13,Alabama,All Cancer Sites,All Races (includes Hispanic),Females,All Ages,153.2,Wilcox County AL,51.0,10896,10844,10691,10599,10373,23014.0,24216.0,25700.0,25385.0,30998.0
2,32,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,290.7,Macon County AL,28.0,19296,19060,18793,18321,18068,29522.0,30681.0,30849.0,32495.0,34281.0
3,22,Alabama,All Cancer Sites,All Races (includes Hispanic),Females,All Ages,146.1,Macon County AL,28.0,19296,19060,18793,18321,18068,29522.0,30681.0,30849.0,32495.0,34281.0
4,17,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,283.8,Lowndes County AL,54.0,10350,10248,10097,9974,9726,31117.0,32011.0,33130.0,33973.0,33930.0


In [9]:
#check dtypes
mortality_df.dtypes

average_annual_count                     int64
primary_state_name                      object
cancer                                  object
race_ethnicity                          object
sex                                     object
age                                     object
age_adjusted_rate                      float64
county_state                            object
index_of_medical_underservice_score    float64
popestimate2015                          int64
popestimate2016                          int64
popestimate2017                          int64
popestimate2018                          int64
popestimate2019                          int64
median_household_income_2015           float64
median_household_income_2016           float64
median_household_income_2017           float64
median_household_income_2018           float64
median_household_income_2019           float64
dtype: object

In [10]:
#get count of rows
mortality_df.count()

average_annual_count                   4691
primary_state_name                     4691
cancer                                 4691
race_ethnicity                         4691
sex                                    4691
age                                    4691
age_adjusted_rate                      4691
county_state                           4691
index_of_medical_underservice_score    4069
popestimate2015                        4691
popestimate2016                        4691
popestimate2017                        4691
popestimate2018                        4691
popestimate2019                        4691
median_household_income_2015           4682
median_household_income_2016           4682
median_household_income_2017           4682
median_household_income_2018           4682
median_household_income_2019           4682
dtype: int64

In [11]:
#get count of values for sex
mortality_df["sex"].value_counts()

Males      2368
Females    2323
Name: sex, dtype: int64

In [13]:
#create column for 5-year average median income 
mortality_df["5_year_avg_income"] = (mortality_df["median_household_income_2015"]+mortality_df["median_household_income_2016"]+mortality_df["median_household_income_2017"]+mortality_df["median_household_income_2018"]+mortality_df["median_household_income_2019"])/5
mortality_df.head()

Unnamed: 0,average_annual_count,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,county_state,index_of_medical_underservice_score,popestimate2015,popestimate2016,popestimate2017,popestimate2018,popestimate2019,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019,5_year_avg_income
0,18,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,296.3,Wilcox County AL,51.0,10896,10844,10691,10599,10373,23014.0,24216.0,25700.0,25385.0,30998.0,25862.6
1,13,Alabama,All Cancer Sites,All Races (includes Hispanic),Females,All Ages,153.2,Wilcox County AL,51.0,10896,10844,10691,10599,10373,23014.0,24216.0,25700.0,25385.0,30998.0,25862.6
2,32,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,290.7,Macon County AL,28.0,19296,19060,18793,18321,18068,29522.0,30681.0,30849.0,32495.0,34281.0,31565.6
3,22,Alabama,All Cancer Sites,All Races (includes Hispanic),Females,All Ages,146.1,Macon County AL,28.0,19296,19060,18793,18321,18068,29522.0,30681.0,30849.0,32495.0,34281.0,31565.6
4,17,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,283.8,Lowndes County AL,54.0,10350,10248,10097,9974,9726,31117.0,32011.0,33130.0,33973.0,33930.0,32832.2


In [14]:
#drop rows where avg income is null
mortality_df = mortality_df[mortality_df["5_year_avg_income"].isna() == False]

In [15]:
#get count of rows
mortality_df.count()

average_annual_count                   4682
primary_state_name                     4682
cancer                                 4682
race_ethnicity                         4682
sex                                    4682
age                                    4682
age_adjusted_rate                      4682
county_state                           4682
index_of_medical_underservice_score    4066
popestimate2015                        4682
popestimate2016                        4682
popestimate2017                        4682
popestimate2018                        4682
popestimate2019                        4682
median_household_income_2015           4682
median_household_income_2016           4682
median_household_income_2017           4682
median_household_income_2018           4682
median_household_income_2019           4682
5_year_avg_income                      4682
dtype: int64

In [16]:
#create target 
y = mortality_df[['age_adjusted_rate']]

#change dtype to numeric
y['age_adjusted_rate'] = y['age_adjusted_rate'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,age_adjusted_rate
0,296.3
1,153.2
2,290.7
3,146.1
4,283.8


In [17]:
#create feautures variable holding sexes and mua score
X = mortality_df[["sex", "5_year_avg_income"]]
X.head()

Unnamed: 0,sex,5_year_avg_income
0,Males,25862.6
1,Females,25862.6
2,Males,31565.6
3,Females,31565.6
4,Males,32832.2


In [18]:
#create dummy variables for sex
X = pd.get_dummies(X, columns=["sex"])
X.head()

Unnamed: 0,5_year_avg_income,sex_Females,sex_Males
0,25862.6,0,1
1,25862.6,1,0
2,31565.6,0,1
3,31565.6,1,0
4,32832.2,0,1


In [19]:
#get shape of target array
y.shape

(4682, 1)

In [20]:
#get shape of feautures array
X.shape

(4682, 3)

In [21]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
#create linear regression model using sklearn
model = LinearRegression()

In [23]:
#fit the model with training data
regression_sex_inc_mortality = model.fit(X_train, y_train)

In [24]:
#Create predictions array using linear regression
y_pred = regression_sex_inc_mortality.predict(X_test)
y_pred

array([[201.79073008],
       [201.04000531],
       [205.9373269 ],
       ...,
       [161.42632202],
       [140.57857244],
       [191.5310887 ]])

In [25]:
#get R2 value from linear regression
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.5508560481967166

In [26]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [27]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [28]:
#view model summary with statsmodels
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      age_adjusted_rate   R-squared:                       0.560
Model:                            OLS   Adj. R-squared:                  0.559
Method:                 Least Squares   F-statistic:                     2972.
Date:                Thu, 02 Jun 2022   Prob (F-statistic):               0.00
Time:                        18:29:34   Log-Likelihood:                -22437.
No. Observations:                4682   AIC:                         4.488e+04
Df Residuals:                    4679   BIC:                         4.490e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               148.5960      1.09

In [29]:
#fit the model with training data
#decision tree regression
regressor_DT = DecisionTreeRegressor()
regressor_DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [30]:
#predict
y_pred = regressor_DT.predict(X_test)
y_pred

array([225.4, 152.3, 340.3, ..., 170.9, 156.9, 188.1])

In [31]:
#calculate r2 score
r2_score(y_test, y_pred)

0.2000404441066257