In [1]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
import numpy as np

In [2]:
#read in incidence csv from github

mortality_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/main/Resources/Sex_Mortality_Cancer_DB.csv")                         
#get rid of commas in columns
mortality_df = mortality_df.replace(',','', regex=True)
mortality_df.head()     

Unnamed: 0,average_annual_count,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,county_state,index_of_medical_underservice_score,popestimate2015,popestimate2016,popestimate2017,popestimate2018,popestimate2019
0,18,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,296.3,Wilcox County AL,51.0,10896,10844,10691,10599,10373
1,13,Alabama,All Cancer Sites,All Races (includes Hispanic),Females,All Ages,153.2,Wilcox County AL,51.0,10896,10844,10691,10599,10373
2,32,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,290.7,Macon County AL,28.0,19296,19060,18793,18321,18068
3,22,Alabama,All Cancer Sites,All Races (includes Hispanic),Females,All Ages,146.1,Macon County AL,28.0,19296,19060,18793,18321,18068
4,17,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,283.8,Lowndes County AL,54.0,10350,10248,10097,9974,9726


In [3]:
#check dtypes
mortality_df.dtypes

average_annual_count                     int64
primary_state_name                      object
cancer                                  object
race_ethnicity                          object
sex                                     object
age                                     object
age_adjusted_rate                      float64
county_state                            object
index_of_medical_underservice_score    float64
popestimate2015                          int64
popestimate2016                          int64
popestimate2017                          int64
popestimate2018                          int64
popestimate2019                          int64
dtype: object

In [4]:
#get count of rows
mortality_df.count()

average_annual_count                   4691
primary_state_name                     4691
cancer                                 4691
race_ethnicity                         4691
sex                                    4691
age                                    4691
age_adjusted_rate                      4691
county_state                           4691
index_of_medical_underservice_score    4069
popestimate2015                        4691
popestimate2016                        4691
popestimate2017                        4691
popestimate2018                        4691
popestimate2019                        4691
dtype: int64

In [5]:
#get count of values for sex
mortality_df["sex"].value_counts()

Males      2368
Females    2323
Name: sex, dtype: int64

In [6]:
#get count of NA values for MUA
mortality_df["index_of_medical_underservice_score"].isna().sum()

622

In [7]:
#drop rows where MUA index is null
mortality_df = mortality_df[mortality_df["index_of_medical_underservice_score"].isna() == False]

In [8]:
#get count of rows
mortality_df.count()

average_annual_count                   4069
primary_state_name                     4069
cancer                                 4069
race_ethnicity                         4069
sex                                    4069
age                                    4069
age_adjusted_rate                      4069
county_state                           4069
index_of_medical_underservice_score    4069
popestimate2015                        4069
popestimate2016                        4069
popestimate2017                        4069
popestimate2018                        4069
popestimate2019                        4069
dtype: int64

In [9]:
#create target 
y = mortality_df[['age_adjusted_rate']]

#change dtype to numeric
y['age_adjusted_rate'] = y['age_adjusted_rate'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,age_adjusted_rate
0,296.3
1,153.2
2,290.7
3,146.1
4,283.8


In [10]:
#create feautures variable holding sexes and mua score
X = mortality_df[["sex", "index_of_medical_underservice_score"]]
X.head()

Unnamed: 0,sex,index_of_medical_underservice_score
0,Males,51.0
1,Females,51.0
2,Males,28.0
3,Females,28.0
4,Males,54.0


In [11]:
#create dummy variables for sex
X = pd.get_dummies(X, columns=["sex"])
X.head()

Unnamed: 0,index_of_medical_underservice_score,sex_Females,sex_Males
0,51.0,0,1
1,51.0,1,0
2,28.0,0,1
3,28.0,1,0
4,54.0,0,1


In [12]:
#get shape of target array
y.shape

(4069, 1)

In [13]:
#get shape of feautures array
X.shape

(4069, 3)

In [14]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [15]:
#create linear regression model using sklearn
model = LinearRegression()

In [16]:
#fit the model with training data
regression_sex_MUA_mortality = model.fit(X_train, y_train)

In [17]:
#Create predictions array using linear regression
y_pred = regression_sex_MUA_mortality.predict(X_test)
y_pred

array([[201.67554289],
       [202.30892301],
       [141.00868813],
       ...,
       [141.24620568],
       [141.1274469 ],
       [141.56289574]])

In [18]:
#get R2 value from linear regression
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.4171174237346946

In [19]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [20]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [21]:
#view model summary with statsmodels
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      age_adjusted_rate   R-squared:                       0.463
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:                     1756.
Date:                Sun, 29 May 2022   Prob (F-statistic):               0.00
Time:                        22:15:08   Log-Likelihood:                -19945.
No. Observations:                4069   AIC:                         3.990e+04
Df Residuals:                    4066   BIC:                         3.991e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
co

In [22]:
#fit the model with training data
#decision tree regression
regressor_DT = DecisionTreeRegressor()
regressor_DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [23]:
#predict
y_pred = regressor_DT.predict(X_test)
y_pred

array([201.15140187, 194.72926829, 145.68531469, ..., 139.83603604,
       132.83      , 140.29074074])

In [24]:
#calculate r2 score
r2_score(y_test, y_pred)

0.40630627592980884