In [35]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
import numpy as np

In [4]:
#read in incidence csv from github

incidence_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/main/Resources/Sex_Incidence_Cancer_DB.csv")                         
#get rid of commas in columns
incidence_df = incidence_df.replace(',','', regex=True)
incidence_df.head()     

Unnamed: 0,average_annual_count,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,county_state,index_of_medical_underservice_score,popestimate2015,popestimate2016,popestimate2017,popestimate2018,popestimate2019
0,47,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,730.1,Lowndes County AL,54.0,10350,10248,10097,9974,9726
1,35,Alabama,All Cancer Sites,All Races (includes Hispanic),Females,All Ages,504.8,Lowndes County AL,54.0,10350,10248,10097,9974,9726
2,129,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,607.4,Chambers County AL,59.5,33996,33745,33707,33600,33254
3,107,Alabama,All Cancer Sites,All Races (includes Hispanic),Females,All Ages,437.8,Chambers County AL,59.5,33996,33745,33707,33600,33254
4,256,Alabama,All Cancer Sites,All Races (includes Hispanic),Males,All Ages,591.1,Elmore County AL,48.0,80872,81204,81422,81011,81209


In [5]:
#check dtypes
incidence_df.dtypes

average_annual_count                     int64
primary_state_name                      object
cancer                                  object
race_ethnicity                          object
sex                                     object
age                                     object
age_adjusted_rate                       object
county_state                            object
index_of_medical_underservice_score    float64
popestimate2015                          int64
popestimate2016                          int64
popestimate2017                          int64
popestimate2018                          int64
popestimate2019                          int64
dtype: object

In [6]:
#get count of rows
incidence_df.count()

average_annual_count                   5243
primary_state_name                     5243
cancer                                 5243
race_ethnicity                         5243
sex                                    5243
age                                    5243
age_adjusted_rate                      5243
county_state                           5243
index_of_medical_underservice_score    4513
popestimate2015                        5243
popestimate2016                        5243
popestimate2017                        5243
popestimate2018                        5243
popestimate2019                        5243
dtype: int64

In [7]:
#get count of values for sex
incidence_df["sex"].value_counts()

Males      2626
Females    2617
Name: sex, dtype: int64

In [8]:
#get count of NA values for MUA
incidence_df["index_of_medical_underservice_score"].isna().sum()

730

In [9]:
#drop rows where MUA index is null
incidence_df = incidence_df[incidence_df["index_of_medical_underservice_score"].isna() == False]

In [10]:
#get count of rows
incidence_df.count()

average_annual_count                   4513
primary_state_name                     4513
cancer                                 4513
race_ethnicity                         4513
sex                                    4513
age                                    4513
age_adjusted_rate                      4513
county_state                           4513
index_of_medical_underservice_score    4513
popestimate2015                        4513
popestimate2016                        4513
popestimate2017                        4513
popestimate2018                        4513
popestimate2019                        4513
dtype: int64

In [11]:
#create target 
y = incidence_df[['age_adjusted_rate']]

#change dtype to numeric
y['age_adjusted_rate'] = y['age_adjusted_rate'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,age_adjusted_rate
0,730.1
1,504.8
2,607.4
3,437.8
4,591.1


In [12]:
#create feautures variable holding sex and mua index
X = incidence_df[["sex", "index_of_medical_underservice_score"]]
X.head()

Unnamed: 0,sex,index_of_medical_underservice_score
0,Males,54.0
1,Females,54.0
2,Males,59.5
3,Females,59.5
4,Males,48.0


In [13]:
#create dummy variables for age
X = pd.get_dummies(X, columns=["sex"])
X.head()

Unnamed: 0,index_of_medical_underservice_score,sex_Females,sex_Males
0,54.0,0,1
1,54.0,1,0
2,59.5,0,1
3,59.5,1,0
4,48.0,0,1


In [14]:
#get shape of target array
y.shape

(4513, 1)

In [15]:
#get shape of feautures array
X.shape

(4513, 3)

In [16]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [17]:
#create linear regression model using sklearn
model = LinearRegression()

In [18]:
#fit the model with training data
regression_sex_MUA_incidence = model.fit(X_train, y_train)

In [20]:
#Create predictions array using linear regression
y_pred = regression_sex_MUA_incidence.predict(X_test)
y_pred

array([[495.85129328],
       [496.0556975 ],
       [495.86018042],
       ...,
       [495.14328446],
       [495.90362866],
       [495.01293974]])

In [21]:
#get R2 value from linear regression
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.22011566863435184

In [26]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [27]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [28]:
#view model summary with statsmodels
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      age_adjusted_rate   R-squared:                       0.233
Model:                            OLS   Adj. R-squared:                  0.232
Method:                 Least Squares   F-statistic:                     683.7
Date:                Sun, 29 May 2022   Prob (F-statistic):          4.62e-260
Time:                        14:56:16   Log-Likelihood:                -25392.
No. Observations:                4513   AIC:                         5.079e+04
Df Residuals:                    4510   BIC:                         5.081e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
co

In [29]:
#fit the model with training data
#decision tree regression
regressor_DT = DecisionTreeRegressor()
regressor_DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [30]:
#predict
y_pred = regressor_DT.predict(X_test)
y_pred

array([477.8       , 485.0765625 , 475.96935484, ..., 583.04285714,
       555.2       , 495.86      ])

In [31]:
#calculate r2 score
r2_score(y_test, y_pred)

0.1413983718580092