In [1]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm



In [2]:
#read in mortality csv from github

mortality_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/abramo_data_analysis/Abramo_Web_Scraping/Mortality_data.csv")                         
#get rid of commas in trend columns
mortality_df = mortality_df.replace(',','', regex=True)
mortality_df.head()                            

Unnamed: 0,County,Average Annual Count,Recent Trend,State,Cancer,Race/Ethnicity,Sex,Age,"Age-Adjusted Death Rate cases per 100,000",CI*Rank,Recent 5-Year Trend in Death Rates
0,Walker County,181,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,213.2,1,-0.3
1,Crenshaw County,32,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,211.2,2,0.0
2,Russell County,78,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,201.7,3,-0.6
3,Escambia County,71,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,199.5,4,-0.5
4,Jackson County,139,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,199.4,5,-0.5


In [3]:
#check column names
mortality_df.columns

Index(['County', 'Average Annual Count', 'Recent Trend', 'State', 'Cancer',
       'Race/Ethnicity', 'Sex', 'Age',
       'Age-Adjusted Death Rate cases per 100,000', 'CI*Rank',
       'Recent 5-Year Trend in Death Rates'],
      dtype='object')

In [6]:
#drop unnecessary columns
mortality_df = mortality_df.drop(columns=["CI*Rank"])

In [7]:
#view updated dataframe
mortality_df.head()

Unnamed: 0,County,Average Annual Count,Recent Trend,State,Cancer,Race/Ethnicity,Sex,Age,"Age-Adjusted Death Rate cases per 100,000",Recent 5-Year Trend in Death Rates
0,Walker County,181,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,213.2,-0.3
1,Crenshaw County,32,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,211.2,0.0
2,Russell County,78,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,201.7,-0.6
3,Escambia County,71,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,199.5,-0.5
4,Jackson County,139,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,199.4,-0.5


In [8]:
#get count of rows
mortality_df.count()

County                                       39595
Average Annual Count                         39595
Recent Trend                                 39595
State                                        39595
Cancer                                       39595
Race/Ethnicity                               39595
Sex                                          39595
Age                                          39595
Age-Adjusted Death Rate cases per 100,000    39595
Recent 5-Year Trend in Death Rates           39595
dtype: int64

In [10]:
#drop rows that lack mortality data
#NOTE: We may look for an alternative to dropping these rows in future segments
mortality_df = mortality_df[mortality_df["Age-Adjusted Death Rate cases per 100,000"] != "*"]
mortality_df = mortality_df[mortality_df["Recent 5-Year Trend in Death Rates"] != "*"]
#count rows again
mortality_df.count()

County                                       23282
Average Annual Count                         23282
Recent Trend                                 23282
State                                        23282
Cancer                                       23282
Race/Ethnicity                               23282
Sex                                          23282
Age                                          23282
Age-Adjusted Death Rate cases per 100,000    23282
Recent 5-Year Trend in Death Rates           23282
dtype: int64

In [11]:
#create target 
y = mortality_df[["Age-Adjusted Death Rate cases per 100,000"]]

#change dtype to numeric
y["Age-Adjusted Death Rate cases per 100,000"] = y["Age-Adjusted Death Rate cases per 100,000"].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,"Age-Adjusted Death Rate cases per 100,000"
0,213.2
1,211.2
2,201.7
3,199.5
4,199.4


In [15]:
#create feautures variable
#NOTE: Data gathering is not yet fully complete
##Final model features will also include gender, county statistics (population, MUA index, income metrics), and possibly type of cancer
X = mortality_df[["State", "Race/Ethnicity", "Recent 5-Year Trend in Death Rates"]]
X.head()

Unnamed: 0,State,Race/Ethnicity,Recent 5-Year Trend in Death Rates
0,Alabama,White (includes Hispanic),-0.3
1,Alabama,White (includes Hispanic),0.0
2,Alabama,White (includes Hispanic),-0.6
3,Alabama,White (includes Hispanic),-0.5
4,Alabama,White (includes Hispanic),-0.5


In [16]:
#change 5-year trend column to float
X['Recent 5-Year Trend in Death Rates'] = pd.to_numeric(X['Recent 5-Year Trend in Death Rates'])
X.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


State                                  object
Race/Ethnicity                         object
Recent 5-Year Trend in Death Rates    float64
dtype: object

In [17]:
#create dummy variables for state and race
X = pd.get_dummies(X, columns=['State','Race/Ethnicity'])
X.head()

Unnamed: 0,Recent 5-Year Trend in Death Rates,State_Alabama,State_Arizona,State_Arkansas,State_California,State_Colorado,State_Connecticut,State_Delaware,State_District of Columbia,State_Florida,...,State_Wisconsin,State_Wyoming,Race/Ethnicity_All Races (includes Hispanic),Race/Ethnicity_Amer. Indian / AK Native (includes Hispanic),Race/Ethnicity_Asian / Pacific Islander (includes Hispanic),Race/Ethnicity_Black (includes Hispanic),Race/Ethnicity_Hispanic (any race),Race/Ethnicity_White (includes Hispanic),Race/Ethnicity_ White Hispanic,Race/Ethnicity_ White Non-Hispanic
0,-0.3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.6,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,-0.5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-0.5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [18]:
#get shape of target array
y.shape

(23282, 1)

In [19]:
#get shape of feautures array
X.shape

(23282, 58)

In [20]:
X.columns

Index(['Recent 5-Year Trend in Death Rates', 'State_Alabama', 'State_Arizona',
       'State_Arkansas', 'State_California', 'State_Colorado',
       'State_Connecticut', 'State_Delaware', 'State_District of Columbia',
       'State_Florida', 'State_Georgia', 'State_Hawaii ', 'State_Idaho',
       'State_Illinois', 'State_Indiana', 'State_Iowa ', 'State_Kansas ',
       'State_Kentucky', 'State_Maine', 'State_Maryland',
       'State_Massachusetts', 'State_Michigan', 'State_Minnesota ',
       'State_Mississippi', 'State_Missouri', 'State_Montana',
       'State_Nebraska', 'State_Nevada ', 'State_New Hampshire',
       'State_New Jersey', 'State_New Mexico', 'State_New York',
       'State_North Carolina', 'State_North Dakota', 'State_Ohio ',
       'State_Oklahoma', 'State_Oregon ', 'State_Pennsylvania',
       'State_Rhode Island', 'State_South Carolina', 'State_South Dakota',
       'State_Tennessee ', 'State_Texas', 'State_Utah', 'State_Vermont',
       'State_Virginia', 'State_Wash

In [24]:
#create model using sklearn
model = LinearRegression()

In [25]:
#fit the model
regression_mortality = model.fit(X,y)

In [26]:
#Create predictions array
y_pred = regression_mortality.predict(X)
y_pred

array([[174.0390625 ],
       [177.83984375],
       [170.2421875 ],
       ...,
       [517.8828125 ],
       [282.34570312],
       [287.41015625]])

In [27]:
#get R2 value
from sklearn.metrics import r2_score
r2_score(y, y_pred)

0.11471862551965717

In [28]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [30]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [31]:
#view model summary with statsmodels
print(model.summary())

                                        OLS Regression Results                                       
Dep. Variable:     Age-Adjusted Death Rate cases per 100,000   R-squared:                       0.115
Model:                                                   OLS   Adj. R-squared:                  0.113
Method:                                        Least Squares   F-statistic:                     53.74
Date:                                       Fri, 20 May 2022   Prob (F-statistic):               0.00
Time:                                               13:18:08   Log-Likelihood:            -1.6361e+05
No. Observations:                                      23282   AIC:                         3.273e+05
Df Residuals:                                          23225   BIC:                         3.278e+05
Df Model:                                                 56                                         
Covariance Type:                                   nonrobust                      