In [2]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split




In [3]:
#read in mortality csv from github

mortality_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/abramo_data_analysis/Abramo_Web_Scraping/Mortality_data.csv")                         
#get rid of commas in trend columns
mortality_df = mortality_df.replace(',','', regex=True)
mortality_df.head()                            

Unnamed: 0,County,Average Annual Count,Recent Trend,State,Cancer,Race/Ethnicity,Sex,Age,"Age-Adjusted Death Rate cases per 100,000",CI*Rank,Recent 5-Year Trend in Death Rates
0,Walker County,181,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,213.2,1,-0.3
1,Crenshaw County,32,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,211.2,2,0.0
2,Russell County,78,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,201.7,3,-0.6
3,Escambia County,71,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,199.5,4,-0.5
4,Jackson County,139,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,199.4,5,-0.5


In [4]:
#check column names
mortality_df.columns

Index(['County', 'Average Annual Count', 'Recent Trend', 'State', 'Cancer',
       'Race/Ethnicity', 'Sex', 'Age',
       'Age-Adjusted Death Rate cases per 100,000', 'CI*Rank',
       'Recent 5-Year Trend in Death Rates'],
      dtype='object')

In [5]:
#drop unnecessary columns
mortality_df = mortality_df.drop(columns=["CI*Rank"])

In [6]:
#view updated dataframe
mortality_df.head()

Unnamed: 0,County,Average Annual Count,Recent Trend,State,Cancer,Race/Ethnicity,Sex,Age,"Age-Adjusted Death Rate cases per 100,000",Recent 5-Year Trend in Death Rates
0,Walker County,181,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,213.2,-0.3
1,Crenshaw County,32,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,211.2,0.0
2,Russell County,78,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,201.7,-0.6
3,Escambia County,71,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,199.5,-0.5
4,Jackson County,139,falling,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,199.4,-0.5


In [7]:
#get count of rows
mortality_df.count()

County                                       39595
Average Annual Count                         39595
Recent Trend                                 39595
State                                        39595
Cancer                                       39595
Race/Ethnicity                               39595
Sex                                          39595
Age                                          39595
Age-Adjusted Death Rate cases per 100,000    39595
Recent 5-Year Trend in Death Rates           39595
dtype: int64

In [8]:
#drop rows that lack mortality data
#NOTE: We may look for an alternative to dropping these rows in future segments
mortality_df = mortality_df[mortality_df["Age-Adjusted Death Rate cases per 100,000"] != "*"]
mortality_df = mortality_df[mortality_df["Recent 5-Year Trend in Death Rates"] != "*"]
#count rows again
mortality_df.count()

County                                       23282
Average Annual Count                         23282
Recent Trend                                 23282
State                                        23282
Cancer                                       23282
Race/Ethnicity                               23282
Sex                                          23282
Age                                          23282
Age-Adjusted Death Rate cases per 100,000    23282
Recent 5-Year Trend in Death Rates           23282
dtype: int64

In [9]:
#create target 
y = mortality_df[["Age-Adjusted Death Rate cases per 100,000"]]

#change dtype to numeric
y["Age-Adjusted Death Rate cases per 100,000"] = y["Age-Adjusted Death Rate cases per 100,000"].astype(float)
y = y.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [10]:
#create feautures variable
#NOTE: Data gathering is not yet fully complete
##Final model features will also include gender, county statistics (population, MUA index, income metrics), and possibly type of cancer
X = mortality_df[["State", "Race/Ethnicity", "Recent 5-Year Trend in Death Rates"]]


In [11]:
#change 5-year trend column to float
X['Recent 5-Year Trend in Death Rates'] = pd.to_numeric(X['Recent 5-Year Trend in Death Rates'])
X.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


State                                  object
Race/Ethnicity                         object
Recent 5-Year Trend in Death Rates    float64
dtype: object

In [12]:
#create dummy variables for state and race
X = pd.get_dummies(X, columns=['State','Race/Ethnicity'])
X.head()

Unnamed: 0,Recent 5-Year Trend in Death Rates,State_Alabama,State_Arizona,State_Arkansas,State_California,State_Colorado,State_Connecticut,State_Delaware,State_District of Columbia,State_Florida,...,State_Wisconsin,State_Wyoming,Race/Ethnicity_All Races (includes Hispanic),Race/Ethnicity_Amer. Indian / AK Native (includes Hispanic),Race/Ethnicity_Asian / Pacific Islander (includes Hispanic),Race/Ethnicity_Black (includes Hispanic),Race/Ethnicity_Hispanic (any race),Race/Ethnicity_White (includes Hispanic),Race/Ethnicity_ White Hispanic,Race/Ethnicity_ White Non-Hispanic
0,-0.3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.6,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,-0.5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-0.5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [13]:
#get shape of target array
y.shape

(23282, 1)

In [14]:
#get shape of feautures array
X = X.values

In [15]:
# import the regressor
from sklearn.tree import DecisionTreeRegressor 
regressor = DecisionTreeRegressor()
regressor.fit(X, y)

DecisionTreeRegressor()

In [16]:
y_pred = regressor.predict(X)

In [17]:
r2_score(y, y_pred)

0.2914609622429646

In [18]:
#run again with split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
#fit the model with trained data
regressor2 = DecisionTreeRegressor()
regressor2.fit(X_train, y_train)

DecisionTreeRegressor()

In [20]:
#make pred with test data
y_pred_2 = regressor.predict(X_test)

In [21]:

#calculate r2 score
r2_score(y_test, y_pred_2)

0.29004957749103966