In [55]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split



In [56]:
#read in incidence csv from github

incidence_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/abramo_data_analysis/Abramo_Web_Scraping/Incidence_data.csv")                         
#get rid of commas in trend columns
incidence_df = incidence_df.replace(',','', regex=True)
incidence_df.head()                            

Unnamed: 0,County,Average Annual Count,Recent Trend,State,Cancer,Race/Ethnicity,Sex,Age,"Age-Adjusted Incidence Rate cases per 100,000",CI*Rank,Recent 5-Year Trend in Incidence Rates
0,Lowndes County 6,30,rising,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,681.2,1,11.7
1,Crenshaw County 6,80,rising,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,558.3,2,2.7
2,Greene County 6,15,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,542.5,3,1.5
3,Chambers County 6,158,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,519.5,4,0.3
4,Walker County 6,418,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,509.3,5,0.4


In [57]:
#check column names
incidence_df.columns

Index(['County', 'Average Annual Count', 'Recent Trend', 'State', 'Cancer',
       'Race/Ethnicity', 'Sex', 'Age',
       'Age-Adjusted Incidence Rate cases per 100,000', 'CI*Rank',
       'Recent 5-Year Trend in Incidence Rates'],
      dtype='object')

In [58]:
#drop unnecessary columns
incidence_df = incidence_df.drop(columns=["CI*Rank"])

In [59]:
#view updated dataframe
incidence_df.head()

Unnamed: 0,County,Average Annual Count,Recent Trend,State,Cancer,Race/Ethnicity,Sex,Age,"Age-Adjusted Incidence Rate cases per 100,000",Recent 5-Year Trend in Incidence Rates
0,Lowndes County 6,30,rising,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,681.2,11.7
1,Crenshaw County 6,80,rising,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,558.3,2.7
2,Greene County 6,15,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,542.5,1.5
3,Chambers County 6,158,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,519.5,0.3
4,Walker County 6,418,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,509.3,0.4


In [60]:
#get count of rows
incidence_df.count()

County                                           37803
Average Annual Count                             37803
Recent Trend                                     37803
State                                            37803
Cancer                                           37803
Race/Ethnicity                                   37803
Sex                                              37803
Age                                              37803
Age-Adjusted Incidence Rate cases per 100,000    37803
Recent 5-Year Trend in Incidence Rates           37803
dtype: int64

In [61]:
#drop rows that lack incidence data
#NOTE: We may look for an alternative to dropping these rows in future segments
incidence_df = incidence_df[incidence_df["Age-Adjusted Incidence Rate cases per 100,000"] != "*"]
incidence_df = incidence_df[incidence_df["Recent 5-Year Trend in Incidence Rates"] != "*"]
#count rows again
incidence_df.count()

County                                           24545
Average Annual Count                             24545
Recent Trend                                     24545
State                                            24545
Cancer                                           24545
Race/Ethnicity                                   24545
Sex                                              24545
Age                                              24545
Age-Adjusted Incidence Rate cases per 100,000    24545
Recent 5-Year Trend in Incidence Rates           24545
dtype: int64

In [62]:
#create target 
y = incidence_df[['Age-Adjusted Incidence Rate cases per 100,000']]

#change dtype to numeric
y['Age-Adjusted Incidence Rate cases per 100,000'] = y['Age-Adjusted Incidence Rate cases per 100,000'].astype(float)
y = y.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [63]:
#create feautures variable
#NOTE: Data gathering is not yet fully complete
##Final model features will also include gender, county statistics (population, MUA index, income metrics), and possibly type of cancer
X = incidence_df[["State", "Race/Ethnicity", "Recent 5-Year Trend in Incidence Rates"]]
X.head()

Unnamed: 0,State,Race/Ethnicity,Recent 5-Year Trend in Incidence Rates
0,Alabama,White (includes Hispanic),11.7
1,Alabama,White (includes Hispanic),2.7
2,Alabama,White (includes Hispanic),1.5
3,Alabama,White (includes Hispanic),0.3
4,Alabama,White (includes Hispanic),0.4


In [64]:
#change 5-year trend column to float
X['Recent 5-Year Trend in Incidence Rates'] = pd.to_numeric(X['Recent 5-Year Trend in Incidence Rates'])
X.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


State                                      object
Race/Ethnicity                             object
Recent 5-Year Trend in Incidence Rates    float64
dtype: object

In [65]:
#create dummy variables for state and race
X = pd.get_dummies(X, columns=['State','Race/Ethnicity'])
X = X.values

In [66]:
#get shape of target array
y.shape

(24545, 1)

In [67]:
#get shape of feautures array
X.shape

(24545, 54)

In [68]:
# import the regressor
from sklearn.tree import DecisionTreeRegressor 
regressor = DecisionTreeRegressor()
regressor.fit(X, y)

DecisionTreeRegressor()

In [69]:
#predict
y_pred = regressor.predict(X)

In [70]:

#calculate r2 score
r2_score(y, y_pred)

0.3356906026628729

In [71]:
#run again with split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [72]:
#fit the model with trained data
regressor2 = DecisionTreeRegressor()
regressor2.fit(X_train, y_train)

DecisionTreeRegressor()

In [73]:
#make pred with test data
y_pred_2 = regressor.predict(X_test)

In [75]:

#calculate r2 score
r2_score(y_test, y_pred_2)

0.33508153333213675