In [1]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm



In [2]:
#read in age incidence csv from github

incidence_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/abramo_data_analysis/Abramo_Web_Scraping/Incidence_data.csv")                         
#get rid of commas in trend columns
incidence_df = incidence_df.replace(',','', regex=True)
incidence_df.head()                            

Unnamed: 0,County,Average Annual Count,Recent Trend,State,Cancer,Race/Ethnicity,Sex,Age,"Age-Adjusted Incidence Rate cases per 100,000",CI*Rank,Recent 5-Year Trend in Incidence Rates
0,Lowndes County 6,30,rising,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,681.2,1,11.7
1,Crenshaw County 6,80,rising,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,558.3,2,2.7
2,Greene County 6,15,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,542.5,3,1.5
3,Chambers County 6,158,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,519.5,4,0.3
4,Walker County 6,418,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,509.3,5,0.4


In [3]:
#check column names
incidence_df.columns

Index(['County', 'Average Annual Count', 'Recent Trend', 'State', 'Cancer',
       'Race/Ethnicity', 'Sex', 'Age',
       'Age-Adjusted Incidence Rate cases per 100,000', 'CI*Rank',
       'Recent 5-Year Trend in Incidence Rates'],
      dtype='object')

In [4]:
#drop unnecessary columns
incidence_df = incidence_df.drop(columns=["CI*Rank"])

In [5]:
#view updated dataframe
incidence_df.head()

Unnamed: 0,County,Average Annual Count,Recent Trend,State,Cancer,Race/Ethnicity,Sex,Age,"Age-Adjusted Incidence Rate cases per 100,000",Recent 5-Year Trend in Incidence Rates
0,Lowndes County 6,30,rising,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,681.2,11.7
1,Crenshaw County 6,80,rising,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,558.3,2.7
2,Greene County 6,15,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,542.5,1.5
3,Chambers County 6,158,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,519.5,0.3
4,Walker County 6,418,stable,Alabama,All Cancer Sites,White (includes Hispanic),All Sexes,All Ages,509.3,0.4


In [6]:
#get count of rows
incidence_df.count()

County                                           37803
Average Annual Count                             37803
Recent Trend                                     37803
State                                            37803
Cancer                                           37803
Race/Ethnicity                                   37803
Sex                                              37803
Age                                              37803
Age-Adjusted Incidence Rate cases per 100,000    37803
Recent 5-Year Trend in Incidence Rates           37803
dtype: int64

In [7]:
#drop rows that lack incidence data
#NOTE: We may look for an alternative to dropping these rows in future segments
incidence_df = incidence_df[incidence_df["Age-Adjusted Incidence Rate cases per 100,000"] != "*"]
incidence_df = incidence_df[incidence_df["Recent 5-Year Trend in Incidence Rates"] != "*"]
#count rows again
incidence_df.count()

County                                           24545
Average Annual Count                             24545
Recent Trend                                     24545
State                                            24545
Cancer                                           24545
Race/Ethnicity                                   24545
Sex                                              24545
Age                                              24545
Age-Adjusted Incidence Rate cases per 100,000    24545
Recent 5-Year Trend in Incidence Rates           24545
dtype: int64

In [8]:
#create target 
y = incidence_df[['Age-Adjusted Incidence Rate cases per 100,000']]

#change dtype to numeric
y['Age-Adjusted Incidence Rate cases per 100,000'] = y['Age-Adjusted Incidence Rate cases per 100,000'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,"Age-Adjusted Incidence Rate cases per 100,000"
0,681.2
1,558.3
2,542.5
3,519.5
4,509.3


In [1]:
#create feautures variable
#NOTE: Data gathering is not yet fully complete
##Final model features will also include gender, county statistics (population, MUA index, income metrics), and possibly type of cancer
X = incidence_df[["State", "Race/Ethnicity", "Recent 5-Year Trend in Incidence Rates"]]
X.head()

NameError: name 'incidence_df' is not defined

In [10]:
#change 5-year trend column to float
X['Recent 5-Year Trend in Incidence Rates'] = pd.to_numeric(X['Recent 5-Year Trend in Incidence Rates'])
X.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


State                                      object
Race/Ethnicity                             object
Recent 5-Year Trend in Incidence Rates    float64
dtype: object

In [11]:
#create dummy variables for state and race
X = pd.get_dummies(X, columns=['State','Race/Ethnicity'])
X.head()

Unnamed: 0,Recent 5-Year Trend in Incidence Rates,State_Alabama,State_Arizona,State_Arkansas,State_California,State_Colorado,State_Connecticut,State_Delaware,State_District of Columbia,State_Florida,...,State_Wisconsin,State_Wyoming,Race/Ethnicity_All Races (includes Hispanic),Race/Ethnicity_Amer. Indian / AK Native (includes Hispanic),Race/Ethnicity_Asian / Pacific Islander (includes Hispanic),Race/Ethnicity_Black (includes Hispanic),Race/Ethnicity_Hispanic (any race),Race/Ethnicity_White (includes Hispanic),Race/Ethnicity_ White Hispanic,Race/Ethnicity_ White Non-Hispanic
0,11.7,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2.7,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1.5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
#get shape of target array
y.shape

(24545, 1)

In [13]:
#get shape of feautures array
X.shape

(24545, 54)

In [14]:
X.columns

Index(['Recent 5-Year Trend in Incidence Rates', 'State_Alabama',
       'State_Arizona', 'State_Arkansas', 'State_California', 'State_Colorado',
       'State_Connecticut', 'State_Delaware', 'State_District of Columbia',
       'State_Florida', 'State_Georgia', 'State_Hawaii', 'State_Idaho',
       'State_Illinois', 'State_Indiana', 'State_Iowa', 'State_Kentucky',
       'State_Maine', 'State_Maryland', 'State_Massachusetts',
       'State_Michigan', 'State_Missouri', 'State_Montana', 'State_Nebraska',
       'State_New Hampshire', 'State_New Jersey', 'State_New Mexico',
       'State_New York', 'State_North Carolina', 'State_North Dakota',
       'State_Ohio', 'State_Oklahoma', 'State_Oregon ', 'State_Pennsylvania',
       'State_Rhode Island', 'State_South Carolina', 'State_South Dakota',
       'State_Tennessee ', 'State_Texas', 'State_Utah', 'State_Vermont',
       'State_Virginia', 'State_Washington', 'State_West Virginia',
       'State_Wisconsin', 'State_Wyoming',
       'Race/

In [15]:
#create model using sklearn
model = LinearRegression()

In [16]:
#fit the model
regression_incidence = model.fit(X,y)

In [17]:
#Create predictions array
y_pred = regression_incidence.predict(X)
y_pred

array([[297.75331116],
       [395.35047913],
       [408.36366272],
       ...,
       [711.2000885 ],
       [725.29774475],
       [699.27186584]])

In [18]:
#get R2 value
from sklearn.metrics import r2_score
r2_score(y, y_pred)

0.09863299696641592

In [19]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [20]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [21]:
#view model summary with statsmodels
print(model.summary())

                                          OLS Regression Results                                         
Dep. Variable:     Age-Adjusted Incidence Rate cases per 100,000   R-squared:                       0.099
Model:                                                       OLS   Adj. R-squared:                  0.097
Method:                                            Least Squares   F-statistic:                     51.54
Date:                                           Mon, 23 May 2022   Prob (F-statistic):               0.00
Time:                                                   20:36:29   Log-Likelihood:            -1.8972e+05
No. Observations:                                          24545   AIC:                         3.796e+05
Df Residuals:                                              24492   BIC:                         3.800e+05
Df Model:                                                     52                                         
Covariance Type:                              