In [2]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression



In [3]:
#read in all-cancers, CA, incicdence csv
incidence_CA_df = pd.read_csv("Resources/incidence_CA.csv")                           
incidence_CA_df                            

Unnamed: 0,County,FIPS,Met Healthy People Objective of ***?,"Age-Adjusted Incidence Rate([rate note]) - cases per 100,000",Lower 95% Confidence Interval,Upper 95% Confidence Interval,CI*Rank([rank note]),Lower CI (CI*Rank),Upper CI (CI*Rank),Average Annual Count,Recent Trend,Recent 5-Year Trend ([trend note]) in Incidence Rates,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1
0,California(7),6000,***,398.4,394.9,401.8,,34,38,10786,falling,-2.9,-3.3,-2.6
1,US (SEER+NPCR)(1),0,***,444.9,444,445.9,,,,187408,falling,-1.5,-1.7,-1.3
2,San Francisco County(7),6075,***,508.8,484.2,534.4,1,1,5,336,falling,-1.1,-1.4,-0.8
3,Shasta County(7),6089,***,497.9,368.1,656.1,2,1,34,11,stable,-2.0,-5.2,1.4
4,San Joaquin County(7),6077,***,450.2,424.8,476.7,3,2,12,263,falling,-1.1,-1.9,-0.4
5,Marin County(7),6041,***,438.6,372.5,512.8,4,1,28,35,falling,-2.2,-3.5,-0.9
6,Siskiyou County(7),6093,***,437.8,262.6,682.4,5,1,39,4,stable,-0.2,-4.9,4.7
7,Solano County(7),6095,***,437.3,415.8,459.7,6,2,14,336,falling,-1.3,-2.1,-0.5
8,Alameda County(7),6001,***,422.7,410.5,435.2,7,4,15,966,falling,-2.6,-3.4,-1.7
9,Contra Costa County(7),6013,***,421.6,404.7,439,8,3,17,501,falling,-3.4,-5.4,-1.5


In [4]:
#get count of rows for incidence csv
incidence_CA_df.count()

County                                                          60
 FIPS                                                           60
Met Healthy People Objective of ***?                            60
Age-Adjusted Incidence Rate([rate note]) - cases per 100,000    60
Lower 95% Confidence Interval                                   60
Upper 95% Confidence Interval                                   60
CI*Rank([rank note])                                            60
Lower CI (CI*Rank)                                              60
Upper CI (CI*Rank)                                              60
Average Annual Count                                            60
Recent Trend                                                    60
Recent 5-Year Trend ([trend note]) in Incidence Rates           60
Lower 95% Confidence Interval.1                                 60
Upper 95% Confidence Interval.1                                 60
dtype: int64

In [5]:
#get column names
incidence_CA_df.columns

Index(['County', ' FIPS', 'Met Healthy People Objective of ***?',
       'Age-Adjusted Incidence Rate([rate note]) - cases per 100,000',
       'Lower 95% Confidence Interval', 'Upper 95% Confidence Interval',
       'CI*Rank([rank note])', 'Lower CI (CI*Rank)', 'Upper CI (CI*Rank)',
       'Average Annual Count', 'Recent Trend',
       'Recent 5-Year Trend ([trend note]) in Incidence Rates',
       'Lower 95% Confidence Interval.1', 'Upper 95% Confidence Interval.1'],
      dtype='object')

In [6]:
#preprocessing
#drop unwanted columns
columns_to_keep = ["County", "Age-Adjusted Incidence Rate([rate note]) - cases per 100,000","Average Annual Count", "Recent 5-Year Trend ([trend note]) in Incidence Rates"]
columns_to_keep
incidence_CA_df = incidence_CA_df[columns_to_keep]
#drop columns where age adjusted rate isn't provided
incidence_CA_df = incidence_CA_df[incidence_CA_df["Age-Adjusted Incidence Rate([rate note]) - cases per 100,000"] != "* "]
incidence_CA_df

Unnamed: 0,County,"Age-Adjusted Incidence Rate([rate note]) - cases per 100,000",Average Annual Count,Recent 5-Year Trend ([trend note]) in Incidence Rates
0,California(7),398.4,10786,-2.9
1,US (SEER+NPCR)(1),444.9,187408,-1.5
2,San Francisco County(7),508.8,336,-1.1
3,Shasta County(7),497.9,11,-2.0
4,San Joaquin County(7),450.2,263,-1.1
5,Marin County(7),438.6,35,-2.2
6,Siskiyou County(7),437.8,4,-0.2
7,Solano County(7),437.3,336,-1.3
8,Alameda County(7),422.7,966,-2.6
9,Contra Costa County(7),421.6,501,-3.4


In [7]:
# create target for first regression (predicting incidence)
y_incidence = incidence_CA_df['Age-Adjusted Incidence Rate([rate note]) - cases per 100,000'].values
y_incidence = pd.to_numeric(y_incidence)
y_incidence

array([398.4, 444.9, 508.8, 497.9, 450.2, 438.6, 437.8, 437.3, 422.7,
       421.6, 416.3, 414.1, 412.6, 411.8, 406.6, 404.4, 396.2, 394.2,
       385.9, 385.1, 381.5, 375.7, 373.5, 370.3, 368.2, 364.1, 357.8,
       356.5, 353.1, 345.2, 344.2, 340.6, 324.7, 323.1, 308.1, 298.2,
       287. , 277.4, 272.2, 270. , 244.3])

In [8]:
#get shape of target array
y_incidence.shape


(41,)

In [9]:
#create features array
X = incidence_CA_df[["Average Annual Count","Recent 5-Year Trend ([trend note]) in Incidence Rates"]].values
#change each row of X to numeric dtype
for i in range(0, len(X)):
    X[i] = pd.to_numeric(X[i])
X

array([[10786.0, -2.9],
       [187408.0, -1.5],
       [336.0, -1.1],
       [11.0, -2.0],
       [263.0, -1.1],
       [35.0, -2.2],
       [4.0, -0.2],
       [336.0, -1.3],
       [966.0, -2.6],
       [501.0, -3.4],
       [201.0, -1.5],
       [662.0, -2.7],
       [7.0, -1.6],
       [226.0, -0.7],
       [71.0, -0.5],
       [764.0, -4.5],
       [172.0, -1.7],
       [4145.0, -3.2],
       [22.0, -1.1],
       [661.0, -2.4],
       [79.0, -1.2],
       [9.0, -2.6],
       [41.0, -1.7],
       [38.0, -1.1],
       [612.0, -1.1],
       [10.0, -0.7],
       [109.0, -5.7],
       [15.0, -4.2],
       [13.0, -4.2],
       [28.0, -0.9],
       [217.0, -2.0],
       [55.0, -3.2],
       [34.0, -2.4],
       [17.0, -3.3],
       [18.0, -3.0],
       [23.0, -2.7],
       [8.0, -2.5],
       [22.0, -0.7],
       [12.0, -3.4],
       [12.0, -2.4],
       [6.0, -6.1]], dtype=object)

In [10]:
#get shape of feautures array
X.shape

(41, 2)

In [11]:
#create instance of linear regression model
model = LinearRegression()

In [12]:
#fit the incidence model
regression_incidence = model.fit(X,y_incidence)

In [13]:
#calculate coefficients
coefficients = regression_incidence.coef_
coefficients



array([3.26863873e-04, 1.66234384e+01])

In [14]:
#print coeffient interpretation for incidence count
print("The model predicts that a 1 unit increase in average annual incidence count increases age adjusted incidence rate by " + str(coefficients[0]) )

The model predicts that a 1 unit increase in average annual incidence count increases age adjusted incidence rate by 0.0003268638732126464


In [15]:
#print coeffient interpretation for incidence count
print("The model predicts that a 1 unit increase in the 5-year trend incidence rates increases age adjusted incidence rate by " + str(coefficients[1]) )

The model predicts that a 1 unit increase in the 5-year trend incidence rates increases age adjusted incidence rate by 16.6234383968304


In [16]:
#calculate intercept
regression_incidence.intercept_

412.2869990682226

In [17]:
#get R2 value
r2 = model.score(X, y_incidence)
print("This linear regression model produces an r-squared value of " + str(r2))

This linear regression model produces an r-squared value of 0.17582262496981693


In [18]:
#predict incidence measurement based on selected values for features
regression_incidence.predict([[502, 2]])





array([445.69796153])

In [20]:
#Create predictions array
y_incidence_predict = regression_incidence.predict(X)

In [21]:
y_incidence_predict

array([367.60458145, 448.60874622, 394.11104309, 379.04371778,
       394.08718203, 375.72687483, 408.96361884, 390.78635541,
       369.38180974, 355.93106732, 387.41754111, 367.62009928,
       385.69178568, 400.72446343, 403.9984872 , 337.73125028,
       384.08337438, 360.44684695, 394.00840784, 372.60680394,
       392.36469524, 369.06900101, 384.04055521, 394.01363766,
       394.20125752, 400.65386083, 317.56902837, 342.47346076,
       342.47280703, 397.3350567 , 379.11105174, 359.10997371,
       372.40186029, 357.43520904, 362.42256743, 367.41123327,
       370.73101799, 400.6577832 , 355.77123089, 372.39466928,
       310.88598603])

In [23]:
#get r2 value
from sklearn.metrics import r2_score
r2_score(y_incidence, y_incidence_predict)

0.17582262496981693