In [1]:
import pandas as pd
import requests

In [5]:
# Request URL for the Census SAIPE time-series endpoint, assembled from
# adjacent string literals: endpoint, requested columns, then the geography
# filter (every state).
url = (
    'https://api.census.gov/data/timeseries/poverty/saipe'
    '?get=NAME,SAEMHI_PT,SAEPOVRT0_17_PT,SAEPOVRT0_4_PT,SAEPOVRT5_17R_PT,SAEPOVRTALL_PT,YEAR'
    '&for=state:*'
)

In [6]:
# Download the data. A timeout keeps a stalled connection from hanging the
# kernel indefinitely, and raise_for_status() turns an HTTP error response
# (4xx/5xx) into an exception here instead of a confusing failure later
# when the body is parsed as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [7]:
# Display the response object; its repr includes the HTTP status code.
response

<Response [200]>

In [8]:
# The decoded JSON payload is a list of rows: the first row holds the column
# names and every following row is one record.
data = response.json()
columns = data[0]
records = data[1:]
df = pd.DataFrame(records, columns=columns)

In [27]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1479 entries, 0 to 1478
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   NAME              1479 non-null   object
 1   SAEMHI_PT         1479 non-null   object
 2   SAEPOVRT0_17_PT   1479 non-null   object
 3   SAEPOVRT0_4_PT    1479 non-null   object
 4   SAEPOVRT5_17R_PT  1479 non-null   object
 5   SAEPOVRTALL_PT    1479 non-null   object
 6   YEAR              1479 non-null   object
 7   state             1479 non-null   object
dtypes: object(8)
memory usage: 92.6+ KB


In [40]:
# Map the API's column codes to the labels used in the rest of the notebook.
# Columns not listed here (e.g. YEAR) keep their original names.
column_labels = {
    'NAME': 'STATE',
    'SAEMHI_PT': 'MEDIAN HOUSEHOLD INCOME',
    'SAEPOVRT0_17_PT': '0-17',
    'SAEPOVRT0_4_PT': '0-4',
    'SAEPOVRT5_17R_PT': '5-17',
    'SAEPOVRTALL_PT': 'ALL',
}
df.rename(columns=column_labels, inplace=True)

In [42]:
# Remove the lowercase 'state' column returned by the API; the state name is
# already available in its own column. errors='ignore' makes this cell safe
# to re-run: without it a second execution raises KeyError because the
# column has already been dropped from the frame in place.
df.drop(columns=['state'], inplace=True, errors='ignore')

In [85]:
# Show only the rows whose STATE value is exactly 'Alaska'.
df.loc[df['STATE'].eq('Alaska')]

Unnamed: 0,STATE,MEDIAN HOUSEHOLD INCOME,0-17,0-4,5-17,ALL,YEAR
1,Alaska,33885,15.4,17.7,12.9,10.6,1989
52,Alaska,39431,15.9,20.3,13.3,11.2,1993
103,Alaska,42255,13.2,15.7,11.2,10.1,1995
154,Alaska,44797,14.8,16.8,13.3,10.6,1996
205,Alaska,43657,16.2,17.3,15.1,11.2,1997
256,Alaska,47177,14.6,15.5,13.6,10.8,1998
307,Alaska,49133,11.2,12.0,10.1,8.8,1999
358,Alaska,51433,11.5,13.5,10.0,8.5,2000
409,Alaska,52332,11.5,13.5,10.0,8.7,2001
460,Alaska,51844,11.7,13.3,10.5,9.3,2002


In [44]:
# One-hot encode STATE into STATE_<name> indicator columns. drop_first=True
# omits the first category, so that state is represented by all indicator
# columns being zero.
df_dummy = pd.get_dummies(
    data=df,
    columns=['STATE'],
    drop_first=True,
)

In [72]:
# The API delivers every value as a string (see df.info() above); convert the
# two columns used as numeric model features to integers.
integer_dtypes = {'YEAR': 'int', 'MEDIAN HOUSEHOLD INCOME': 'int64'}
df_dummy = df_dummy.astype(integer_dtypes)

In [79]:
# Display the encoded frame (pandas truncates the middle rows and columns).
df_dummy

Unnamed: 0,MEDIAN HOUSEHOLD INCOME,0-17,0-4,5-17,ALL,YEAR,STATE_Alaska,STATE_Arizona,STATE_Arkansas,STATE_California,...,STATE_South Dakota,STATE_Tennessee,STATE_Texas,STATE_Utah,STATE_Vermont,STATE_Virginia,STATE_Washington,STATE_West Virginia,STATE_Wisconsin,STATE_Wyoming
0,22202,24.1,25.9,22.7,17.7,1989,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33885,15.4,17.7,12.9,10.6,1989,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28924,23.3,24.0,21.5,14.7,1989,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20729,24.4,27.6,22.3,17.9,1989,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,33474,21.3,22.9,18.7,12.7,1989,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1474,80926,13.3,14.4,12.6,10.3,2021,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1475,84155,12.0,12.7,11.2,9.9,2021,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1476,51122,21.4,24.7,20.0,16.8,2021,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1477,67150,13.6,14.3,12.9,10.8,2021,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [74]:
# Features: everything except the four rate columns, i.e. income, year and
# the one-hot state indicators.
X = df_dummy.drop(columns=['0-17', '0-4', '5-17', 'ALL'])

# Target: the 'ALL' column. It is still stored as strings at this point (only
# YEAR and MEDIAN HOUSEHOLD INCOME were cast earlier), so convert it to float
# explicitly instead of relying on scikit-learn's implicit coercion of
# object-dtype targets.
y = df_dummy['ALL'].astype('float64')

In [75]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [76]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [77]:
# Fit an ordinary least-squares model on the training split (fit() returns
# the estimator itself), then show its score on that same training data.
model = LinearRegression().fit(X_train, y_train)
model.score(X=X_train, y=y_train)

0.8668648167261908

In [78]:
# Score the fitted model on the held-out test split.
model.score(X=X_test, y=y_test)

0.84597446051763

In [90]:
def predict(year, income, state, X = X, baseline_state = None):
    """Predict the model target for one state, year and median household income.

    One existing row of ``X`` belonging to ``state`` is copied and used as a
    template, so its one-hot ``STATE_*`` columns are already set correctly.
    The template's 'YEAR' and 'MEDIAN HOUSEHOLD INCOME' values are then
    overwritten with the arguments and the row is passed to the module-level
    ``model``.

    Parameters
    ----------
    year : value written to the 'YEAR' column.
    income : value written to the 'MEDIAN HOUSEHOLD INCOME' column.
    state : state name as it appears after the 'STATE_' prefix in ``X``.
    X : feature frame to take the template row from. The default is bound
        when this function is defined, so re-run this cell after rebuilding
        ``X``.
    baseline_state : optional name of the state that has no ``STATE_*``
        column because the encoding was built with ``drop_first=True``. When
        ``state`` equals this name, a row whose ``STATE_*`` columns are all
        zero is used as the template.

    Returns
    -------
    The array returned by ``model.predict`` for the single constructed row.

    Raises
    ------
    KeyError
        If ``X`` has no ``STATE_<state>`` column and ``state`` is not
        ``baseline_state``.
    ValueError
        If no row of ``X`` matches the requested state.
    """
    state_col = f'STATE_{state}'
    if state_col in X.columns:
        matches = X[state_col] == 1
    elif baseline_state is not None and state == baseline_state:
        # The dropped category is encoded as "every indicator column is 0".
        indicator_cols = [c for c in X.columns if str(c).startswith('STATE_')]
        matches = (X[indicator_cols] == 0).all(axis=1)
    else:
        raise KeyError(
            f"{state_col} is not a column of X; if {state!r} is the category "
            "dropped by drop_first=True, pass baseline_state=" + repr(state)
        )

    # .copy() so the assignments below modify an independent frame rather
    # than a slice of X (avoids SettingWithCopyWarning).
    newdf = X.loc[matches].head(1).copy()
    if newdf.empty:
        raise ValueError(f"X contains no rows for state {state!r}")

    newdf['YEAR'] = year
    newdf['MEDIAN HOUSEHOLD INCOME'] = income

    return model.predict(newdf)


# Example: prediction for Alaska in 2021 at a median household income of 78437.
print(predict(year=2021, income=78437, state='Alaska'))

[10.34320061]
