In [2]:
pathname='/Users/rishigummakonda/Documents/Springboard/capstone/data/census_tract/from_atlas/'
nyc = pd.read_csv(pathname+'nyc_tracts.csv')
la = pd.read_csv(pathname+'la_tracts.csv')
den = pd.read_csv(pathname+'denver_tracts.csv')
nash=pd.read_csv(pathname+'nashville_tracts.csv')
atlas = pd.read_csv('/Users/rishigummakonda/Documents/Springboard/capstone/data/atlas_data/atlas.csv')

In [3]:
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
import seaborn as sns
from statsmodels.graphics.regressionplots import *
import statsmodels.api as sm
from statsmodels.formula.api import ols

# special matplotlib argument for improved plots
from matplotlib import rcParams
sns.set_style("whitegrid")
sns.set_context("paper")
%matplotlib inline

In [7]:
from sklearn.linear_model import LinearRegression as lr
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split as tts

The codes in the 'tract' column within each dataset contain the state code, county, and tract code. The function below splits it up and creates a new dataframe. 

In [5]:
def split_codes(full_codes):
    state_codes=[]
    county_codes=[]
    tract_codes=[]
    for code in full_codes:
        code=str(code)
        state=int(code[:2])
        county=int(code[2:5])
        tract=int(code[5:11])
        state_codes.append(state)
        county_codes.append(county)
        tract_codes.append(tract)
    dict_codes = {'full_codes': full_codes, 'state':state_codes, 'county': county_codes, 'tract': tract_codes}
    df= pd.DataFrame(dict_codes)
    return df

The function below filters through the atlas dataset, finding the relevant tracts.

In [6]:
def filter_atlas(city_df):
    list_of_tracts=[]
    for index, row in city_df.iterrows():
        state_f = atlas[atlas['state'] == row['state']]
        county_f = state_f[state_f['county'] == row['county']]
        tract = county_f[county_f['tract'] == row['tract']] 
        list_of_tracts.append(tract)
    df = pd.concat(list_of_tracts,ignore_index=True)
    return df

Each dataset contains tracts which are not within the city lines. The next few code blocks are filtering those cities out and then splitting the codes up into state, county and tracts.

In [8]:
nyc_names =nyc["Name"]
new_nyc = nyc[nyc_names.str.contains("New York") | nyc_names.str.contains("Queens") 
              | nyc_names.str.contains("Bronx County")]
nyc_tracts = split_codes(list(new_nyc['tract']))
nyc_tracts.shape

(1291, 4)

In [9]:
la_names =la["Name"]
new_la = la[la_names.str.contains("Los Angeles")]
la_codes = list(new_la['tract'])
la_tracts=split_codes(la_codes)
la_tracts.shape

(1046, 4)

In [60]:
new_la

Unnamed: 0,tract,Name,Household_Income_rP_gP_pall
0,6037262301,"Los Angeles, CA",85044.0
1,6037265100,"Westwood, Los Angeles, CA",81626.0
3,6037262302,"Brentwood, Los Angeles, CA",77935.0
4,6037265420,"Westwood, Los Angeles, CA",76807.0
7,6037139801,"Tarzana, Los Angeles, CA",75027.0
...,...,...,...
3258,6037980028,"Los Angeles, CA",
3260,6037265304,"Westwood, Los Angeles, CA",
3262,6037222700,"South Los Angeles, Los Angeles, CA",
3265,6037980009,"Los Angeles, CA",


In [10]:
den_names =den["Name"]
new_den = den[den_names.str.contains("Denver")]
denver_codes=list(new_den['tract'])
denver_tracts=split_codes(denver_codes)
denver_tracts.shape

(168, 4)

In [11]:
nash_names =nash["Name"]
new_nash = nash[nash_names.str.contains("Nashville")]
nashville_codes=list(new_nash['tract'])
nashville_tracts=split_codes(nashville_codes)
nashville_tracts.shape

(154, 4)

This creates new datasets from the atlas dataset.

In [13]:
nyc_atlas= filter_atlas(nyc_tracts)
la_atlas=filter_atlas(la_tracts)
den_atlas=filter_atlas(denver_tracts)
nash_atlas=filter_atlas(nashville_tracts)
nyc_atlas.head()

Unnamed: 0,tract,county,state,cz,czname,hhinc_mean2000,mean_commutetime2000,frac_coll_plus2010,frac_coll_plus2000,foreign_share2010,...,kfr_white_p25,kfr_white_p75,kfr_white_p100,count_pooled,count_white,count_black,count_asian,count_hisp,count_natam,mergecounts
0,12800,61,36,19400.0,New York,218632.0,27.595802,0.874032,0.77653,0.146937,...,37659.164,64301.359,98215.914,1058.0,965.0,6.0,27.0,25.0,0.0,matched (3)
1,91601,81,36,19400.0,New York,104978.23,47.702164,0.472294,0.328169,0.016456,...,56034.027,76800.07,99922.594,847.0,831.0,1.0,2.0,13.0,0.0,matched (3)
2,92800,81,36,19400.0,New York,141231.67,41.729954,0.524654,0.567058,0.06993,...,55752.215,74925.844,95604.914,826.0,793.0,1.0,10.0,15.0,0.0,matched (3)
3,15002,61,36,19400.0,New York,257709.25,29.857792,0.850937,0.855583,0.070347,...,56656.738,71209.789,85812.844,1214.0,1119.0,2.0,32.0,36.0,0.0,matched (3)
4,97300,81,36,19400.0,New York,126643.05,37.980469,0.22209,0.272311,0.325692,...,62732.512,82491.477,104686.03,422.0,327.0,2.0,36.0,51.0,0.0,matched (3)


Cleaning NaN data

In [18]:
nyc_atlas.fillna(0,inplace=True)

In [33]:
nash_atlas.fillna(0,inplace=True)

In [47]:
nyc_atlas.to_csv(r'/Users/rishigummakonda/Documents/Springboard/capstone/data/atlas_data/cleaned_cities/nyc_atlas.csv',index=False)

In [48]:
la_atlas.to_csv(r'/Users/rishigummakonda/Documents/Springboard/capstone/data/atlas_data/cleaned_cities/la_atlas.csv',index=False)

In [49]:
la_atlas

Unnamed: 0,tract,county,state,cz,czname,hhinc_mean2000,mean_commutetime2000,frac_coll_plus2010,frac_coll_plus2000,foreign_share2010,...,kfr_white_p25,kfr_white_p75,kfr_white_p100,count_pooled,count_white,count_black,count_asian,count_hisp,count_natam,mergecounts


In [50]:
nash_atlas

Unnamed: 0,tract,county,state,cz,czname,hhinc_mean2000,mean_commutetime2000,frac_coll_plus2010,frac_coll_plus2000,foreign_share2010,...,kfr_white_p25,kfr_white_p75,kfr_white_p100,count_pooled,count_white,count_black,count_asian,count_hisp,count_natam,mergecounts
0,18601,37,47,5600.0,Nashville,228245.640,22.891167,0.652786,0.746885,0.046628,...,68247.898,76014.648,83160.313,609.0,575.0,7.0,9.0,13.0,2.0,matched (3)
1,18500,37,47,5600.0,Nashville,189272.030,20.895142,0.784240,0.727606,0.025046,...,40689.504,63979.094,91676.477,1414.0,1390.0,4.0,3.0,13.0,0.0,matched (3)
2,18700,37,47,5600.0,Nashville,193373.880,19.684816,0.717127,0.672531,0.024554,...,45323.555,62180.992,80232.609,789.0,738.0,15.0,14.0,10.0,2.0,matched (3)
3,17901,37,47,5600.0,Nashville,128443.270,19.108727,0.714427,0.645180,0.077767,...,35184.238,59908.102,90668.016,673.0,640.0,6.0,13.0,8.0,0.0,matched (3)
4,17902,37,47,5600.0,Nashville,142728.550,21.048346,0.727502,0.650513,0.045465,...,53488.246,65263.336,76754.680,647.0,612.0,4.0,7.0,12.0,1.0,matched (3)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,13602,37,47,5600.0,Nashville,36169.195,17.830410,0.301754,0.161306,0.005553,...,0.000,0.000,0.000,520.0,5.0,500.0,1.0,4.0,0.0,matched (3)
150,14800,37,47,5600.0,Nashville,19928.186,22.611578,0.044728,0.045562,0.068393,...,0.000,0.000,0.000,1517.0,25.0,1413.0,3.0,34.0,3.0,matched (3)
151,980100,37,47,5600.0,Nashville,0.000,0.000000,0.000000,0.000000,0.000000,...,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,matched (3)
152,13000,37,47,5600.0,Nashville,126690.850,17.000000,0.041829,0.029767,0.033757,...,0.000,0.000,0.000,3.0,1.0,1.0,0.0,1.0,0.0,matched (3)


In [51]:
den_atlas

Unnamed: 0,tract,county,state,cz,czname,hhinc_mean2000,mean_commutetime2000,frac_coll_plus2010,frac_coll_plus2000,foreign_share2010,...,kfr_white_p25,kfr_white_p75,kfr_white_p100,count_pooled,count_white,count_black,count_asian,count_hisp,count_natam,mergecounts


In [52]:
la_tracts

Unnamed: 0,full_codes,state,county,tract
0,6037262301,60,372,62301
1,6037265100,60,372,65100
2,6037262302,60,372,62302
3,6037265420,60,372,65420
4,6037139801,60,371,39801
...,...,...,...,...
1041,6037980028,60,379,80028
1042,6037265304,60,372,65304
1043,6037222700,60,372,22700
1044,6037980009,60,379,80009


In [64]:
denver_tracts

Unnamed: 0,full_codes,state,county,tract
0,8031003203,80,310,3203
1,8031004303,80,310,4303
2,8031003901,80,310,3901
3,8031006804,80,310,6804
4,8005015100,80,50,15100
...,...,...,...,...
163,8031980000,80,319,80000
164,8059980000,80,599,80000
165,8031001902,80,310,1902
166,8031004107,80,310,4107


In [80]:
atlas[atlas['cz'] == 38300]

Unnamed: 0,tract,county,state,cz,czname,hhinc_mean2000,mean_commutetime2000,frac_coll_plus2010,frac_coll_plus2000,foreign_share2010,...,kfr_white_p25,kfr_white_p75,kfr_white_p100,count_pooled,count_white,count_black,count_asian,count_hisp,count_natam,mergecounts
1449,20100,12,4,38300.0,Los Angeles,41098.555,20.699696,0.144823,0.064516,0.236940,...,,,,508.0,211.0,4.0,0.0,268.0,5.0,matched (3)
1450,20201,12,4,38300.0,Los Angeles,64871.969,17.212221,0.086112,0.098639,0.064184,...,27970.879,46343.273,67287.219,390.0,274.0,4.0,4.0,88.0,13.0,matched (3)
1451,20202,12,4,38300.0,Los Angeles,64871.969,17.212219,0.137230,0.098639,0.121059,...,42376.633,51672.648,60859.031,137.0,97.0,1.0,1.0,31.0,4.0,matched (3)
1452,20501,12,4,38300.0,Los Angeles,41785.777,20.431618,0.131789,0.089991,0.038952,...,,,,100.0,64.0,2.0,0.0,29.0,3.0,matched (3)
1453,20502,12,4,38300.0,Los Angeles,41785.777,20.431618,0.094004,0.089991,0.037028,...,38267.828,74018.602,135305.890,221.0,142.0,5.0,0.0,64.0,6.0,matched (3)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11498,8600,111,6,38300.0,Los Angeles,65869.008,24.791855,0.070215,0.098110,0.471904,...,37016.613,37381.238,37690.672,2557.0,189.0,101.0,94.0,2108.0,12.0,matched (3)
11499,8700,111,6,38300.0,Los Angeles,73408.695,24.716722,0.113213,0.116945,0.361479,...,40518.953,54448.094,69228.234,1843.0,199.0,42.0,19.0,1541.0,1.0,matched (3)
11500,8800,111,6,38300.0,Los Angeles,93613.234,24.779280,0.125901,0.133737,0.278535,...,37699.164,39784.457,41599.840,1736.0,265.0,80.0,64.0,1260.0,1.0,matched (3)
11501,8900,111,6,38300.0,Los Angeles,92239.570,26.036034,0.088484,0.054585,0.304467,...,31811.830,48671.184,67571.695,1057.0,121.0,13.0,31.0,850.0,10.0,matched (3)


In [65]:
nyc_tracts

Unnamed: 0,full_codes,state,county,tract
0,36061012800,36,61,12800
1,36081091601,36,81,91601
2,36081092800,36,81,92800
3,36061015002,36,61,15002
4,36081097300,36,81,97300
...,...,...,...,...
1286,36061031100,36,61,31100
1287,36081060701,36,81,60701
1288,36081061302,36,81,61302
1289,36005024900,36,5,24900


In [69]:
new_la

Unnamed: 0,tract,Name,Household_Income_rP_gP_pall
0,6037262301,"Los Angeles, CA",85044.0
1,6037265100,"Westwood, Los Angeles, CA",81626.0
3,6037262302,"Brentwood, Los Angeles, CA",77935.0
4,6037265420,"Westwood, Los Angeles, CA",76807.0
7,6037139801,"Tarzana, Los Angeles, CA",75027.0
...,...,...,...
3258,6037980028,"Los Angeles, CA",
3260,6037265304,"Westwood, Los Angeles, CA",
3262,6037222700,"South Los Angeles, Los Angeles, CA",
3265,6037980009,"Los Angeles, CA",
