In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf


# clean crime  data

In [2]:
#  Import and read the ca_crime.csv
ca_crime_df = pd.read_excel("resources/california-crime.xls", header=2, skiprows=2)
ca_crime_df.head(20)

Unnamed: 0,City,Population,Violent\ncrime,Murder and\nnonnegligent\nmanslaughter,Rape1,Robbery,Aggravated\nassault,Property\ncrime,Burglary,Larceny-\ntheft,Motor\nvehicle\ntheft,Arson
0,Adelanto,34491.0,276.0,1.0,20.0,42.0,213.0,459.0,136.0,209.0,114.0,14.0
1,Agoura Hills,20490.0,21.0,0.0,6.0,4.0,11.0,306.0,66.0,223.0,17.0,0.0
2,Alameda,78907.0,162.0,0.0,7.0,94.0,61.0,2579.0,218.0,1958.0,403.0,29.0
3,Albany,20083.0,40.0,0.0,8.0,21.0,11.0,685.0,105.0,534.0,46.0,1.0
4,Alhambra,84837.0,161.0,2.0,11.0,89.0,59.0,1749.0,259.0,1303.0,187.0,8.0
5,Aliso Viejo,52247.0,27.0,1.0,3.0,13.0,10.0,433.0,57.0,351.0,25.0,1.0
6,Alturas,2471.0,10.0,0.0,2.0,1.0,7.0,30.0,14.0,13.0,3.0,0.0
7,American Canyon,20452.0,53.0,0.0,7.0,7.0,39.0,454.0,59.0,368.0,27.0,3.0
8,Anaheim,353915.0,1120.0,8.0,141.0,396.0,575.0,8258.0,1123.0,5904.0,1231.0,44.0
9,Anderson,10545.0,61.0,1.0,5.0,12.0,43.0,363.0,63.0,208.0,92.0,1.0


In [3]:
# 2-drop null values
clean_ca_crime_df = ca_crime_df.dropna()
clean_ca_crime_df.head()

Unnamed: 0,City,Population,Violent\ncrime,Murder and\nnonnegligent\nmanslaughter,Rape1,Robbery,Aggravated\nassault,Property\ncrime,Burglary,Larceny-\ntheft,Motor\nvehicle\ntheft,Arson
0,Adelanto,34491.0,276.0,1.0,20.0,42.0,213.0,459.0,136.0,209.0,114.0,14.0
1,Agoura Hills,20490.0,21.0,0.0,6.0,4.0,11.0,306.0,66.0,223.0,17.0,0.0
2,Alameda,78907.0,162.0,0.0,7.0,94.0,61.0,2579.0,218.0,1958.0,403.0,29.0
3,Albany,20083.0,40.0,0.0,8.0,21.0,11.0,685.0,105.0,534.0,46.0,1.0
4,Alhambra,84837.0,161.0,2.0,11.0,89.0,59.0,1749.0,259.0,1303.0,187.0,8.0


In [4]:
# check: list columns with data types
clean_ca_crime_df.dtypes

City                                       object
Population                                float64
Violent\ncrime                            float64
Murder and\nnonnegligent\nmanslaughter    float64
Rape1                                     float64
Robbery                                   float64
Aggravated\nassault                       float64
Property\ncrime                           float64
Burglary                                  float64
Larceny-\ntheft                           float64
Motor\nvehicle\ntheft                     float64
Arson                                     float64
dtype: object

In [5]:
# 3-relabel column headers
# df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})
clean_ca_crime_df.rename(columns={'Murder and\nnonnegligent\nmanslaughter':'Murder', 
                           'Violent\ncrime':'ViolentCrime',
                          'Aggravated\nassault':'AggAssault',
                          'Property\ncrime':'PropertyCrime',
                          'Larceny-\ntheft':'Theft',
                          'Motor\nvehicle\ntheft':'VehicleTheft',
                          'Rape1':'Rape'}, inplace=True)
clean_ca_crime_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,City,Population,ViolentCrime,Murder,Rape,Robbery,AggAssault,PropertyCrime,Burglary,Theft,VehicleTheft,Arson
0,Adelanto,34491.0,276.0,1.0,20.0,42.0,213.0,459.0,136.0,209.0,114.0,14.0
1,Agoura Hills,20490.0,21.0,0.0,6.0,4.0,11.0,306.0,66.0,223.0,17.0,0.0
2,Alameda,78907.0,162.0,0.0,7.0,94.0,61.0,2579.0,218.0,1958.0,403.0,29.0
3,Albany,20083.0,40.0,0.0,8.0,21.0,11.0,685.0,105.0,534.0,46.0,1.0
4,Alhambra,84837.0,161.0,2.0,11.0,89.0,59.0,1749.0,259.0,1303.0,187.0,8.0


In [6]:
# check:verify unique values
clean_ca_crime_df.nunique()

City             457
Population       457
ViolentCrime     258
Murder            23
Rape              84
Robbery          150
AggAssault       210
PropertyCrime    406
Burglary         288
Theft            389
VehicleTheft     247
Arson             56
dtype: int64

In [7]:
# check: find null values in DF
# [single column] clean_ca_allCity[clean_ca_allCity['city'].isna()]
clean_ca_crime_df[clean_ca_crime_df.isna().any(axis=1)]

Unnamed: 0,City,Population,ViolentCrime,Murder,Rape,Robbery,AggAssault,PropertyCrime,Burglary,Theft,VehicleTheft,Arson


# clean fire data

In [8]:
# read in fire data file
ca_fire_df = pd.read_csv("resources/ca_daily_fire_2000_2021.csv")
ca_fire_df.head()


Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence,year,month
0,32.436893,-114.597054,2014-06-15,N,VIIRS,100,2014,6
1,32.456921,-114.598595,2014-06-15,N,VIIRS,100,2014,6
2,32.457344,-114.596558,2014-06-15,N,VIIRS,100,2014,6
3,32.458092,-114.592819,2014-06-15,N,VIIRS,100,2014,6
4,32.461948,-114.59388,2014-06-15,N,VIIRS,100,2014,6


In [9]:
# check: list column names with data types
ca_fire_df.dtypes

latitude      float64
longitude     float64
acq_date       object
satellite      object
instrument     object
confidence      int64
year            int64
month           int64
dtype: object

In [10]:
# check: row count for each year / find and count unique values in a column
ca_fire_df.year.value_counts()

2020    277057
2021    197622
2018    119245
2017    101823
2015     66202
2012     59850
2013     58557
2014     51416
2016     47998
2019     31585
2008     18273
2007     13329
2006     12695
2003      9768
2009      7301
2005      7142
2002      6368
2004      5804
2011      4065
2001      3716
2010      2484
2000       349
Name: year, dtype: int64

In [11]:
# 1- drop all ROWS not 2019 (retain all 2019 rows)
clean_ca_fire_df = ca_fire_df[ca_fire_df['year']==2019]
clean_ca_fire_df.head()

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence,year,month
58,32.538414,-114.621254,2019-01-03,N,VIIRS,50,2019,1
65,32.539585,-114.619484,2019-08-12,N,VIIRS,50,2019,8
74,32.541367,-114.618057,2019-06-12,N,VIIRS,50,2019,6
76,32.541977,-114.622284,2019-06-12,N,VIIRS,50,2019,6
78,32.543243,-114.61396,2019-08-12,N,VIIRS,50,2019,8


In [12]:
# 2-relable column headers
# df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})
clean_ca_fire_df.rename(columns={'longitude':'lon',
                          'latitude':'lat'}, inplace=True)
clean_ca_fire_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,lat,lon,acq_date,satellite,instrument,confidence,year,month
58,32.538414,-114.621254,2019-01-03,N,VIIRS,50,2019,1
65,32.539585,-114.619484,2019-08-12,N,VIIRS,50,2019,8
74,32.541367,-114.618057,2019-06-12,N,VIIRS,50,2019,6
76,32.541977,-114.622284,2019-06-12,N,VIIRS,50,2019,6
78,32.543243,-114.61396,2019-08-12,N,VIIRS,50,2019,8


In [14]:
# 3-drop any null values
clean_ca_fire_df.dropna()

Unnamed: 0,lat,lon,acq_date,satellite,instrument,confidence,year,month
58,32.538414,-114.621254,2019-01-03,N,VIIRS,50,2019,1
65,32.539585,-114.619484,2019-08-12,N,VIIRS,50,2019,8
74,32.541367,-114.618057,2019-06-12,N,VIIRS,50,2019,6
76,32.541977,-114.622284,2019-06-12,N,VIIRS,50,2019,6
78,32.543243,-114.613960,2019-08-12,N,VIIRS,50,2019,8
...,...,...,...,...,...,...,...,...
1101270,41.990002,-118.648483,2019-12-09,N,VIIRS,50,2019,12
1101419,41.991016,-118.647095,2019-12-09,N,VIIRS,50,2019,12
1101613,41.992332,-121.517357,2019-04-26,N,VIIRS,50,2019,4
1102225,41.996803,-121.732178,2019-11-06,N,VIIRS,50,2019,11


In [19]:
# check: count unique values in each column
clean_ca_fire_df.nunique()

lat           30031
lon           28623
acq_date        361
satellite         3
instrument        2
confidence       51
year              1
month            12
dtype: int64

In [16]:
# check: show unique values in year column
clean_ca_fire_df.year.unique()

array([2019], dtype=int64)

In [17]:
# check: show how many rows for each year / find and count unique values in a column
clean_ca_fire_df.year.value_counts()

2019    31585
Name: year, dtype: int64

In [18]:
# check: find null values in DF
# [single column] clean_ca_allCity[clean_ca_allCity['city'].isna()]
clean_ca_fire_df[clean_ca_fire_df.isna().any(axis=1)]

Unnamed: 0,lat,lon,acq_date,satellite,instrument,confidence,year,month


# clean housing  data (ZILLOW)

In [20]:
ca_zip_df = pd.read_csv("resources/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
ca_zip_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31,...,2021-12-31,2022-01-31,2022-02-28,2022-03-31,2022-04-30,2022-05-31,2022-06-30,2022-07-31,2022-08-31,2022-09-30
0,91940,0,77449,zip,TX,TX,,"Houston-The Woodlands-Sugar Land, TX",Harris County,106672.0,...,251930.0,255861.0,260506.0,266174.0,272442.0,278528.0,283555.0,286948.0,288863.0,289504.0
1,91982,1,77494,zip,TX,TX,,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,223357.0,...,439209.0,445272.0,454755.0,466373.0,480063.0,491447.0,498934.0,501198.0,499334.0,499877.0
2,93144,2,79936,zip,TX,TX,El Paso,"El Paso, TX",El Paso County,90482.0,...,174265.0,176277.0,178252.0,180407.0,183773.0,187384.0,190834.0,193074.0,194779.0,196166.0
3,62080,3,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,187749.0,...,571191.0,569363.0,569160.0,571091.0,572605.0,575407.0,577640.0,579764.0,580707.0,583489.0
4,62093,4,11385,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,275753.0,...,744928.0,748113.0,751397.0,757101.0,759701.0,765523.0,770799.0,775959.0,778630.0,781713.0


In [21]:
# check: list column names with data types
list(ca_zip_df.columns)

['RegionID',
 'SizeRank',
 'RegionName',
 'RegionType',
 'StateName',
 'State',
 'City',
 'Metro',
 'CountyName',
 '2000-01-31',
 '2000-02-29',
 '2000-03-31',
 '2000-04-30',
 '2000-05-31',
 '2000-06-30',
 '2000-07-31',
 '2000-08-31',
 '2000-09-30',
 '2000-10-31',
 '2000-11-30',
 '2000-12-31',
 '2001-01-31',
 '2001-02-28',
 '2001-03-31',
 '2001-04-30',
 '2001-05-31',
 '2001-06-30',
 '2001-07-31',
 '2001-08-31',
 '2001-09-30',
 '2001-10-31',
 '2001-11-30',
 '2001-12-31',
 '2002-01-31',
 '2002-02-28',
 '2002-03-31',
 '2002-04-30',
 '2002-05-31',
 '2002-06-30',
 '2002-07-31',
 '2002-08-31',
 '2002-09-30',
 '2002-10-31',
 '2002-11-30',
 '2002-12-31',
 '2003-01-31',
 '2003-02-28',
 '2003-03-31',
 '2003-04-30',
 '2003-05-31',
 '2003-06-30',
 '2003-07-31',
 '2003-08-31',
 '2003-09-30',
 '2003-10-31',
 '2003-11-30',
 '2003-12-31',
 '2004-01-31',
 '2004-02-29',
 '2004-03-31',
 '2004-04-30',
 '2004-05-31',
 '2004-06-30',
 '2004-07-31',
 '2004-08-31',
 '2004-09-30',
 '2004-10-31',
 '2004-11-30',
 

In [22]:
# 1-remove excess columns
# 2000-2018
ca_zip_df.drop(['RegionID',
 'SizeRank',
 'RegionType',
 'StateName',
 '2000-01-31', '2000-02-29', '2000-03-31', '2000-04-30', '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
 '2000-09-30', '2000-10-31', '2000-11-30', '2000-12-31', '2001-01-31', '2001-02-28', '2001-03-31', '2001-04-30',
 '2001-05-31', '2001-06-30', '2001-07-31', '2001-08-31', '2001-09-30', '2001-10-31', '2001-11-30', '2001-12-31',
 '2002-01-31', '2002-02-28', '2002-03-31', '2002-04-30', '2002-05-31', '2002-06-30', '2002-07-31', '2002-08-31',
 '2002-09-30', '2002-10-31', '2002-11-30', '2002-12-31', '2003-01-31', '2003-02-28', '2003-03-31', '2003-04-30',
 '2003-05-31', '2003-06-30', '2003-07-31', '2003-08-31', '2003-09-30', '2003-10-31', '2003-11-30', '2003-12-31',
 '2004-01-31', '2004-02-29', '2004-03-31', '2004-04-30', '2004-05-31', '2004-06-30', '2004-07-31', '2004-08-31',
 '2004-09-30', '2004-10-31', '2004-11-30', '2004-12-31', '2005-01-31', '2005-02-28', '2005-03-31', '2005-04-30',
 '2005-05-31', '2005-06-30', '2005-07-31', '2005-08-31', '2005-09-30', '2005-10-31', '2005-11-30', '2005-12-31',
 '2006-01-31', '2006-02-28', '2006-03-31', '2006-04-30', '2006-05-31', '2006-06-30', '2006-07-31', '2006-08-31',
 '2006-09-30', '2006-10-31', '2006-11-30', '2006-12-31', '2007-01-31', '2007-02-28', '2007-03-31', '2007-04-30',
 '2007-05-31', '2007-06-30', '2007-07-31', '2007-08-31', '2007-09-30', '2007-10-31', '2007-11-30', '2007-12-31',
 '2008-01-31', '2008-02-29', '2008-03-31', '2008-04-30', '2008-05-31', '2008-06-30', '2008-07-31', '2008-08-31',
 '2008-09-30', '2008-10-31', '2008-11-30', '2008-12-31', '2009-01-31', '2009-02-28', '2009-03-31', '2009-04-30',
 '2009-05-31', '2009-06-30', '2009-07-31', '2009-08-31', '2009-09-30', '2009-10-31', '2009-11-30', '2009-12-31',
 '2010-01-31', '2010-02-28', '2010-03-31', '2010-04-30', '2010-05-31', '2010-06-30', '2010-07-31', '2010-08-31',
 '2010-09-30', '2010-10-31', '2010-11-30', '2010-12-31', '2011-01-31', '2011-02-28', '2011-03-31', '2011-04-30',
 '2011-05-31', '2011-06-30', '2011-07-31', '2011-08-31', '2011-09-30', '2011-10-31', '2011-11-30', '2011-12-31',
 '2012-01-31', '2012-02-29', '2012-03-31', '2012-04-30', '2012-05-31', '2012-06-30', '2012-07-31', '2012-08-31',
 '2012-09-30', '2012-10-31', '2012-11-30', '2012-12-31', '2013-01-31', '2013-02-28', '2013-03-31', '2013-04-30',
 '2013-05-31', '2013-06-30', '2013-07-31', '2013-08-31', '2013-09-30', '2013-10-31', '2013-11-30', '2013-12-31',
 '2014-01-31', '2014-02-28', '2014-03-31', '2014-04-30', '2014-05-31', '2014-06-30', '2014-07-31', '2014-08-31',
 '2014-09-30', '2014-10-31', '2014-11-30', '2014-12-31', '2015-01-31', '2015-02-28', '2015-03-31', '2015-04-30',
 '2015-05-31', '2015-06-30', '2015-07-31', '2015-08-31', '2015-09-30', '2015-10-31', '2015-11-30', '2015-12-31',
 '2016-01-31', '2016-02-29', '2016-03-31', '2016-04-30', '2016-05-31', '2016-06-30', '2016-07-31', '2016-08-31',
 '2016-09-30', '2016-10-31', '2016-11-30', '2016-12-31', '2017-01-31', '2017-02-28', '2017-03-31', '2017-04-30',
 '2017-05-31', '2017-06-30', '2017-07-31', '2017-08-31', '2017-09-30', '2017-10-31', '2017-11-30', '2017-12-31',
 '2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', '2018-05-31', '2018-06-30', '2018-07-31', '2018-08-31',
 '2018-09-30', '2018-10-31', '2018-11-30', '2018-12-31', '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
 '2020-05-31', '2020-06-30', '2020-07-31', '2020-08-31', '2020-09-30', '2020-10-31', '2020-11-30', '2020-12-31',
 '2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30', '2021-05-31', '2021-06-30', '2021-07-31', '2021-08-31',
 '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31', '2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
 '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31', '2022-09-30']
   ,axis=1, inplace=True)
ca_zip_df.head()

Unnamed: 0,RegionName,State,City,Metro,CountyName,2019-01-31,2019-02-28,2019-03-31,2019-04-30,2019-05-31,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31
0,77449,TX,,"Houston-The Woodlands-Sugar Land, TX",Harris County,184680.0,185344.0,185769.0,185704.0,185871.0,186308.0,186958.0,187546.0,187862.0,188591.0,189832.0,190969.0
1,77494,TX,,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,337941.0,337479.0,337336.0,336335.0,335652.0,335095.0,335339.0,335456.0,336243.0,336648.0,336984.0,336701.0
2,79936,TX,El Paso,"El Paso, TX",El Paso County,129659.0,130372.0,131305.0,132302.0,133412.0,134491.0,135268.0,135795.0,135244.0,134448.0,133972.0,134302.0
3,11368,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,568912.0,571854.0,574314.0,575327.0,574791.0,573311.0,572617.0,571983.0,567669.0,565175.0,563859.0,571199.0
4,11385,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,707240.0,707848.0,707917.0,707800.0,706380.0,705865.0,704499.0,703013.0,700075.0,699812.0,699064.0,699828.0


In [23]:
# 1-drop all ROWS not CA (retain all CA rows)
clean_ca_housing = ca_zip_df[ca_zip_df['State']=='CA']
clean_ca_housing.head()

Unnamed: 0,RegionName,State,City,Metro,CountyName,2019-01-31,2019-02-28,2019-03-31,2019-04-30,2019-05-31,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31
5,90011,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,431284.0,431834.0,431883.0,434223.0,437189.0,439420.0,437700.0,437288.0,438655.0,443029.0,445805.0,450115.0
8,91331,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,493023.0,491204.0,489511.0,489494.0,489850.0,491273.0,489745.0,490367.0,491787.0,496455.0,500367.0,506064.0
9,90650,CA,Norwalk,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,487572.0,485517.0,484164.0,484345.0,485361.0,486121.0,484453.0,484143.0,485745.0,489355.0,492048.0,495037.0
12,90201,CA,Bell,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,451985.0,451959.0,452378.0,452891.0,452835.0,453393.0,452322.0,452175.0,453846.0,457415.0,460570.0,463997.0
13,92335,CA,Fontana,"Riverside-San Bernardino-Ontario, CA",San Bernardino County,350245.0,351116.0,352676.0,354228.0,356071.0,357008.0,356722.0,357487.0,358679.0,360668.0,361821.0,362922.0


In [24]:
# 1-drop rows with NULLS
clean_ca_housing = clean_ca_housing.dropna()
clean_ca_housing.head()

Unnamed: 0,RegionName,State,City,Metro,CountyName,2019-01-31,2019-02-28,2019-03-31,2019-04-30,2019-05-31,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31
5,90011,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,431284.0,431834.0,431883.0,434223.0,437189.0,439420.0,437700.0,437288.0,438655.0,443029.0,445805.0,450115.0
8,91331,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,493023.0,491204.0,489511.0,489494.0,489850.0,491273.0,489745.0,490367.0,491787.0,496455.0,500367.0,506064.0
9,90650,CA,Norwalk,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,487572.0,485517.0,484164.0,484345.0,485361.0,486121.0,484453.0,484143.0,485745.0,489355.0,492048.0,495037.0
12,90201,CA,Bell,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,451985.0,451959.0,452378.0,452891.0,452835.0,453393.0,452322.0,452175.0,453846.0,457415.0,460570.0,463997.0
13,92335,CA,Fontana,"Riverside-San Bernardino-Ontario, CA",San Bernardino County,350245.0,351116.0,352676.0,354228.0,356071.0,357008.0,356722.0,357487.0,358679.0,360668.0,361821.0,362922.0


In [25]:
# check: list column names with data types
clean_ca_housing.dtypes

RegionName      int64
State          object
City           object
Metro          object
CountyName     object
2019-01-31    float64
2019-02-28    float64
2019-03-31    float64
2019-04-30    float64
2019-05-31    float64
2019-06-30    float64
2019-07-31    float64
2019-08-31    float64
2019-09-30    float64
2019-10-31    float64
2019-11-30    float64
2019-12-31    float64
dtype: object

In [26]:
# 2- relable column headers
# df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})
clean_ca_housing.rename(columns={'RegionName':'ZIPCODE'}, inplace=True)
clean_ca_housing.head()

Unnamed: 0,ZIPCODE,State,City,Metro,CountyName,2019-01-31,2019-02-28,2019-03-31,2019-04-30,2019-05-31,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31
5,90011,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,431284.0,431834.0,431883.0,434223.0,437189.0,439420.0,437700.0,437288.0,438655.0,443029.0,445805.0,450115.0
8,91331,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,493023.0,491204.0,489511.0,489494.0,489850.0,491273.0,489745.0,490367.0,491787.0,496455.0,500367.0,506064.0
9,90650,CA,Norwalk,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,487572.0,485517.0,484164.0,484345.0,485361.0,486121.0,484453.0,484143.0,485745.0,489355.0,492048.0,495037.0
12,90201,CA,Bell,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,451985.0,451959.0,452378.0,452891.0,452835.0,453393.0,452322.0,452175.0,453846.0,457415.0,460570.0,463997.0
13,92335,CA,Fontana,"Riverside-San Bernardino-Ontario, CA",San Bernardino County,350245.0,351116.0,352676.0,354228.0,356071.0,357008.0,356722.0,357487.0,358679.0,360668.0,361821.0,362922.0


In [27]:
# 3-drop any null values
clean_ca_housing.dropna()

Unnamed: 0,ZIPCODE,State,City,Metro,CountyName,2019-01-31,2019-02-28,2019-03-31,2019-04-30,2019-05-31,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31
5,90011,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,431284.0,431834.0,431883.0,434223.0,437189.0,439420.0,437700.0,437288.0,438655.0,443029.0,445805.0,450115.0
8,91331,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,493023.0,491204.0,489511.0,489494.0,489850.0,491273.0,489745.0,490367.0,491787.0,496455.0,500367.0,506064.0
9,90650,CA,Norwalk,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,487572.0,485517.0,484164.0,484345.0,485361.0,486121.0,484453.0,484143.0,485745.0,489355.0,492048.0,495037.0
12,90201,CA,Bell,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,451985.0,451959.0,452378.0,452891.0,452835.0,453393.0,452322.0,452175.0,453846.0,457415.0,460570.0,463997.0
13,92335,CA,Fontana,"Riverside-San Bernardino-Ontario, CA",San Bernardino County,350245.0,351116.0,352676.0,354228.0,356071.0,357008.0,356722.0,357487.0,358679.0,360668.0,361821.0,362922.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27218,95728,CA,Soda Springs,"Truckee-Grass Valley, CA",Nevada County,557324.0,555721.0,555257.0,561994.0,560237.0,557938.0,552008.0,557406.0,559300.0,559485.0,554730.0,555071.0
27242,95433,CA,Sonoma,"Santa Rosa-Petaluma, CA",Sonoma County,609324.0,603717.0,603966.0,604240.0,610094.0,615195.0,622122.0,627884.0,629844.0,630172.0,625626.0,622389.0
27245,95553,CA,Myers Flat,"Eureka-Arcata, CA",Humboldt County,402804.0,393794.0,384356.0,385227.0,384046.0,384725.0,378135.0,371555.0,366332.0,363749.0,362071.0,357493.0
27288,96084,CA,Round Mountain,"Redding, CA",Shasta County,219284.0,218838.0,218146.0,217844.0,220667.0,227675.0,232549.0,235531.0,235701.0,236305.0,237803.0,238989.0


In [28]:
# check: count unique values in each column
clean_ca_housing.nunique()

ZIPCODE       1399
State            1
City           780
Metro           34
CountyName      45
2019-01-31    1399
2019-02-28    1398
2019-03-31    1398
2019-04-30    1398
2019-05-31    1399
2019-06-30    1399
2019-07-31    1399
2019-08-31    1399
2019-09-30    1399
2019-10-31    1399
2019-11-30    1398
2019-12-31    1398
dtype: int64

In [29]:
# check: find null values in DF
# [single column] clean_ca_allCity[clean_ca_allCity['city'].isna()]
clean_ca_housing[clean_ca_housing.isna().any(axis=1)]

Unnamed: 0,ZIPCODE,State,City,Metro,CountyName,2019-01-31,2019-02-28,2019-03-31,2019-04-30,2019-05-31,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31


# All city data

In [30]:
#  Import and read the ca_crime.csv

ca_allCity = pd.read_csv("resources/ca.csv", low_memory=False)
ca_allCity.head(20)

Unnamed: 0,LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
0,-122.271511,37.800795,988,FRANKLIN ST,508,OAKLAND,,,94607,2-59-30,ed2b5c3f97487dbd
1,-122.271511,37.800795,988,FRANKLIN ST,1301,OAKLAND,,,94607,2-59-30,264e4fd141c7296c
2,-122.271511,37.800795,988,FRANKLIN ST,1302,OAKLAND,,,94607,2-59-30,a2a17edf7c47595d
3,-122.271511,37.800795,988,FRANKLIN ST,507,OAKLAND,,,94607,2-59-30,95f1905107f315fe
4,-122.271623,37.800613,928,FRANKLIN ST,522,OAKLAND,,,94607,2-59-30,2f27ac0218bd57e3
5,-122.271511,37.800795,988,FRANKLIN ST,1107,OAKLAND,,,94607,2-59-30,b020dd1eae9c6fe7
6,-122.271511,37.800795,988,FRANKLIN ST,1108,OAKLAND,,,94607,2-59-30,16535d22c3dffafa
7,-122.271511,37.800795,988,FRANKLIN ST,521,OAKLAND,,,94607,2-59-30,633b11f311bc010d
8,-122.27307,37.79961,763,FRANKLIN ST,214,OAKLAND,,,94607,1-229-4,1459faf8cc9cc6bb
9,-122.27307,37.79961,409,8TH ST,A,OAKLAND,,,94607,1-229-4,5a70b5e19c2ae7a6


In [31]:
# 1- remove excess columns
ca_allCity.drop(['DISTRICT', 'REGION', 'HASH', 'ID', 'UNIT'],axis=1, inplace=True)
ca_allCity.head()

Unnamed: 0,LON,LAT,NUMBER,STREET,CITY,POSTCODE
0,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
1,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
2,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
3,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
4,-122.271623,37.800613,928,FRANKLIN ST,OAKLAND,94607


In [32]:
# 2- relable column headers
# df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})
ca_allCity.rename(columns={'LON':'lon', 
                          'LAT':'lat', 
                          'NUMBER':'number', 
                           'STREET':'street', 
                           'CITY':'city', 
                           'POSTCODE':'zipcode', 
                          }, inplace=True)
ca_allCity.head()

Unnamed: 0,lon,lat,number,street,city,zipcode
0,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
1,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
2,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
3,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
4,-122.271623,37.800613,928,FRANKLIN ST,OAKLAND,94607


In [33]:
# 3- drop duplicate rows
clean_allcity_df = ca_allCity.drop_duplicates()
clean_allcity_df.head()

Unnamed: 0,lon,lat,number,street,city,zipcode
0,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
4,-122.271623,37.800613,928,FRANKLIN ST,OAKLAND,94607
8,-122.27307,37.79961,763,FRANKLIN ST,OAKLAND,94607
9,-122.27307,37.79961,409,8TH ST,OAKLAND,94607
17,-122.271971,37.79922,377,8TH ST,OAKLAND,94607


In [34]:
# check: list column names with data types
clean_allcity_df.dtypes

lon        float64
lat        float64
number      object
street      object
city        object
zipcode     object
dtype: object

In [35]:
# 3-reorder columns
clean_allcity_df.loc[:,['lat', 'lon', 'number', 'street', 'city', 'zipcode' ]]

Unnamed: 0,lat,lon,number,street,city,zipcode
0,37.800795,-122.271511,988,FRANKLIN ST,OAKLAND,94607
4,37.800613,-122.271623,928,FRANKLIN ST,OAKLAND,94607
8,37.799610,-122.273070,763,FRANKLIN ST,OAKLAND,94607
9,37.799610,-122.273070,409,8TH ST,OAKLAND,94607
17,37.799220,-122.271971,377,8TH ST,OAKLAND,94607
...,...,...,...,...,...,...
13784508,39.005501,-121.549226,1351,HIGH NOON DR,Marysville,
13784509,39.007747,-121.546307,1396,HIGH NOON DR,Marysville,
13784510,39.145824,-121.586345,921,B,Marysville,
13784511,39.558356,-121.123128,13060,LA PORTE RD,Strawberry Valley,


In [39]:
# 4-drop nulls
clean_ca_allCity = clean_allcity_df.dropna()
clean_ca_allCity.head()

Unnamed: 0,lon,lat,number,street,city,zipcode
0,-122.271511,37.800795,988,FRANKLIN ST,OAKLAND,94607
4,-122.271623,37.800613,928,FRANKLIN ST,OAKLAND,94607
8,-122.27307,37.79961,763,FRANKLIN ST,OAKLAND,94607
9,-122.27307,37.79961,409,8TH ST,OAKLAND,94607
17,-122.271971,37.79922,377,8TH ST,OAKLAND,94607


In [40]:
# check: count unique values in each column
clean_ca_allCity.nunique()

lon        6789928
lat        6456470
number      103047
street      220222
city          1061
zipcode      22148
dtype: int64

In [41]:
# check: find null values in DF
# [single column] clean_ca_allCity[clean_ca_allCity['city'].isna()]
clean_ca_allCity[clean_ca_allCity.isna().any(axis=1)]

Unnamed: 0,lon,lat,number,street,city,zipcode


# export clean datafiles for SQL trasfer

In [None]:
# export allCitiy
clean_ca_allCity.to_csv('resources/clean_allCity.csv') 

In [None]:
# export housing
clean_ca_housing.to_csv('resources/clean_ca_housing.csv') 

In [None]:
# export crime
clean_ca_crime_df.to_csv('resources/clean_ca_crime.csv') 

In [None]:
# export fire
clean_ca_fire_df.to_csv('resources/clean_fire.csv') 