In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf



# clean crime data

In [None]:
# Import and read the California crime workbook (california-crime.xls).
# The path literal previously opened with ' and closed with ", which is a
# SyntaxError; both delimiters are now double quotes.
# NOTE(review): header=2 is combined with skiprows=2 — confirm this selects the
# intended header row of the spreadsheet.
ca_crime_df = pd.read_excel("../Resources/california-crime.xls", header=2, skiprows=2)
ca_crime_df.head(20)

In [None]:
# Keep only the crime rows that have a value in every column, then preview.
clean_ca_crime_df = ca_crime_df.dropna(axis="index", how="any")
clean_ca_crime_df.head(n=5)

In [None]:
# Show the data type of each column in the cleaned crime frame.
clean_ca_crime_df.dtypes

In [None]:
# Nulls were already dropped when clean_ca_crime_df was created, and a bare
# dropna() call does not modify the frame. Confirm instead that no null
# values remain: every per-column count below should be 0.
clean_ca_crime_df.isna().sum()

In [None]:
# Change labels: replace the multi-line spreadsheet headers with short names.
# rename() returns a new frame, which is assigned back rather than using
# inplace=True on a frame that was itself derived from ca_crime_df.
clean_ca_crime_df = clean_ca_crime_df.rename(
    columns={
        'Murder and\nnonnegligent\nmanslaughter': 'Murder',
        'Violent\ncrime': 'ViolentCrime',
        'Aggravated\nassault': 'AggAssault',
        'Property\ncrime': 'PropertyCrime',
        'Larceny-\ntheft': 'Theft',
        'Motor\nvehicle\ntheft': 'VehicleTheft',
        'Rape1': 'Rape',
    }
)
clean_ca_crime_df.head()

In [None]:
# verify unique values: count the distinct values in each crime column
clean_ca_crime_df.nunique()

# clean fire data

In [2]:
# Load the California daily fire CSV (2000-2021) and preview the first rows.
ca_fire_df = pd.read_csv(filepath_or_buffer="../Resources/ca_daily_fire_2000_2021.csv")
ca_fire_df.head(n=5)


Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence,year,month
0,32.436893,-114.597054,2014-06-15,N,VIIRS,100,2014,6
1,32.456921,-114.598595,2014-06-15,N,VIIRS,100,2014,6
2,32.457344,-114.596558,2014-06-15,N,VIIRS,100,2014,6
3,32.458092,-114.592819,2014-06-15,N,VIIRS,100,2014,6
4,32.461948,-114.59388,2014-06-15,N,VIIRS,100,2014,6


In [3]:
# list column names with data types (one dtype per fire column)
ca_fire_df.dtypes

latitude      float64
longitude     float64
acq_date       object
satellite      object
instrument     object
confidence      int64
year            int64
month           int64
dtype: object

In [4]:
# Drop any rows containing null values. dropna() returns a new frame and
# leaves ca_fire_df untouched, so the result is assigned back to keep the drop.
ca_fire_df = ca_fire_df.dropna()
ca_fire_df

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence,year,month
0,32.436893,-114.597054,2014-06-15,N,VIIRS,100,2014,6
1,32.456921,-114.598595,2014-06-15,N,VIIRS,100,2014,6
2,32.457344,-114.596558,2014-06-15,N,VIIRS,100,2014,6
3,32.458092,-114.592819,2014-06-15,N,VIIRS,100,2014,6
4,32.461948,-114.593880,2014-06-15,N,VIIRS,100,2014,6
...,...,...,...,...,...,...,...,...
1102644,41.999947,-120.655807,2012-08-14,N,VIIRS,50,2012,8
1102645,41.999950,-123.621155,2020-09-20,N,VIIRS,50,2020,9
1102646,41.999958,-122.592186,2018-07-08,N,VIIRS,50,2018,7
1102647,41.999969,-120.648338,2012-08-14,N,VIIRS,50,2012,8


In [5]:
# Shorten the coordinate column labels: latitude -> lat, longitude -> lon.
ca_fire_df.rename(
    {'latitude': 'lat', 'longitude': 'lon'},
    axis='columns',
    inplace=True,
)
ca_fire_df.head(n=5)

Unnamed: 0,lat,lon,acq_date,satellite,instrument,confidence,year,month
0,32.436893,-114.597054,2014-06-15,N,VIIRS,100,2014,6
1,32.456921,-114.598595,2014-06-15,N,VIIRS,100,2014,6
2,32.457344,-114.596558,2014-06-15,N,VIIRS,100,2014,6
3,32.458092,-114.592819,2014-06-15,N,VIIRS,100,2014,6
4,32.461948,-114.59388,2014-06-15,N,VIIRS,100,2014,6


In [6]:
# count unique values in each column of the fire frame
ca_fire_df.nunique()

lat           721017
lon           456440
acq_date        7091
satellite          3
instrument         2
confidence        51
year              22
month             12
dtype: int64

In [7]:
# Distinct values found in the year column of the fire data.
ca_fire_df['year'].unique()

array([2014, 2011, 2013, 2012, 2010, 2009, 2004, 2002, 2018, 2007, 2005,
       2021, 2015, 2008, 2020, 2019, 2016, 2017, 2006, 2003, 2001, 2000],
      dtype=int64)

In [8]:
# Number of fire rows for each year (occurrences of each distinct year value).
ca_fire_df['year'].value_counts()

2020    277057
2021    197622
2018    119245
2017    101823
2015     66202
2012     59850
2013     58557
2014     51416
2016     47998
2019     31585
2008     18273
2007     13329
2006     12695
2003      9768
2009      7301
2005      7142
2002      6368
2004      5804
2011      4065
2001      3716
2010      2484
2000       349
Name: year, dtype: int64

In [9]:
# Retain only the rows whose year is 2019; rows from every other year are left out.
clean_ca_fire_df = ca_fire_df.loc[ca_fire_df['year'].eq(2019)]
clean_ca_fire_df.head(n=5)

Unnamed: 0,lat,lon,acq_date,satellite,instrument,confidence,year,month
58,32.538414,-114.621254,2019-01-03,N,VIIRS,50,2019,1
65,32.539585,-114.619484,2019-08-12,N,VIIRS,50,2019,8
74,32.541367,-114.618057,2019-06-12,N,VIIRS,50,2019,6
76,32.541977,-114.622284,2019-06-12,N,VIIRS,50,2019,6
78,32.543243,-114.61396,2019-08-12,N,VIIRS,50,2019,8


# clean housing data (ZILLOW)

In [None]:
# Load the Zillow ZHVI CSV and preview the first rows.
# NOTE(review): this path is under 'resources/', while the crime and fire cells
# read from '../Resources/' — confirm which directory is intended.
ca_zip_df = pd.read_csv(filepath_or_buffer="resources/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
ca_zip_df.head(n=5)

In [None]:
# Show every column label of the housing frame as a plain Python list
# (labels only; no data types are shown here).
ca_zip_df.columns.tolist()

In [None]:
# Remove the unused metadata columns and every monthly date column that is not
# in 2019, leaving the remaining identifying columns plus the 2019 values.
#
# The date columns (labelled 'YYYY-MM-DD') are selected by pattern instead of
# being listed one by one, so the cell keeps working when the Zillow file
# covers a different range of months. The frame is reassigned rather than
# mutated with inplace=True, and missing labels are ignored, so re-running the
# cell does not raise KeyError for columns that are already gone.
KEEP_YEAR = '2019'
unused_metadata_cols = ['RegionID', 'SizeRank', 'RegionType', 'StateName']

column_labels = ca_zip_df.columns
is_date_col = column_labels.str.match(r'^\d{4}-\d{2}-\d{2}$')
is_kept_year = column_labels.str.startswith(KEEP_YEAR + '-')
excess_date_cols = column_labels[is_date_col & ~is_kept_year].tolist()

ca_zip_df = ca_zip_df.drop(
    columns=unused_metadata_cols + excess_date_cols,
    errors='ignore',
)
ca_zip_df.head()

In [None]:
# Drop all rows that are not CA (retain all CA rows).
# .copy() makes ca_zip_df_filter an independent frame: later cells modify it,
# and modifying a boolean-filtered selection of ca_zip_df can raise
# SettingWithCopyWarning.
ca_zip_df_filter = ca_zip_df[ca_zip_df['State'] == 'CA'].copy()
ca_zip_df_filter.head()

In [None]:
# list column names with data types for the CA-only housing frame
ca_zip_df_filter.dtypes

In [None]:
# Drop any rows containing null values. dropna() returns a new frame, so the
# result is assigned back; without the assignment ca_zip_df_filter (and the
# housing CSV exported from it) still contains the rows with nulls.
ca_zip_df_filter = ca_zip_df_filter.dropna()
ca_zip_df_filter.head()

In [None]:
# Change column labels: RegionName -> ZIPCODE.
# ca_zip_df_filter originates from a boolean filter of ca_zip_df, so the renamed
# frame is assigned back instead of using inplace=True on the filtered
# selection, which can raise SettingWithCopyWarning.
ca_zip_df_filter = ca_zip_df_filter.rename(columns={'RegionName': 'ZIPCODE'})
ca_zip_df_filter.head()

In [None]:
# count unique values in each column of the CA-only housing frame
ca_zip_df_filter.nunique()

In [None]:
# count unique values in each column of ca_zip_df (the frame before the CA filter)
ca_zip_df.nunique()

# All city data

In [None]:
# Import and read ca.csv (the all-city data) and preview its first 20 rows.
# NOTE(review): this path is under 'resources/', while the crime and fire cells
# read from '../Resources/' — confirm which directory is intended.
ca_allCity = pd.read_csv(filepath_or_buffer="resources/ca.csv", low_memory=False)
ca_allCity.head(n=20)

In [None]:
# Remove the DISTRICT, REGION, HASH and ID columns from ca_allCity.
ca_allCity.drop(columns=['DISTRICT', 'REGION', 'HASH', 'ID'], inplace=True)
ca_allCity.head(n=5)

In [None]:
# Lower-case the column labels; POSTCODE additionally becomes zipcode.
ca_allCity.rename(
    columns=dict(
        LON='lon',
        LAT='lat',
        NUMBER='number',
        STREET='street',
        UNIT='unit',
        CITY='city',
        POSTCODE='zipcode',
    ),
    inplace=True,
)
ca_allCity.head(n=5)

In [None]:
# count unique values in each column of ca_allCity
ca_allCity.nunique()

In [None]:
# list column names with data types for ca_allCity
ca_allCity.dtypes

In [None]:
# Reorder columns. .loc returns a new frame, so the selection is assigned back;
# without the assignment ca_allCity keeps its old column order when exported.
# Only the seven listed columns are kept.
ca_allCity = ca_allCity.loc[:, ['lat', 'lon', 'number', 'street', 'unit', 'city', 'zipcode']]
ca_allCity.head()

# export clean data files for SQL transfer

In [None]:
# Export allCity: write ca_allCity to CSV (to_csv includes the row index by default).
ca_allCity.to_csv(path_or_buf='resources/clean_allCity.csv')

In [None]:
# Export housing: write ca_zip_df_filter to CSV (to_csv includes the row index by default).
ca_zip_df_filter.to_csv(path_or_buf='resources/clean_ca_housing.csv')

In [None]:
# Export crime: write clean_ca_crime_df to CSV (to_csv includes the row index by default).
clean_ca_crime_df.to_csv(path_or_buf='resources/clean_ca_crime.csv')

In [11]:
# Export fire: write the 2019-only clean_ca_fire_df to CSV (to_csv includes the row index by default).
clean_ca_fire_df.to_csv(path_or_buf='../Resources/clean_fire.csv')