In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import os

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

In [32]:
from sklearn import preprocessing

Benjamin: cumulative_enrollment, stats_sd

Andrew: pubschools_info

Alex: border_crossings

In [3]:
border_crossings_path = os.path.join('Datasets', 'border_crossings.csv')
border_crossings = pd.read_csv(border_crossings_path, low_memory=False)

cumulative_enrollment_path = os.path.join('Datasets', 'cumulative_enrollment.csv')
cumulative_enrollment = pd.read_csv(cumulative_enrollment_path, low_memory=False)

pubschools_info_path = os.path.join('Datasets', 'pubschools_info.csv')
pubschools_info = pd.read_csv(pubschools_info_path, low_memory=False)

stats_sd_path = os.path.join('Datasets', 'COVID_19_Statistics_San_Diego_County.csv')
stats_sd = pd.read_csv(stats_sd_path, low_memory=False)

#I'm having a bit of trouble loading this one in for some reason, I'll check it out later.
age_stats_sd_path = os.path.join('Datasets', 'COVID-19_Age_Statistics_by_ZIP_Code')
age_stats_sd = pd.read_csv(age_stats_sd_path, low_memory=False)

### Checklist:
    1. Get dtypes in order
        change dates to object datetime
        natural numbers to int
        decimals to float
    2. Remove columns that are not relevant
    3. Remove the unneeded symbols from string data
    4. Replace null/blank/missing values with NaN
    5. Look for and understand suspicious or out of place values for each column
    6. Fill missing values or drop them as needed (for this one we can discuss among us which is the best method to deal with 
       missingness for each dataset, as there are a lot of ways to do them.
                                        

### Cumulative Enrollment

In [5]:
#Removed insignificant columns
#Changed all * values (missing for student privacy) to NaN, rest of CumulativeEnrollment to float.
cumulative_clean = cumulative_enrollment.drop(["AcademicYear", "CountyCode", "DistrictCode", "SchoolCode", "ReportingCategory"], axis=1) #Dropped codes
cumulative_clean["CumulativeEnrollment"] = pd.to_numeric(cumulative_clean["CumulativeEnrollment"], errors='coerce')

In [10]:
cumulative_clean.head(3)

Unnamed: 0,AggregateLevel,CountyName,DistrictName,SchoolName,Charter,CumulativeEnrollment
0,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,90.0
1,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,
2,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,


### COVID-19 Stats in San Diego 

In [9]:
stats_sd["date"] = pd.to_datetime(stats_sd["date"])
stats_sd.head(3)

Unnamed: 0,X,Y,objectid,date,tests,positives,hospitalized,icu,deaths,newcases,...,age80_plus,ageunknow,age20_29,genderfemale,gendermale,gendeunk,age30_39,globalid,newtests,rolling_perc_pos_cases
0,-12994880.0,3899796.0,59,2020-03-11 08:00:00+00:00,123.0,5,,,,1,...,,,,,,,,{78698F35-A6E9-48CB-8C8D-67A70EC15C1D},,
1,-12994880.0,3899796.0,60,2020-03-12 08:00:00+00:00,147.0,10,,,,5,...,,,,,,,,{EFA2B0DB-A692-4632-8495-14B46E240096},,
2,-12994880.0,3899796.0,61,2020-03-13 08:00:00+00:00,273.0,19,,,,9,...,,,,,,,,{AA7DF18A-6303-4758-A5F1-9B9CD4388A92},52.0,


### Border Crossing

In [53]:
months = ["January", "February", "March", "April", "May", "June", "July"]
border = pd.read_csv("Datasets/border_crossings.csv")
#Dropped nans
border = border.dropna(axis=0)
for month in months:
    border[month] = border[month].str.replace(',', '').astype(int)
border.head(3)

Unnamed: 0,Port Name,Measure,January,February,March,April,May,June,July
0,Andrade,Pedestrians,113254,115655,75638,11190,23217,21886,19720
1,Andrade,Personal Vehicle Passengers,87090,82972,70204,29720,30962,29780,30439
2,Andrade,Personal Vehicles,46520,44277,39473,19102,18634,17926,18480


### Public Schools

In [54]:
schools = pd.read_csv('Datasets/pubschools_info.csv')
# header = schls_raw.iloc[0]
# schools = schls_raw[1:]
# schools.columns = header
# schools = schools.reset_index()
# schools = schools.drop('index', 1)

schools = schools.loc[schools['County'] == 'San Diego']

# border_counties = ['San Diego']
# border_schools = schools.loc[schools['County'].isin(border_counties)]
# border_schools
cols_to_drop = ["NCESDist","NCESSchool","StreetAbr","MailStreet","MailStrAbr","MailCity",
               "MailZip","MailState","Ext","FaxNumber","Email","Phone","WebSite","CharterNum",
                "FundingType","DOC","DOCType","EdOpsCode","EdOpsName","Magnet","FederalDFCDistrictID",
                "AdmFName","AdmLName","AdmEmail","YearRoundYN",'Charter','State']
schools = schools.drop(cols_to_drop, axis=1)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [55]:
clean1 = schools.loc[schools['StatusType'] != 'Closed'] #drops closed schools
clean2 = clean1.loc[clean1['School'] != 'No Data'] #drops district records - nonschools
clean3 = clean2.loc[clean2['GSoffered'] != 'No Data'] #drops GSoffered no data schools
clean3['LastUpDate'] = pd.to_datetime(clean3['LastUpDate']) #change lastUpDate to datetime obj col
clean3 = clean3.reset_index().drop('index',axis = 1)
schools = clean3

schools = schools.replace("No Data",np.nan) #changed "No Data" into NaNs
schools['Latitude'] = schools['Latitude'].astype(float) #changed latitutde to float dtype
schools['Longitude'] = schools['Longitude'].astype(float) #changed longitude to float dtype
schools = schools.drop(['StatusType','County'],axis = 1) #dropped statustype and county col since its all active and SD county
schools['ClosedDate'] = pd.to_datetime(schools['ClosedDate']) #changed ClosedDate to datetime obj col
schools['OpenDate'] = pd.to_datetime(schools['OpenDate']) # Changed OpenDate to datetime obj col
# schools['CDSCode'] = schools['CDSCode'].astype(float) #changed CDSCode col to int dtype

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean3['LastUpDate'] = pd.to_datetime(clean3['LastUpDate']) #change lastUpDate to datetime obj col


In [56]:
unique_zips = len(schools['Zip'].unique())
total_zips = len(schools['Zip'])
print("There are {} unique zips out of {}.".format(unique_zips,total_zips))

There are 707 unique zips out of 808.


In [57]:
schools

Unnamed: 0,CDSCode,District,School,Street,City,Zip,OpenDate,ClosedDate,SOC,SOCType,EILCode,EILName,GSoffered,GSserved,Virtual,Latitude,Longitude,LastUpDate
0,37103710108548,San Diego County Office of Education,Iftin Charter,5465 El Cajon Boulevard,San Diego,92115-3620,2006-09-05,NaT,60,Elementary Schools (Public),ELEM,Elementary,K-8,K-8,N,32.757908,-117.07814,2020-08-13
1,37103710115998,San Diego County Office of Education,San Pasqual Academy,17701 San Pasqual Valley Road,Escondido,92025-5301,2007-07-01,NaT,14,Juvenile Court Schools,HS,High School,9-12,9-12,C,33.090580,-116.95009,2019-02-13
2,37103710120485,San Diego County Office of Education,Davila Day,540 G Street,Chula Vista,91910-3604,2009-07-27,NaT,09,Special Education Schools (Public),ELEM,Elementary,P-8,K-6,N,32.635041,-117.08832,2019-02-13
3,37103710120493,San Diego County Office of Education,Monarch,1625 Newton Avenue,San Diego,92113-1012,2009-07-01,NaT,10,County Community,ELEMHIGH,Elementary-High Combination,K-12,K-12,C,32.702375,-117.15045,2020-08-24
4,37103710128520,San Diego County Office of Education,San Diego County Community,"6401 Linda Vista Road, Room 216",San Diego,92111-7399,2013-07-01,NaT,10,County Community,ELEMHIGH,Elementary-High Combination,K-12,6-12,C,32.769578,-117.17984,2020-08-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
803,37770990136077,SBE - Grossmont Secondary,Grossmont Secondary,111 Fletcher Parkway,El Cajon,92020-2510,2017-07-05,NaT,67,High Schools In 1 School Dist. (Public),HS,High School,7-12,7-12,N,32.806734,-116.96305,2020-08-13
804,37771070136473,SBE - Sweetwater Secondary,Sweetwater Secondary,3252 Bonita Road,Chula Vista,91910-3200,2017-07-31,NaT,67,High Schools In 1 School Dist. (Public),HS,High School,7-12,7-12,N,32.648779,-117.05338,2020-08-13
805,37771560137323,SBE - Vista Springs Charter,Vista Springs Charter,700 East Bobier Avenue,Vista,92084-3804,2018-07-01,NaT,65,K-12 Schools (Public),ELEMHIGH,Elementary-High Combination,K-12,K-12,N,33.220678,-117.23443,2019-03-27
806,37771640137356,SBE - College Preparatory Middle,College Preparatory Middle,10269 Madrid Way,Spring Valley,91977-1928,2018-09-04,NaT,62,Intermediate/Middle Schools (Public),INTMIDJR,Intermediate/Middle/Junior High,5-8,5-8,N,32.745829,-116.97130,2020-08-13


### Incomes

In [61]:
incomes_path = os.path.join('Datasets', 'suave_income.csv')
incomes = pd.read_csv(incomes_path, low_memory=False)

In [70]:
list(incomes.columns)

['Community',
 'ZIP',
 'geometry#hiddenmore',
 '#name',
 '#img',
 'Population 16+yo#number',
 'Population 16+yo in labor force#number',
 'Population 16+yo in civilian labor force#number',
 'Population 16+yo in civilian lf - employed#number',
 'Population 16+yo in civilian lf - unemployed#number',
 'Population 16+yo - in Armed Forces#number',
 'Population 16+yo not in labor force#number',
 'Females 16+yo#number',
 'Children of the hh under 6 yo#number',
 'Children of the hh 6-17 yo#number',
 'COMMUTING TO WORK among 16+yo#number',
 'Driving alone to work in Car, truck, or van#number',
 'Carpooling to work in Car, truck, or van#number',
 'Public transport to work#number',
 'Walked to work#number',
 'Other types of commuting to work#number',
 'Worked at home#number',
 'Mean travel to work (minutes)#number',
 'Management, business, science, and arts occupations#number',
 'Service occupations#number',
 'Sales and office occupations#number',
 'Natural resources, construction, and maintenance

In [107]:
incomes_classes = incomes[["ZIP", 
         "Hholds with income Less than $10,000#number", 
         'Hholds with income $10,000 to $14,999#number',
         'Hholds with income $15,000 to $24,999#number',
         'Hholds with income $25,000 to $34,999#number',
         'Hholds with income $35,000 to $49,999#number',
         'Hholds with income $50,000 to $74,999#number',
         'Hholds with income $75,000 to $99,999#number',
         'Hholds with income $100,000 to $149,999#number',
         'Hholds with income $150,000 to $199,999#number',
         'Hholds with income $200,000 or more#number',
         'Median family income (dollars)#number',
         'Mean family income (dollars)#number',
         ]]
incomes_clean = incomes_classes.copy()
incomes_clean["Households with income $0-34,999"] = (incomes_classes["Hholds with income Less than $10,000#number"] 
                                               + incomes_classes["Hholds with income $10,000 to $14,999#number"] 
                                               + incomes_classes["Hholds with income $15,000 to $24,999#number"] 
                                               + incomes_classes["Hholds with income $25,000 to $34,999#number"]
                                              )
incomes_clean["Households with income $35,000-99,999"] = (incomes_classes["Hholds with income $35,000 to $49,999#number"] 
                                               + incomes_classes["Hholds with income $50,000 to $74,999#number"] 
                                               + incomes_classes["Hholds with income $75,000 to $99,999#number"] 
                                              )
incomes_clean["Households with income $100,000+"] = (incomes_classes["Hholds with income $100,000 to $149,999#number"] 
                                               + incomes_classes["Hholds with income $150,000 to $199,999#number"]
                                               + incomes_classes["Hholds with income $200,000 or more#number"] 
                                              ) 
incomes_clean = incomes_clean.drop(["Hholds with income Less than $10,000#number", 
         'Hholds with income $10,000 to $14,999#number',
         'Hholds with income $15,000 to $24,999#number',
         'Hholds with income $25,000 to $34,999#number',
         'Hholds with income $35,000 to $49,999#number',
         'Hholds with income $50,000 to $74,999#number',
         'Hholds with income $75,000 to $99,999#number',
         'Hholds with income $100,000 to $149,999#number',
         'Hholds with income $150,000 to $199,999#number',
         'Hholds with income $200,000 or more#number',], axis=1)
incomes_clean.head(3)

Unnamed: 0,ZIP,Median family income (dollars)#number,Mean family income (dollars)#number,"Households with income $0-34,999","Households with income $35,000-99,999","Households with income $100,000+"
0,92536,76818.0,79336.0,353.0,425.0,283.0
1,91901,103026.0,116390.0,1241.0,2264.0,2915.0
2,91902,104025.0,121292.0,835.0,2416.0,2827.0


In [80]:
health_insurance = incomes[['ZIP', 
                            'Health insurance coverage, % of civilian pop#number',
         'No health insurance for civil pop#number',
         'No health insurance coverage, %#number',]]
health_insurance.head(3)

Unnamed: 0,ZIP,"Health insurance coverage, % of civilian pop#number",No health insurance for civil pop#number,"No health insurance coverage, %#number"
0,92536,90.3,292.0,9.7
1,91901,93.0,1265.0,7.0
2,91902,88.6,2235.0,11.4


In [82]:
worked_at_home = incomes[['ZIP', 'Worked at home#number']]
worked_at_home.head(3)

Unnamed: 0,ZIP,Worked at home#number
0,92536,43.0
1,91901,788.0
2,91902,523.0


# Data Exploration

In [41]:
cum_en = cumulative_clean["CumulativeEnrollment"]