In [105]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import os

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Benjamin: cumulative_enrollment, stats_sd

Andrew: pubschools_info

Alex: border_crossings

In [106]:
def _load_dataset(filename):
    """Return ``(path, frame)`` for one CSV file stored in the 'Datasets' folder."""
    csv_path = os.path.join('Datasets', filename)
    return csv_path, pd.read_csv(csv_path, low_memory=False)


# Each call binds the file's path and its parsed DataFrame.
border_crossings_path, border_crossings = _load_dataset('border_crossings.csv')
cumulative_enrollment_path, cumulative_enrollment = _load_dataset('cumulative_enrollment.csv')
pubschools_info_path, pubschools_info = _load_dataset('pubschools_info.csv')
stats_sd_path, stats_sd = _load_dataset('COVID_19_Statistics_San_Diego_County.csv')

# TODO: the age-statistics file still fails to load; to be investigated later.
# NOTE(review): unlike the files above, this path has no '.csv' extension -- possibly the cause; confirm.
#age_stats_sd_path = os.path.join('Datasets', 'COVID-19_Age_Statistics_by_ZIP_Code')
#age_stats_sd = pd.read_csv(age_stats_sd_path, low_memory=False)

### Checklist:
    1. Get dtypes in order
        convert dates to datetime objects
        natural numbers to int
        decimals to float
    2. Remove columns that are not relevant
    3. Remove the unneeded symbols from string data
    4. Replace null/blank/missing values with NaN
    5. Look for and understand suspicious or out of place values for each column
    6. Fill missing values or drop them as needed (we can discuss among ourselves which method is best for handling
       missingness in each dataset, as there are many ways to do it).
                                        

# Cumulative Enrollment

In [120]:
# Drop the year / code / reporting-category columns considered insignificant.
# "*" entries (withheld for student privacy) are not numeric, so errors='coerce'
# turns them into NaN and the rest of CumulativeEnrollment becomes float.
cumulative_clean = (
    cumulative_enrollment
    .drop(columns=["AcademicYear", "CountyCode", "DistrictCode", "SchoolCode", "ReportingCategory"])
    .assign(
        CumulativeEnrollment=lambda frame: pd.to_numeric(frame["CumulativeEnrollment"], errors='coerce')
    )
)

In [121]:
# Show the cleaned enrollment frame (the rendering is truncated to the first and last rows).
cumulative_clean

Unnamed: 0,AggregateLevel,CountyName,DistrictName,SchoolName,Charter,CumulativeEnrollment
0,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,90.0
1,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,
2,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,
3,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,18.0
4,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,110.0
...,...,...,...,...,...,...
355142,C,Imperial,,,No,38536.0
355143,C,San Mateo,,,No,88986.0
355144,C,Santa Barbara,,,No,66537.0
355145,C,Kern,,,No,183911.0


# COVID-19 Stats in San Diego 

In [136]:
# Parse the raw "date" column into pandas datetimes, overwriting the column on stats_sd.
stats_sd["date"] = stats_sd["date"].pipe(pd.to_datetime)
stats_sd

Unnamed: 0,X,Y,objectid,date,tests,positives,hospitalized,icu,deaths,newcases,...,age80_plus,ageunknow,age20_29,genderfemale,gendermale,gendeunk,age30_39,globalid,newtests,rolling_perc_pos_cases
0,-1.299488e+07,3.899796e+06,59,2020-03-11 08:00:00+00:00,123.0,5,,,,1,...,,,,,,,,{78698F35-A6E9-48CB-8C8D-67A70EC15C1D},,
1,-1.299488e+07,3.899796e+06,60,2020-03-12 08:00:00+00:00,147.0,10,,,,5,...,,,,,,,,{EFA2B0DB-A692-4632-8495-14B46E240096},,
2,-1.299488e+07,3.899796e+06,61,2020-03-13 08:00:00+00:00,273.0,19,,,,9,...,,,,,,,,{AA7DF18A-6303-4758-A5F1-9B9CD4388A92},52.0,
3,-1.299488e+07,3.899796e+06,62,2020-03-14 08:00:00+00:00,288.0,25,12.0,,,6,...,,,,,,,,{9A014068-714D-4FD4-B011-A16A1A49C07C},14.0,
4,-1.299488e+07,3.899796e+06,63,2020-03-15 08:00:00+00:00,313.0,37,10.0,,,12,...,,,,,,,,{DA54D0B2-3C90-4FF2-8090-BE3BDB8C1B4E},25.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,-1.299488e+07,3.899796e+06,245,2020-09-06 08:00:00+00:00,880321.0,40866,3225.0,777.0,708.0,216,...,1330.0,33.0,10364.0,20965.0,19788.0,113.0,7569.0,{EB1D16C5-D8A2-483F-AAE1-4B7518731E5C},6788.0,4.312512
185,-1.299488e+07,3.899796e+06,246,2020-09-08 08:00:00+00:00,880321.0,41324,3237.0,773.0,721.0,247,...,1343.0,34.0,10493.0,21179.0,20035.0,110.0,7630.0,{407E67B6-999D-4CF4-9254-EAC378C41528},6788.0,4.512304
186,-1.299488e+07,3.899796e+06,247,2020-09-09 08:00:00+00:00,897915.0,41608,3253.0,775.0,725.0,284,...,1351.0,38.0,10542.0,21325.0,20173.0,110.0,7659.0,{4C5A4AAD-D2E0-4ACF-B538-F3917A43155D},8311.0,4.372067
187,-1.299488e+07,3.899796e+06,248,2020-09-10 08:00:00+00:00,907123.0,41969,3266.0,775.0,730.0,361,...,1355.0,35.0,10637.0,21489.0,20368.0,112.0,7721.0,{87BFD444-16C0-4465-975D-46027BDCBB3A},9208.0,4.434672
