In [105]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import os

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Benjamin: cumulative_enrollment, stats_sd

Andrew: pubschools_info

Alex: border_crossings

In [106]:
# Locations of the raw CSV files, all of which live in the Datasets/ folder.
border_crossings_path = os.path.join('Datasets', 'border_crossings.csv')
cumulative_enrollment_path = os.path.join('Datasets', 'cumulative_enrollment.csv')
pubschools_info_path = os.path.join('Datasets', 'pubschools_info.csv')
stats_sd_path = os.path.join('Datasets', 'COVID_19_Statistics_San_Diego_County.csv')

# Read every file into its own raw DataFrame.
# low_memory=False asks pandas not to parse the file in chunks, which avoids
# mixed-type inference within a column.
border_crossings = pd.read_csv(border_crossings_path, low_memory=False)
cumulative_enrollment = pd.read_csv(cumulative_enrollment_path, low_memory=False)
pubschools_info = pd.read_csv(pubschools_info_path, low_memory=False)
stats_sd = pd.read_csv(stats_sd_path, low_memory=False)

# TODO: the age-statistics dataset does not load yet; to be investigated later.
# NOTE(review): unlike the other datasets, the path below has no file
# extension -- possibly the cause; confirm the actual filename on disk.
#age_stats_sd_path = os.path.join('Datasets', 'COVID-19_Age_Statistics_by_ZIP_Code')
#age_stats_sd = pd.read_csv(age_stats_sd_path, low_memory=False)

### Checklist:
    1. Get dtypes in order
        convert dates to datetime objects
        natural numbers to int
        decimals to float
    2. Remove columns that are not relevant
    3. Remove the unneeded symbols from string data
    4. Replace null/blank/missing values with NaN
    5. Look for and understand suspicious or out of place values for each column
    6. Fill missing values or drop them as needed (for this one we can discuss among ourselves which method is best for handling
       missingness in each dataset, as there are many ways to do it).
                                        

# Cumulative Enrollment

In [120]:
# Derive the cleaned enrollment table from the raw frame (the raw frame itself
# is left untouched):
#   1. Drop the academic-year, code and reporting-category columns, which the
#      authors judged insignificant.
#   2. Parse CumulativeEnrollment as numbers. Entries that cannot be parsed are
#      coerced to NaN -- per the original note these are the "*" placeholders
#      used to withhold counts for student privacy -- so the column becomes float.
cumulative_clean = (
    cumulative_enrollment
    .drop(columns=["AcademicYear", "CountyCode", "DistrictCode", "SchoolCode", "ReportingCategory"])
    .assign(
        CumulativeEnrollment=lambda frame: pd.to_numeric(
            frame["CumulativeEnrollment"], errors='coerce'
        )
    )
)

In [121]:
# Display the cleaned table; the notebook renders a truncated preview (first and last rows).
cumulative_clean

Unnamed: 0,AggregateLevel,CountyName,DistrictName,SchoolName,Charter,CumulativeEnrollment
0,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,90.0
1,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,
2,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,
3,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,18.0
4,S,Orange,Saddleback Valley Unified,Portola Hills Elementary,All,110.0
...,...,...,...,...,...,...
355142,C,Imperial,,,No,38536.0
355143,C,San Mateo,,,No,88986.0
355144,C,Santa Barbara,,,No,66537.0
355145,C,Kern,,,No,183911.0


# COVID-19 Stats in San Diego 

In [135]:
# Parse the raw "date" column into pandas datetimes, replacing the column on
# stats_sd in place, then list every column's dtype to check the conversion.
stats_sd["date"] = stats_sd["date"].pipe(pd.to_datetime)
stats_sd.dtypes

X                                     float64
Y                                     float64
objectid                                int64
date                      datetime64[ns, UTC]
tests                                 float64
positives                               int64
hospitalized                          float64
icu                                   float64
deaths                                float64
newcases                                int64
age_9                                 float64
age10_19                              float64
age40_49                              float64
age50_59                              float64
age60_69                              float64
age70_79                              float64
age80_plus                            float64
ageunknow                             float64
age20_29                              float64
genderfemale                          float64
gendermale                            float64
gendeunk                          