In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Objective & Summary  
What is NCES?  
  
_The National Center for Education Statistics (NCES) is the part of the United States Department of Education's Institute of Education Sciences (IES) that collects, analyzes, and publishes statistics on education and public school district finance information in the United States_   
Source: [Wikipedia](https://en.wikipedia.org/wiki/National_Center_for_Education_Statistics)

We will explore the following points in this dataset:  
1. _TODO: list the questions this notebook sets out to answer._

## Operations on Data

In [None]:
# Load the public and private school tables from the Kaggle dataset.
PUBLIC_SCHOOLS_CSV = '/kaggle/input/us-schools-dataset/Public_Schools.csv'
PRIVATE_SCHOOLS_CSV = '/kaggle/input/us-schools-dataset/Private_Schools.csv'

pub_df = pd.read_csv(PUBLIC_SCHOOLS_CSV)
pvt_df = pd.read_csv(PRIVATE_SCHOOLS_CSV)

Removed columns `X` and `Y` because:
1. They are not part of the original data, as per this [link](https://hifld-geoplatform.opendata.arcgis.com/datasets/public-schools/data).  
2. It is not clear what these columns describe.

In [None]:
# Drop the 'X' and 'Y' columns from both tables.
# The result is rebound (no inplace=True) and errors='ignore' is passed so the
# cell is idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
pub_df = pub_df.drop(columns=['X', 'Y'], errors='ignore')
pvt_df = pvt_df.drop(columns=['X', 'Y'], errors='ignore')

2. Set column `OBJECTID` as the index for both tables.  

In [None]:
# 'OBJECTID' was observed to contain unique, sequential numbers (as many as
# there are records), so it is used as the index of both tables.
#
# The membership check keeps the cell safe to re-run: once 'OBJECTID' has
# become the index it is no longer a column, and calling set_index again
# would raise KeyError.
if 'OBJECTID' in pub_df.columns:
    pub_df = pub_df.set_index(['OBJECTID'])
if 'OBJECTID' in pvt_df.columns:
    pvt_df = pvt_df.set_index(['OBJECTID'])

3. Remove `NAICS_CODE` and `NAICS_DESC` from both datasets.  
Each of these columns holds a single constant value, so they can be removed.

In [None]:
# Every record carries the same description (`ELEMENTARY AND SECONDARY SCHOOLS`);
# print the value counts of both NAICS columns for both tables to confirm it.
for naics_column in ('NAICS_CODE', 'NAICS_DESC'):
    for schools in (pub_df, pvt_df):
        print(schools[naics_column].value_counts())

In [None]:
# Drop the NAICS columns from both tables.
# The result is rebound (no inplace=True) and errors='ignore' is passed so the
# cell is idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
pub_df = pub_df.drop(columns=['NAICS_CODE', 'NAICS_DESC'], errors='ignore')
pvt_df = pvt_df.drop(columns=['NAICS_CODE', 'NAICS_DESC'], errors='ignore')

---
## Knowledge on some columns
1. `ZIP` and `ZIP4` code  
The USA uses 5-digit ZIP codes. The additional 4 digits (ZIP+4) pinpoint a smaller delivery segment within the 5-digit area, which allows faster and more accurate mail delivery. For more info - [Click Here](https://smartystreets.com/articles/zip-4-code)  
So the `ZIP4` field adds a finer level of geographic granularity.
2. `TYPE` column  
Represents the type of the school.
3. `STATUS` column  
Represents the current operational status of the school.

## Getting information on `TYPE` column  
Using the source link provided in the dataset, we can find out what the `TYPE` column represents. Searching for a school with the site's search option also confirms that there are 4 different types of schools.

In [None]:
# Number of public schools recorded under each TYPE code.
pub_df.TYPE.value_counts()

In [None]:
# Print one example school for each public-school TYPE code.
groups = pub_df.groupby('TYPE')
preview_fields = ['NCESID', 'NAME', 'SOURCE']

# Do not truncate long values in the printed columns.
pd.set_option('display.max_colwidth', None)

for type_code, type_rows in groups:
    first_row = type_rows.iloc[0]
    print(type_code, first_row[preview_fields], sep='\n')

### Public School `TYPE` labels  
| Label | Definition |
| :-: | :- |
| 1 | Regular School |
| 2 | Special education school |
| 3 | Vocational school |
| 4 | Other/alternative school |


In [None]:
# Same check for private schools: number of schools under each TYPE code.
pvt_df.TYPE.value_counts()

In [None]:
# Print one example school for each private-school TYPE code.
pvt_groups = pvt_df.groupby('TYPE')
pvt_preview_fields = ['NCESID', 'NAME', 'SOURCE']

for type_code, type_rows in pvt_groups:
    first_row = type_rows.iloc[0]
    print(type_code, first_row[pvt_preview_fields], sep='\n')

We can get more info on this column by following the links in other records. However, I am not using this data.

## Information on `STATUS` column  
Following the source link provided in the dataset, we can find out what the `STATUS` column represents. We cannot rely on this information for the dataset, as it may have been updated on the source website since the data was collected.

In [None]:
# Number of public schools recorded under each STATUS value.
pub_df.STATUS.value_counts()

___

In [None]:
# Show the column labels of the public-school table.
pub_df.columns

In [None]:
# Peek at 10 random ZIP / ZIP4 pairs from the public-school table.
# A fixed random_state makes the displayed sample reproducible across
# "Restart & Run All" instead of changing on every execution.
pub_df[['ZIP', 'ZIP4']].sample(10, random_state=42)

In [None]:
# Number of distinct NCESID values in the (public, private) tables.
tuple(schools['NCESID'].nunique() for schools in (pub_df, pvt_df))