# Extract Task

In [1]:
import io
from pathlib import Path

import pandas as pd
import requests
# Raise pandas' display limits: up to 1000 rows and 100 columns are shown
# before the rendered output gets truncated.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)

### Download the Excel file directly from the website, keep it in memory via BytesIO, and then read it into a DataFrame:

In [2]:
# Location of the Excel workbook that this notebook extracts.
URL = 'http://www.tos.ohio.gov/Documents/Transparency/2017-2018-Teacher-Data-for-website.xlsx'

# Fail fast: the timeout keeps the cell from hanging forever on an unresponsive
# server, and raise_for_status() turns an HTTP error response into an exception
# instead of handing a non-Excel error page to the parser below.
resp = requests.get(URL, timeout=60)
resp.raise_for_status()

# Building the buffer directly from the payload leaves the stream positioned at
# byte 0, so no separate write()/seek(0) step is needed before pandas reads it.
file_obj = io.BytesIO(resp.content)

# read_excel() has no `index` keyword (that option belongs to the to_* writers),
# so it is not passed here; the frame keeps its default integer index.
df = pd.read_excel(file_obj)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(350283, 11)

In [4]:
# Preview the first rows to sanity-check the parsed columns and values.
df.head()

Unnamed: 0,LAST NAME,FIRST NAME,EDUCATION LEVEL,JOB DESCRIPTION,DISTRICT NAME,COUNTY,SCHOOL NAME,CITY NAME,DAYS WORKED,HOURS WORKED PER DAY,PAY AMOUNT
0,LONGDEN,CHERYL,Masters,Tutor/Small Group Instructor Assignment (Serve...,Ashtabula Area City,Ashtabula,Ashtabula Area City,Ashtabula,185,7.0,25900.0
1,Mulligan,Bettylene,Masters,Teacher Assignment,Winton Preparatory Academy,Hamilton,Winton Preparatory Academy,Cincinnati,86,8.0,34000.0
2,SMITH,SHERRY,Masters,Other Professional - Other Assignment,Greene County ESC,Greene,Greene County ESC,Yellow Springs,40,8.0,10397.0
3,BALOG,GEORGE,Masters,Teacher Assignment,Dayton City,Montgomery,Stivers School For The Arts,Dayton,200,4.35,40792.0
4,BAUER,RICHARD,Masters,Teacher Assignment,North Olmsted City,Cuyahoga,North Olmsted Middle School,North Olmsted,185,7.5,97931.0


In [5]:
# List the column labels as read from the spreadsheet.
df.columns

Index(['LAST NAME', 'FIRST NAME', 'EDUCATION LEVEL', 'JOB DESCRIPTION',
       'DISTRICT NAME', 'COUNTY', 'SCHOOL NAME', 'CITY NAME', 'DAYS WORKED',
       'HOURS WORKED PER DAY', 'PAY AMOUNT'],
      dtype='object')

### I don't like spaces in column names, so I will replace them with underscores:

In [6]:
# Normalize the headers: split each label on single spaces and glue the pieces
# back together with underscores, so every space becomes exactly one underscore.
df.columns = ['_'.join(label.split(' ')) for label in df.columns]

In [7]:
# Confirm that the reassigned labels no longer contain spaces.
df.columns

Index(['LAST_NAME', 'FIRST_NAME', 'EDUCATION_LEVEL', 'JOB_DESCRIPTION',
       'DISTRICT_NAME', 'COUNTY', 'SCHOOL_NAME', 'CITY_NAME', 'DAYS_WORKED',
       'HOURS_WORKED_PER_DAY', 'PAY_AMOUNT'],
      dtype='object')

### Let's see what job descriptions there are and get counts:

In [8]:
# Count the rows for each distinct job description (most frequent first).
df['JOB_DESCRIPTION'].value_counts()

Teacher Assignment                                                                              104185
Coaching Assignment                                                                              44104
Advisor Assignment                                                                               27616
Vehicle Operating (Bus) Assignment                                                               14061
Other Extra/Intra - Curricular Activities Assignment                                             13806
Food Service Assignment                                                                          13518
Teaching Aide Assignment                                                                         12948
Custodian Assignment                                                                             11872
Instructional Paraprofessional Assignment                                                        11497
Clerical Assignment                                                      

### Let's only do analysis on teachers ("Teacher Assignment"):

In [9]:
# Keep only the rows whose job description is exactly 'Teacher Assignment',
# selected with a boolean mask over the JOB_DESCRIPTION column.
teachers = df.loc[df['JOB_DESCRIPTION'] == 'Teacher Assignment']

### Save our filtered dataset as a CSV file:

In [10]:
# Resolve the destination under the current user's home directory instead of a
# hardcoded /home/<user> path, so the notebook also runs on other machines.
output_path = Path.home() / 'Downloads' / 'teachers.csv'

# Create the target folder if it does not exist yet; to_csv() will not do so.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False omits the row index from the file, leaving only the data columns.
teachers.to_csv(output_path, index=False)