# Transform Task

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scrapbook as sb
import seaborn as sns
# Notebook-wide display settings: let large tables render without truncation
# and use a 9 x 6 default figure size.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)
plt.rcParams.update({'figure.figsize': [9, 6]})

### Read in data set limited to just teachers:

In [None]:
# Load the teacher records from a local CSV file.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
teachers_csv_path = '/home/pybokeh/Downloads/teachers.csv'
teachers = pd.read_csv(teachers_csv_path)

### It looks like teachers, for the most part, work about 6 months out of the year:

In [None]:
# Frequency of each distinct DAYS_WORKED value.
teachers['DAYS_WORKED'].value_counts()

In [None]:
# Histogram of DAYS_WORKED, drawn on an explicit Axes so the figure carries
# its own title and axis labels and can be read without the surrounding cells.
fig, ax = plt.subplots()
teachers['DAYS_WORKED'].hist(ax=ax)
ax.set(title='Distribution of days worked', xlabel='DAYS_WORKED', ylabel='Count')
sns.despine(ax=ax);

In [None]:
# Density-normalised histogram of DAYS_WORKED with a KDE overlay.
# `sns.distplot` has been deprecated since seaborn 0.11; `histplot` with
# stat='density' and kde=True is its documented replacement.
fig, ax = plt.subplots()
sns.histplot(data=teachers, x='DAYS_WORKED', stat='density', kde=True, ax=ax)
ax.set_title('Density of days worked')
sns.despine(ax=ax)

### I'm also curious about hours worked per day:

In [None]:
# Frequency of each distinct HOURS_WORKED_PER_DAY value.
teachers['HOURS_WORKED_PER_DAY'].value_counts()

In [None]:
# Histogram of HOURS_WORKED_PER_DAY, drawn on an explicit Axes so the figure
# carries its own title and axis labels.
fig, ax = plt.subplots()
teachers['HOURS_WORKED_PER_DAY'].hist(ax=ax)
ax.set(title='Distribution of hours worked per day',
       xlabel='HOURS_WORKED_PER_DAY', ylabel='Count')
sns.despine(ax=ax);

In [None]:
# Density-normalised histogram of HOURS_WORKED_PER_DAY with a KDE overlay.
# `sns.distplot` has been deprecated since seaborn 0.11; `histplot` with
# stat='density' and kde=True is its documented replacement.
fig, ax = plt.subplots()
sns.histplot(data=teachers, x='HOURS_WORKED_PER_DAY', stat='density', kde=True, ax=ax)
ax.set_title('Density of hours worked per day')
sns.despine(ax=ax)

#### So it appears most teachers work at least 7 hours per day

#### Based on these findings, we should probably limit our data set to "full-time" teachers.  We will define full-time to mean having worked at least 180 days out of the year and at least 7 hours per day:

### Limit to just full-time teachers:

In [None]:
# Keep only rows that meet both full-time thresholds:
# at least 7 hours per day and at least 180 days worked.
full_time = teachers.loc[
    (teachers['HOURS_WORKED_PER_DAY'] >= 7) & (teachers['DAYS_WORKED'] >= 180)
]

### Let's calculate hourly rate:

In [None]:
# Derive total hours (days x hours per day), then the hourly rate
# (pay / total hours). The second assign reads the column the first one adds.
full_time = (
    full_time
    .assign(TOTAL_HOURS_WORKED=lambda df: df['DAYS_WORKED'] * df['HOURS_WORKED_PER_DAY'])
    .assign(HRLY_RATE=lambda df: df['PAY_AMOUNT'] / df['TOTAL_HOURS_WORKED'])
)

In [None]:
# (rows, columns) of the full-time subset.
full_time.shape

In [None]:
# Preview the first rows, including the derived TOTAL_HOURS_WORKED and HRLY_RATE columns.
full_time.head()

In [None]:
# Summary of the columns in the full-time subset.
full_time.info()

### Let's now make a pivot table to obtain the median pay and median hourly rate per district:

In [None]:
# Median PAY_AMOUNT and HRLY_RATE for each district.
# aggfunc is passed as a list on purpose: the cells below rely on the
# resulting two-level column index (they rename level 1 and drop level 0).
teachers_by_district = pd.pivot_table(
    full_time,
    index=['DISTRICT_NAME'],
    values=['PAY_AMOUNT', 'HRLY_RATE'],
    aggfunc=['median'],
)

In [None]:
# Display the per-district median table.
teachers_by_district

### Let's rename the value columns and remove the unnecessary top level of the column index:

In [None]:
# Inspect the column index; the next cells rename its level 1 and drop its level 0.
teachers_by_district.columns

In [None]:
# Relabel the level-1 column names. The new labels are positional: they replace
# the existing level-1 values in their stored order.
# NOTE(review): assumes that order is HRLY_RATE, PAY_AMOUNT - confirm against
# the `.columns` output in the previous cell.
# The returned index is assigned back instead of passing `inplace=True`,
# which is deprecated and no longer accepted by current pandas releases.
teachers_by_district.columns = teachers_by_district.columns.set_levels(
    ['MEDIAN_HRLY_RATE', 'MEDIAN_PAY'], level=1
)

In [None]:
# Check the table after relabelling the level-1 column names.
teachers_by_district

In [None]:
# Flatten the two-level column index by keeping only its level-1 labels
# (i.e. discard level 0).
teachers_by_district.columns = teachers_by_district.columns.get_level_values(1)

In [None]:
# Check the table after dropping the outer column level.
teachers_by_district

### Let's add district name and county to our data set:

In [None]:
# One row per distinct (DISTRICT_NAME, COUNTY) pair found in the full-time subset.
district_info = (
    full_time
    .loc[:, ['DISTRICT_NAME', 'COUNTY']]
    .drop_duplicates()
)

In [None]:
# Preview the district/county lookup table.
district_info.head()

In [None]:
# Left-join the district/county lookup onto the medians: the pivot table's
# index is matched against district_info's DISTRICT_NAME column.
teachers_by_district = teachers_by_district.merge(
    district_info,
    how='left',
    left_index=True,
    right_on=['DISTRICT_NAME'],
)

In [None]:
# Preview the medians after joining in the district/county lookup.
teachers_by_district.head()

### Finally, let's add a `COUNT` column:

In [None]:
# Count the PAY_AMOUNT entries per district in the full-time subset.
counts_by_district = pd.pivot_table(
    full_time,
    index='DISTRICT_NAME',
    values=['PAY_AMOUNT'],
    aggfunc='count',
)

In [None]:
# The counted PAY_AMOUNT column is really a per-district row count, so name it COUNT.
counts_by_district = counts_by_district.rename(columns={'PAY_AMOUNT': 'COUNT'})
counts_by_district

In [None]:
# Left-join the per-district counts: the DISTRICT_NAME column is matched
# against the index of counts_by_district.
teachers_by_district = teachers_by_district.merge(
    counts_by_district,
    how='left',
    left_on=['DISTRICT_NAME'],
    right_index=True,
)

In [None]:
# Display the district table after the COUNT column has been merged in.
teachers_by_district

### Let's view our final data, sorted by median pay in descending order:

In [None]:
# Districts ordered from highest to lowest median pay (display only).
teachers_by_district.sort_values('MEDIAN_PAY', ascending=False)

### and view our data, sorted by median hourly rate in descending order:

In [None]:
# Districts ordered from highest to lowest median hourly rate (display only).
teachers_by_district.sort_values('MEDIAN_HRLY_RATE', ascending=False)

In [None]:
# Keep the pay-ordered table (highest median pay first) for the report cells below.
top_median_salary = teachers_by_district.sort_values(
    'MEDIAN_PAY',
    ascending=False,
)

In [None]:
# Keep the rate-ordered table (highest median hourly rate first) for the cells below.
top_median_hrly_rate = teachers_by_district.sort_values(
    'MEDIAN_HRLY_RATE',
    ascending=False,
)

In [None]:
# First 20 rows of the pay-ordered table.
top_median_salary.head(20)

### Let's reorder the columns and relabel the index to run from 1 to n:

In [None]:
# Put the columns in report order, then label the rows 1..n so the index
# doubles as the rank by median pay.
top_median_salary = top_median_salary.loc[
    :, ['DISTRICT_NAME', 'COUNTY', 'MEDIAN_HRLY_RATE', 'MEDIAN_PAY', 'COUNT']
]
top_median_salary.index = list(range(1, len(top_median_salary) + 1))

In [None]:
# First 20 rows of the pay-ordered table.
top_median_salary.head(20)

In [None]:
# Put the columns in report order, then label the rows 1..n so the index
# doubles as the rank by median hourly rate.
top_median_hrly_rate = top_median_hrly_rate.loc[
    :, ['DISTRICT_NAME', 'COUNTY', 'MEDIAN_HRLY_RATE', 'MEDIAN_PAY', 'COUNT']
]
top_median_hrly_rate.index = list(range(1, len(top_median_hrly_rate) + 1))

In [None]:
# First 20 rows of the rate-ordered table.
top_median_hrly_rate.head(20)

In [None]:
# Take the first 20 rows of the pay-ordered table and number them from 1.
top_20_median_salary = top_median_salary.iloc[:20].reset_index(drop=True)
top_20_median_salary.index = top_20_median_salary.index + 1

In [None]:
# Display the 20-row table that is rendered to HTML and glued below.
top_20_median_salary

### Persist the HTML source of the top-20 median salary report by "gluing" it to this notebook:

In [None]:
# Render the top-20 table to an HTML string and attach it to the notebook
# under the scrap name "HTML_Report".
report_html = top_20_median_salary.to_html()
sb.glue("HTML_Report", report_html)