In [1]:
# IFrame embeds a live web page inside the notebook output.
from IPython.display import IFrame  
# Wikipedia page listing the freedom indices; reused below by pd.read_html.
wikiLink="https://en.wikipedia.org/wiki/List_of_freedom_indices" 
# Last expression of the cell, so the frame is rendered as the cell output.
IFrame(wikiLink, width=900, height=500)

In [2]:
import pandas as pd

# Scrape the page: read_html returns a list with one DataFrame per matching <table>.
wikiTables = pd.read_html(
    wikiLink,
    header=0,                                # first table row holds the column names
    flavor='bs4',                            # parse the HTML with BeautifulSoup
    attrs={'class': 'wikitable sortable'},   # only tables carrying this class attribute
)

In [3]:
# What do I have? / How many?
type(wikiTables), len(wikiTables)

(list, 1)

In [4]:
# Take the only table found. No copy is made here, so DF and wikiTables[0]
# are the same object and in-place changes to DF also affect the list element.
DF=wikiTables[0]

# Confirm that the list element is a pandas DataFrame.
type(DF)

pandas.core.frame.DataFrame

In [5]:
DF.head()

Unnamed: 0,Country,Freedom in the World 2019[10],2019 Index of Economic Freedom[11],2019 Press Freedom Index[3],2018 Democracy Index[14]
0,Afghanistan,not free,mostly unfree,difficult situation,authoritarian regime
1,Albania,partly free,moderately free,noticeable problems,hybrid regime
2,Algeria,not free,repressed,difficult situation,authoritarian regime
3,Andorra,free,,satisfactory situation,
4,Angola,not free,mostly unfree,noticeable problems,authoritarian regime


In [6]:
# Independent copy for experimenting: relabeling DFTest leaves DF untouched.
DFTest=DF.copy()

In [7]:
# Replace all column labels in one assignment.
# The list needs one name per column, in the current column order.
DFTest.columns = [
    'Country',
    'FreedomintheWorld',
    'IndexofEconomicFreedom',
    'PressFreedomIndex',
    'DemocracyIndex',
]
# Check the relabeled frame:
DFTest.head()

Unnamed: 0,Country,FreedomintheWorld,IndexofEconomicFreedom,PressFreedomIndex,DemocracyIndex
0,Afghanistan,not free,mostly unfree,difficult situation,authoritarian regime
1,Albania,partly free,moderately free,noticeable problems,hybrid regime
2,Algeria,not free,repressed,difficult situation,authoritarian regime
3,Andorra,free,,satisfactory situation,
4,Angola,not free,mostly unfree,noticeable problems,authoritarian regime


In [7]:
DF.columns

Index(['Country', 'Freedom in the World 2019[10]',
       '2019 Index of Economic Freedom[11]', '2019 Press Freedom Index[3]',
       '2018 Democracy Index[14]'],
      dtype='object')

In [8]:
import re

In [9]:
# Toy string with single blanks, a run of blanks, digits and a bracketed
# footnote, used to try out the regular expressions below.
textExample="text1 text2   text3[2]"

In [10]:
re.sub('\\s',"",textExample)

'text1text2text3[2]'

In [11]:
re.sub('\\d',"",textExample)

'text text   text[]'

In [12]:
re.sub('\\[',"",textExample)

'text1 text2   text32]'

In [13]:
# Building blocks of the regular expression:
#   \\s+  a run of whitespace
#   \\d+  a run of digits
#   \\[   a literal opening bracket
#   \\]   a literal closing bracket
# '|' means "or", so the alternatives can be chained into a single pattern.
pattern='\\s+|\\d+|\\[|\\]'
nothing=''

# Delete every match from the sample string:
re.compile(pattern).sub(nothing,textExample)

'texttexttext'

In [14]:
# re.sub() works on one string at a time, so handing it the whole Index of
# column labels raises TypeError. The error is caught and shown, so the
# failure is still demonstrated without stopping a full run of the notebook.
try:
    re.sub(pattern,nothing,DF.columns)
except TypeError as error:
    print("TypeError:",error)

TypeError: expected string or bytes-like object

In [15]:
# .str.replace() applies the substitution to every label of the Index.
# regex=True is required: since pandas 2.0 the default is a literal match,
# which would leave these labels unchanged.
DF.columns.str.replace(pattern,nothing,regex=True)

Index(['Country', 'FreedomintheWorld', 'IndexofEconomicFreedom',
       'PressFreedomIndex', 'DemocracyIndex'],
      dtype='object')

In [16]:
# Same cleaning, done label by label with an explicit loop.
newNames1=[]
for columnLabel in DF.columns:
    cleanedLabel=re.sub(pattern,nothing,columnLabel)
    newNames1.append(cleanedLabel)

In [17]:
#Take a look:
newNames1

['Country',
 'FreedomintheWorld',
 'IndexofEconomicFreedom',
 'PressFreedomIndex',
 'DemocracyIndex']

In [18]:
# A lambda is a one-expression function: lambda ARGUMENT: EXPRESSION.
# Binding it to the name 'cleaner' lets it be called like any other function.
# 'pattern' and 'nothing' are not parameters: they are looked up each time
# cleaner is called, so reassigning them later changes what cleaner removes.

cleaner = lambda INPUT: re.sub(pattern,nothing,INPUT)

In [19]:
# map() pairs cleaner with each label lazily; unpacking into a list runs it.

NewNames2=[*map(cleaner,DF.columns)]
NewNames2

['Country',
 'FreedomintheWorld',
 'IndexofEconomicFreedom',
 'PressFreedomIndex',
 'DemocracyIndex']

In [20]:
# List comprehension: call cleaner on each label and collect the results.
NewNames3=[cleaner(TEXT) for TEXT in DF.columns]

# Display the new list:
NewNames3

['Country',
 'FreedomintheWorld',
 'IndexofEconomicFreedom',
 'PressFreedomIndex',
 'DemocracyIndex']

In [21]:
# Same comprehension without the helper: re.sub is called directly on each label.
NewNames4=[re.sub(pattern,nothing,TEXT) for TEXT in DF.columns]

# Display the new list:
NewNames4

['Country',
 'FreedomintheWorld',
 'IndexofEconomicFreedom',
 'PressFreedomIndex',
 'DemocracyIndex']

In [22]:
DF.iloc[:,1::].columns

Index(['Freedom in the World 2019[10]', '2019 Index of Economic Freedom[11]',
       '2019 Press Freedom Index[3]', '2018 Democracy Index[14]'],
      dtype='object')

In [23]:
# Clean every label except the first one ('Country').
# regex=True keeps the pattern a regular expression on pandas >= 2.0,
# where the default became a literal match.
DF.iloc[:,1::].columns.str.replace(pattern,nothing,regex=True)

Index(['FreedomintheWorld', 'IndexofEconomicFreedom', 'PressFreedomIndex',
       'DemocracyIndex'],
      dtype='object')

In [24]:
# Keep the cleaned labels (all columns but the first).
# regex=True is needed because pandas >= 2.0 treats the pattern literally by default.
someNewNames=DF.iloc[:,1::].columns.str.replace(pattern,nothing,regex=True)
# Display them:
someNewNames

Index(['FreedomintheWorld', 'IndexofEconomicFreedom', 'PressFreedomIndex',
       'DemocracyIndex'],
      dtype='object')

In [25]:
# using 'zip()':
list(zip(DF.iloc[:,1::].columns,someNewNames))

[('Freedom in the World 2019[10]', 'FreedomintheWorld'),
 ('2019 Index of Economic Freedom[11]', 'IndexofEconomicFreedom'),
 ('2019 Press Freedom Index[3]', 'PressFreedomIndex'),
 ('2018 Democracy Index[14]', 'DemocracyIndex')]

In [26]:
# Pair each original label with its cleaned version: {old label: new label}.
changes=dict(zip(DF.iloc[:,1::].columns,someNewNames))
# Display the mapping:
changes

{'Freedom in the World 2019[10]': 'FreedomintheWorld',
 '2019 Index of Economic Freedom[11]': 'IndexofEconomicFreedom',
 '2019 Press Freedom Index[3]': 'PressFreedomIndex',
 '2018 Democracy Index[14]': 'DemocracyIndex'}

In [27]:
# Apply the {old: new} mapping to the column labels. rename() returns a new
# frame; binding it to DF avoids mutating in place the object that
# wikiTables[0] also refers to.
DF=DF.rename(columns=changes)

In [28]:
DF.head()

Unnamed: 0,Country,FreedomintheWorld,IndexofEconomicFreedom,PressFreedomIndex,DemocracyIndex
0,Afghanistan,not free,mostly unfree,difficult situation,authoritarian regime
1,Albania,partly free,moderately free,noticeable problems,hybrid regime
2,Algeria,not free,repressed,difficult situation,authoritarian regime
3,Andorra,free,,satisfactory situation,
4,Angola,not free,mostly unfree,noticeable problems,authoritarian regime


In [29]:
# Iterating a dictionary yields its keys: the original (uncleaned) labels.
bad=list(changes)
# Display them:
bad

['Freedom in the World 2019[10]',
 '2019 Index of Economic Freedom[11]',
 '2019 Press Freedom Index[3]',
 '2018 Democracy Index[14]']

In [32]:
# Break each label into the pieces found before and after '['.
notSoBad=[label.split("[") for label in bad]
# Display the pieces:
notSoBad

[['Freedom in the World 2019', '10]'],
 ['2019 Index of Economic Freedom', '11]'],
 ['2019 Press Freedom Index', '3]'],
 ['2018 Democracy Index', '14]']]

In [33]:
# Cut each label at the first '[' and keep only what comes before it.
notSoBad=[label.split("[",1)[0] for label in bad]
# Display the shortened labels:
notSoBad

['Freedom in the World 2019',
 '2019 Index of Economic Freedom',
 '2019 Press Freedom Index',
 '2018 Democracy Index']

In [34]:
# Turn each label into its list of words by cutting at every blank.
goodEnough=[label.split(" ") for label in notSoBad]
# Display the word lists:
goodEnough

[['Freedom', 'in', 'the', 'World', '2019'],
 ['2019', 'Index', 'of', 'Economic', 'Freedom'],
 ['2019', 'Press', 'Freedom', 'Index'],
 ['2018', 'Democracy', 'Index']]

In [35]:
# Remove the blanks so the words of each label run together.
betterThanBad=[label.replace(" ","") for label in notSoBad]
# Display the compact labels:
betterThanBad

['FreedomintheWorld2019',
 '2019IndexofEconomicFreedom',
 '2019PressFreedomIndex',
 '2018DemocracyIndex']

In [36]:
#original:
goodEnough[0]

['Freedom', 'in', 'the', 'World', '2019']

In [37]:
# Move the year from the end of the first label to its front, so that every
# label starts with its year.
# list.pop() removes and returns the last element; insert(0, ...) puts it first.
# The check makes the cell safe to re-run: once the year is in front, the
# last element is no longer numeric and nothing is moved.
if goodEnough[0][-1].isdigit():
    goodEnough[0].insert(0,goodEnough[0].pop())
# Display the reordered label:
goodEnough[0]

['2019', 'Freedom', 'in', 'the', 'World']

In [38]:
goodEnough

[['2019', 'Freedom', 'in', 'the', 'World'],
 ['2019', 'Index', 'of', 'Economic', 'Freedom'],
 ['2019', 'Press', 'Freedom', 'Index'],
 ['2018', 'Democracy', 'Index']]

In [39]:
goodEnough[0]

['2019', 'Freedom', 'in', 'the', 'World']

In [40]:
# Skip the leading year (position 0) and glue the remaining words together.
betterThanBad=["".join(words[1:]) for words in goodEnough]
# Display the result:
betterThanBad

['FreedomintheWorld',
 'IndexofEconomicFreedom',
 'PressFreedomIndex',
 'DemocracyIndex']

In [41]:
DF.tail()

Unnamed: 0,Country,FreedomintheWorld,IndexofEconomicFreedom,PressFreedomIndex,DemocracyIndex
200,West Bank,not free,,difficult situation,
201,Western Sahara,not free,,,
202,Yemen,not free,,very serious situation,authoritarian regime
203,Zambia,partly free,mostly unfree,difficult situation,hybrid regime
204,Zimbabwe,partly free,repressed,difficult situation,authoritarian regime


In [42]:
DF.describe()

Unnamed: 0,Country,FreedomintheWorld,IndexofEconomicFreedom,PressFreedomIndex,DemocracyIndex
count,205,204,180,189,167
unique,205,3,5,5,4
top,Norway,free,mostly unfree,noticeable problems,flawed democracy
freq,1,87,64,73,55


In [44]:
DF.FreedomintheWorld.value_counts()

free           87
partly free    62
not free       55
Name: FreedomintheWorld, dtype: int64

In [47]:
# DF.iloc[:,1::] selects every column except the first one.
# apply(set) calls set() on each column, giving its distinct values.
DF.iloc[:,1::].apply(set)

FreedomintheWorld                        {nan, free, not free, partly free}
IndexofEconomicFreedom    {nan, repressed, free, mostly unfree, mostly f...
PressFreedomIndex         {nan, noticeable problems, difficult situation...
DemocracyIndex            {nan, full democracy, hybrid regime, flawed de...
dtype: object

In [48]:
type(DF.iloc[:,1::].apply(set))

pandas.core.series.Series

In [49]:
#easier to see
DF.iloc[:,1::].apply(set).tolist()

[{'free', nan, 'not free', 'partly free'},
 {'free', 'moderately free', 'mostly free', 'mostly unfree', nan, 'repressed'},
 {'difficult situation',
  'good situation',
  nan,
  'noticeable problems',
  'satisfactory situation',
  'very serious situation'},
 {'authoritarian regime',
  'flawed democracy',
  'full democracy',
  'hybrid regime',
  nan}]

In [50]:
DF.dtypes

Country                   object
FreedomintheWorld         object
IndexofEconomicFreedom    object
PressFreedomIndex         object
DemocracyIndex            object
dtype: object

### Exercise 4

In [51]:
# Wikipedia page for the Democracy Index; reused by pd.read_html below.
Link="https://en.wikipedia.org/wiki/Democracy_Index" 
# Preview the page inside the notebook.
IFrame(Link, width=700, height=300)

In [104]:
# Parse the sortable wikitables of the page (first row as header, bs4 parser).
DFs=pd.read_html(
    Link,
    header=0,
    flavor='bs4',
    attrs={'class': 'wikitable sortable'},
)
# Check what kind of object came back:
type(DFs)

list

In [120]:
len(DFs)

1

In [106]:
DFs[0].head()

Unnamed: 0,Rank,Country,Score,Electoral processand pluralism,Functioning ofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent
0,1,Norway,9.87,10.0,9.64,10.0,10.0,9.71,Full democracy,Europe
1,2,Iceland,9.58,10.0,9.29,8.89,10.0,9.71,Full democracy,Europe
2,3,Sweden,9.39,9.58,9.64,8.33,10.0,9.41,Full democracy,Europe
3,4,New Zealand,9.26,10.0,9.29,8.89,8.13,10.0,Full democracy,Oceania
4,5,Denmark,9.22,10.0,9.29,8.33,9.38,9.12,Full democracy,Europe


In [107]:
DFs[0].tail()

Unnamed: 0,Rank,Country,Score,Electoral processand pluralism,Functioning ofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent
163,164,Central African Republic,1.52,2.25,0.00,1.11,1.88,2.35,Authoritarian,Africa
164,165,Democratic Republic of the Congo,1.49,0.50,0.71,2.22,3.13,0.88,Authoritarian,Africa
165,166,Syria,1.43,0.00,0.00,2.78,4.38,0.00,Authoritarian,Asia
166,167,North Korea,1.08,0.00,2.50,1.67,1.25,0.00,Authoritarian,Asia
167,Rank,Country,Score,Electoral processand pluralism,Functioning ofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent


In [108]:
# Work on a copy of the parsed table. Note that this rebinds DF, which held
# the freedom-indices table in the cells above.
DF=DFs[0].copy()

In [110]:
DF.iloc[:, [3,4]].columns

Index(['Electoral processand pluralism', 'Functioning ofgovernment'], dtype='object')

# Remove all the spaces in the column names

In [111]:
# One or more whitespace characters.
# Note: this overwrites the 'pattern' defined earlier for the freedom indices.
pattern='\\s+'
nothing=''

# regex=True is required: pandas >= 2.0 would otherwise search for the
# literal text of the pattern and leave the labels unchanged.
someNewNames=DF.columns.str.replace(pattern,nothing,regex=True)

In [112]:
# Map every current label to its blank-free version: {old label: new label}.
changes=dict(zip(DF.columns,someNewNames))

# Display the mapping:
changes

{'Rank': 'Rank',
 'Country': 'Country',
 'Score': 'Score',
 'Electoral processand pluralism': 'Electoralprocessandpluralism',
 'Functioning ofgovernment': 'Functioningofgovernment',
 'Politicalparticipation': 'Politicalparticipation',
 'Politicalculture': 'Politicalculture',
 'Civilliberties': 'Civilliberties',
 'Regimetype': 'Regimetype',
 'Continent': 'Continent'}

In [113]:
# Rename the columns of DF itself; with inplace=True nothing is returned.
DF.rename(columns=changes,inplace=True)

In [114]:
DF.columns

Index(['Rank', 'Country', 'Score', 'Electoralprocessandpluralism',
       'Functioningofgovernment', 'Politicalparticipation', 'Politicalculture',
       'Civilliberties', 'Regimetype', 'Continent'],
      dtype='object')

In [115]:
DF.head()

Unnamed: 0,Rank,Country,Score,Electoralprocessandpluralism,Functioningofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent
0,1,Norway,9.87,10.0,9.64,10.0,10.0,9.71,Full democracy,Europe
1,2,Iceland,9.58,10.0,9.29,8.89,10.0,9.71,Full democracy,Europe
2,3,Sweden,9.39,9.58,9.64,8.33,10.0,9.41,Full democracy,Europe
3,4,New Zealand,9.26,10.0,9.29,8.89,8.13,10.0,Full democracy,Oceania
4,5,Denmark,9.22,10.0,9.29,8.33,9.38,9.12,Full democracy,Europe


In [116]:
DF.tail()

Unnamed: 0,Rank,Country,Score,Electoralprocessandpluralism,Functioningofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent
163,164,Central African Republic,1.52,2.25,0.00,1.11,1.88,2.35,Authoritarian,Africa
164,165,Democratic Republic of the Congo,1.49,0.50,0.71,2.22,3.13,0.88,Authoritarian,Africa
165,166,Syria,1.43,0.00,0.00,2.78,4.38,0.00,Authoritarian,Asia
166,167,North Korea,1.08,0.00,2.50,1.67,1.25,0.00,Authoritarian,Asia
167,Rank,Country,Score,Electoral processand pluralism,Functioning ofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent


# Last row to be removed

In [117]:
# Keep every row except the last one, which repeats the column headers
# (see the DF.tail() output above). copy() makes DF_new independent of DF.
DF_new=DF.iloc[:-1, ].copy()

In [119]:
DF_new.tail()

Unnamed: 0,Rank,Country,Score,Electoralprocessandpluralism,Functioningofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent
162,163,Chad,1.61,0.0,0.0,1.67,3.75,2.65,Authoritarian,Africa
163,164,Central African Republic,1.52,2.25,0.0,1.11,1.88,2.35,Authoritarian,Africa
164,165,Democratic Republic of the Congo,1.49,0.5,0.71,2.22,3.13,0.88,Authoritarian,Africa
165,166,Syria,1.43,0.0,0.0,2.78,4.38,0.0,Authoritarian,Asia
166,167,North Korea,1.08,0.0,2.5,1.67,1.25,0.0,Authoritarian,Asia


### Exercise 5

In [77]:
import requests

# Washington State open-data endpoint, filtered to the 2014 records.
url = "https://data.wa.gov/resource/2cup-2fnu.json?year=2014"
# The timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)

# Stop with an HTTPError on any 4xx/5xx answer. Previously a failed request
# left 'medicare' undefined (NameError below) or silently reused the value
# from an earlier run of the cell.
response.raise_for_status()
medicare = response.json()

# One row per JSON record.
medicare2014 = pd.DataFrame(medicare)

In [78]:
medicare2014.head()

Unnamed: 0,county,to_sort_by_county_and_year,to_sort_by_year_and_county,year,state_and_county_fips_code,beneficiaries_with_part_a_and_part_b,ffs_beneficiaries,ma_beneficiaries,ma_participation_rate,average_age,percent_female,percent_male,percent_non_hispanic_white,percent_african_american,percent_hispanic,percent_other_unknown,percent_eligible_for_medicaid,average_hcc_score
0,STATE TOTAL,0.2014,2014,2014,.,1098715,739717,358998,32.7,71.0,53.2,46.9,86.3,2.6,3.4,7.7,19.1,0.9
1,ADAMS,530012014.0,201453001,2014,53001,1557,1333,224,14.4,73.0,51.2,48.8,,,,,15.6,0.86
2,ASOTIN,530032014.0,201453003,2014,53003,5426,4515,911,16.8,71.0,51.8,48.2,,,,,18.5,0.93
3,BENTON,530052014.0,201453005,2014,53005,28303,24054,4249,15.0,71.0,53.8,46.2,90.0,0.8,5.1,4.1,15.5,0.92
4,CHELAN,530072014.0,201453007,2014,53007,11040,8884,2156,19.5,72.0,51.3,48.7,91.7,0.2,5.4,2.7,20.2,0.86


### Exercise 6

In [None]:
# CIA World Factbook field page (this rebinds Link, used above for Wikipedia).
Link="https://www.cia.gov/library/publications/resources/the-world-factbook/fields/274.html" 
# Preview the page inside the notebook.
IFrame(Link, width=700, height=300)

In [None]:
### Exercise 7

In [None]:
# Published Google Sheets tab exported as CSV (output=csv in the query string).
link="https://docs.google.com/spreadsheets/d/e/2PACX-1vTSP3WQhryCkQsoUSVapHTPhqcbt5CVU-2tM8GVGACsq8oqY9mxXMMTffwqItZAOCliycnICRi8OlC4/pub?gid=39157977&single=true&output=csv" 
# read_csv accepts a URL directly.
fromGoogle=pd.read_csv(link)

# First rows of the downloaded table.
fromGoogle.head()

### Another example

In [52]:
# Excel workbook hosted on GitHub; read_excel accepts the URL directly.
dataFile='https://github.com/UWDataScience2020/data/raw/master/wapubs.xlsx'
# No header/skiprows options are given, so the notes at the top of the sheet
# are read in as ordinary rows (see the head() output below).
schoolPub=pd.read_excel(dataFile) 

In [54]:
schoolPub.head(12)

Unnamed: 0,National Center for Education Statistics,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,CCD public school data 2014-2015 school year,,,,,,,,,,...,,,,,,,,,,
1,The file contains (2398) records based on your...,,,,,,,,,,...,,,,,,,,,,
2,NOTES:,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,[ † ] indicates that the data are not applicable.,,,,,,,,,,...,,,,,,,,,,
5,[ – ] indicates that the data are missing.,,,,,,,,,,...,,,,,,,,,,
6,[ ‡ ] indicates that the data do not meet NCES...,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,"SEARCH CRITERIA: State: ""Washington"" - School ...",,,,,,,,,,...,,,,,,,,,,
9,NCES is not responsible for the manner in whic...,,,,,,,,,,...,,,,,,,,,,


In [55]:
# Shortcut that skips the notes while reading the file:
# schoolPub=pd.read_excel(dataFile,skiprows=11)

# Here the same clean-up is done by hand instead.
# The real column names sit in the row at position 10 of the raw sheet;
# pull that row out as a plain Python list.

Headers=schoolPub.iloc[10,:].tolist()
Headers

['NCES School ID',
 'State School ID',
 'NCES District ID',
 'State District ID',
 'Low Grade*',
 'High Grade*',
 'School Name',
 'District',
 'County Name*',
 'Street Address',
 'City',
 'State',
 'ZIP',
 'ZIP 4-digit',
 'Phone',
 'Locale Code*',
 'Locale*',
 'Charter',
 'Magnet*',
 'Title I School*',
 'Title 1 School Wide*',
 'Students*',
 'Teachers*',
 'Student Teacher Ratio*',
 'Free Lunch*',
 'Reduced Lunch*']

In [57]:
# The records start at position 11, right below the header row.
# Data is a slice of schoolPub, not an explicit copy.
Data=schoolPub.iloc[11:,]
Data.head()

Unnamed: 0,National Center for Education Statistics,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
11,530486002475,1656,5304860,31025,06,8,10TH STREET SCHOOL,MARYSVILLE SCHOOL DISTRICT,SNOHOMISH COUNTY,7204 27TH AVE NE,...,Suburb: Midsize,No,–,No,†,167.0,7.3,22.9,20.0,7.0
12,530270001270,1646,5302700,6114,KG,12,49TH STREET ACADEMY,EVERGREEN SCHOOL DISTRICT (CLARK),CLARK COUNTY,14619B NE 49TH STREET,...,City: Midsize,No,–,Yes,Yes,123.0,10.1,12.2,75.0,7.0
13,530910002602,4500,5309100,34033,09,12,A G WEST BLACK HILLS HIGH SCHOOL,TUMWATER SCHOOL DISTRICT,THURSTON COUNTY,7741 LITTLEROCK ROAD SW,...,City: Small,No,–,No,†,867.0,41.19,21.0,189.0,45.0
14,530003000001,2834,5300030,14005,PK,6,A J WEST ELEMENTARY,ABERDEEN SCHOOL DISTRICT,GRAYS HARBOR COUNTY,1801 BAY AVE.,...,Town: Remote,No,–,Yes,Yes,410.0,27.63,14.8,330.0,21.0
15,530825002361,1533,5308250,32081,09,12,A-3 MULTIAGENCY ADOLESCENT PROG,SPOKANE SCHOOL DISTRICT,SPOKANE COUNTY,610 E NORTHFOOTHILLS DRIVE,...,City: Midsize,No,–,No,†,22.0,3.1,7.1,16.0,3.0


In [58]:
# Replace the raw sheet with the data rows and give them the real headers.
# copy() makes schoolPub independent of Data: without it both names point at
# the same slice, so relabeling schoolPub (and editing it in place later)
# would silently change Data as well.
schoolPub=Data.copy()
schoolPub.columns=Headers

In [59]:
schoolPub.head()

Unnamed: 0,NCES School ID,State School ID,NCES District ID,State District ID,Low Grade*,High Grade*,School Name,District,County Name*,Street Address,...,Locale*,Charter,Magnet*,Title I School*,Title 1 School Wide*,Students*,Teachers*,Student Teacher Ratio*,Free Lunch*,Reduced Lunch*
11,530486002475,1656,5304860,31025,06,8,10TH STREET SCHOOL,MARYSVILLE SCHOOL DISTRICT,SNOHOMISH COUNTY,7204 27TH AVE NE,...,Suburb: Midsize,No,–,No,†,167.0,7.3,22.9,20.0,7.0
12,530270001270,1646,5302700,6114,KG,12,49TH STREET ACADEMY,EVERGREEN SCHOOL DISTRICT (CLARK),CLARK COUNTY,14619B NE 49TH STREET,...,City: Midsize,No,–,Yes,Yes,123.0,10.1,12.2,75.0,7.0
13,530910002602,4500,5309100,34033,09,12,A G WEST BLACK HILLS HIGH SCHOOL,TUMWATER SCHOOL DISTRICT,THURSTON COUNTY,7741 LITTLEROCK ROAD SW,...,City: Small,No,–,No,†,867.0,41.19,21.0,189.0,45.0
14,530003000001,2834,5300030,14005,PK,6,A J WEST ELEMENTARY,ABERDEEN SCHOOL DISTRICT,GRAYS HARBOR COUNTY,1801 BAY AVE.,...,Town: Remote,No,–,Yes,Yes,410.0,27.63,14.8,330.0,21.0
15,530825002361,1533,5308250,32081,09,12,A-3 MULTIAGENCY ADOLESCENT PROG,SPOKANE SCHOOL DISTRICT,SPOKANE COUNTY,610 E NORTHFOOTHILLS DRIVE,...,City: Midsize,No,–,No,†,22.0,3.1,7.1,16.0,3.0


In [60]:
# Renumber the rows from 0. drop=True discards the old labels instead of
# keeping them as a new column; inplace=True changes schoolPub itself.
schoolPub.reset_index(drop=True, inplace=True)

In [61]:
schoolPub.head()

Unnamed: 0,NCES School ID,State School ID,NCES District ID,State District ID,Low Grade*,High Grade*,School Name,District,County Name*,Street Address,...,Locale*,Charter,Magnet*,Title I School*,Title 1 School Wide*,Students*,Teachers*,Student Teacher Ratio*,Free Lunch*,Reduced Lunch*
0,530486002475,1656,5304860,31025,06,8,10TH STREET SCHOOL,MARYSVILLE SCHOOL DISTRICT,SNOHOMISH COUNTY,7204 27TH AVE NE,...,Suburb: Midsize,No,–,No,†,167.0,7.3,22.9,20.0,7.0
1,530270001270,1646,5302700,6114,KG,12,49TH STREET ACADEMY,EVERGREEN SCHOOL DISTRICT (CLARK),CLARK COUNTY,14619B NE 49TH STREET,...,City: Midsize,No,–,Yes,Yes,123.0,10.1,12.2,75.0,7.0
2,530910002602,4500,5309100,34033,09,12,A G WEST BLACK HILLS HIGH SCHOOL,TUMWATER SCHOOL DISTRICT,THURSTON COUNTY,7741 LITTLEROCK ROAD SW,...,City: Small,No,–,No,†,867.0,41.19,21.0,189.0,45.0
3,530003000001,2834,5300030,14005,PK,6,A J WEST ELEMENTARY,ABERDEEN SCHOOL DISTRICT,GRAYS HARBOR COUNTY,1801 BAY AVE.,...,Town: Remote,No,–,Yes,Yes,410.0,27.63,14.8,330.0,21.0
4,530825002361,1533,5308250,32081,09,12,A-3 MULTIAGENCY ADOLESCENT PROG,SPOKANE SCHOOL DISTRICT,SPOKANE COUNTY,610 E NORTHFOOTHILLS DRIVE,...,City: Midsize,No,–,No,†,22.0,3.1,7.1,16.0,3.0


In [62]:
schoolPub.tail()

Unnamed: 0,NCES School ID,State School ID,NCES District ID,State District ID,Low Grade*,High Grade*,School Name,District,County Name*,Street Address,...,Locale*,Charter,Magnet*,Title I School*,Title 1 School Wide*,Students*,Teachers*,Student Teacher Ratio*,Free Lunch*,Reduced Lunch*
2393,530813003439,5315,5308130,17406,09,12,YOUTHSOURCE,TUKWILA SCHOOL DISTRICT,KING COUNTY,TUKWILA SCHOOL DISTRICT,...,City: Small,No,–,†,†,0.0,–,†,0.0,0.0
2394,530696002530,4496,5306960,27003,PK,6,ZEIGER ELEMENTARY,PUYALLUP SCHOOL DISTRICT,PIERCE COUNTY,13008 94TH AVE E,...,Suburb: Large,No,–,No,†,823.0,44.53000,18.5000000,221.0,58.0
2395,531017001719,2240,5310170,39205,09,12,ZILLAH HIGH SCHOOL,ZILLAH SCHOOL DISTRICT,YAKIMA COUNTY,1602 SECOND AVENUE,...,Town: Distant,No,–,Yes,Yes,441.0,21.95000,20.1000000,169.0,49.0
2396,531017001896,4221,5310170,39205,04,6,ZILLAH INTERMEDIATE SCHOOL,ZILLAH SCHOOL DISTRICT,YAKIMA COUNTY,303 SECOND AVENUE,...,Town: Distant,No,–,Yes,Yes,319.0,20.38000,15.7000000,145.0,42.0
2397,531017002502,4481,5310170,39205,07,8,ZILLAH MIDDLE SCHOOL,ZILLAH SCHOOL DISTRICT,YAKIMA COUNTY,1301 CUTLER WAY,...,Rural: Fringe,No,–,Yes,Yes,196.0,12.65000,15.5000000,91.0,19.0


In [63]:
schoolPub.columns

Index(['NCES School ID', 'State School ID', 'NCES District ID',
       'State District ID', 'Low Grade*', 'High Grade*', 'School Name',
       'District', 'County Name*', 'Street Address', 'City', 'State', 'ZIP',
       'ZIP 4-digit', 'Phone', 'Locale Code*', 'Locale*', 'Charter', 'Magnet*',
       'Title I School*', 'Title 1 School Wide*', 'Students*', 'Teachers*',
       'Student Teacher Ratio*', 'Free Lunch*', 'Reduced Lunch*'],
      dtype='object')

In [65]:
# Strip the footnote asterisks and every blank from the column labels.

pattern='\\*|\\s+'
nothing=''
schoolPub.columns=[
    re.sub(pattern,nothing,rawLabel)
    for rawLabel in schoolPub.columns
]
# Display the cleaned labels:
schoolPub.columns

Index(['NCESSchoolID', 'StateSchoolID', 'NCESDistrictID', 'StateDistrictID',
       'LowGrade', 'HighGrade', 'SchoolName', 'District', 'CountyName',
       'StreetAddress', 'City', 'State', 'ZIP', 'ZIP4-digit', 'Phone',
       'LocaleCode', 'Locale', 'Charter', 'Magnet', 'TitleISchool',
       'Title1SchoolWide', 'Students', 'Teachers', 'StudentTeacherRatio',
       'FreeLunch', 'ReducedLunch'],
      dtype='object')

In [66]:
schoolPub.iloc[:,[19,20]].apply(set)

TitleISchool        {No, †, Yes}
Title1SchoolWide    {No, †, Yes}
dtype: object

In [67]:
# Try to turn every value of the column into a float.
# The loop stops at the first value float() rejects, and that value is reported.
try:
    for i in schoolPub.StudentTeacherRatio:
        float(i)
# float() raises ValueError for non-numeric text and TypeError for
# non-numeric objects. Catching only these (instead of a bare 'except:')
# lets KeyboardInterrupt and unrelated bugs surface.
except (TypeError, ValueError):
    print("found:",i)

found: †


In [68]:
# Same probe on the Teachers column: report the first value that float()
# cannot convert.
try:
    for i in schoolPub.Teachers:
        float(i)
# Only conversion failures are caught; a bare 'except:' would also swallow
# KeyboardInterrupt and unrelated bugs.
except (TypeError, ValueError):
    print("found:",i)

found: –


In [69]:
# Collect, for each column, the first value that cannot be read as a number.
badSymbols=[]
for column in (schoolPub.StudentTeacherRatio, schoolPub.Teachers):
    try:
        for i in column:
            float(i)
    # Only conversion failures are caught; a bare 'except:' would also
    # swallow KeyboardInterrupt and unrelated bugs.
    except (TypeError, ValueError):
        badSymbols.append(i)

In [70]:
# you got:
badSymbols

['†', '–']

In [71]:
# Placeholder symbols described in the notes at the top of the sheet
# (not applicable, below NCES standards, missing), plus a lone blank just in case.
toNA=['†','‡','–'," "]

import numpy as np  # provides NaN, the missing-value marker pandas uses
# Replace the symbols everywhere in the frame. The result is bound back to
# schoolPub rather than using inplace=True, so any other name still pointing
# at the old object (such as Data) is not changed behind the reader's back.
schoolPub=schoolPub.replace(to_replace=toNA,value=np.nan)

In [72]:
# Check that the placeholder symbols are gone from this column.
# value_counts() leaves NaN out by default; pass dropna=False to count the
# missing values as well.

schoolPub.TitleISchool.value_counts()

Yes    1599
No      714
Name: TitleISchool, dtype: int64

In [73]:
schoolPub.dtypes

NCESSchoolID            object
StateSchoolID           object
NCESDistrictID          object
StateDistrictID         object
LowGrade                object
HighGrade               object
SchoolName              object
District                object
CountyName              object
StreetAddress           object
City                    object
State                   object
ZIP                     object
ZIP4-digit              object
Phone                   object
LocaleCode              object
Locale                  object
Charter                 object
Magnet                 float64
TitleISchool            object
Title1SchoolWide        object
Students                object
Teachers                object
StudentTeacherRatio     object
FreeLunch               object
ReducedLunch            object
dtype: object