In [1]:
import pandas as pd

In [2]:
d1 = {'url': 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
      'name':'iris',
      'load_func': lambda path: pd.read_csv(path)}
d2 = {'url': 'https://raw.githubusercontent.com/brownsarahm/python-socialsci-files/master/data/SAFI.json',
     'name':'safi',
     'load_func': lambda path: pd.read_json(path)}
d3 = {'url':'https://rhodyprog4ds.github.io/BrownFall20/syllabus/grading.html',
     'name': 'min_acheivements',
     'load_func': lambda path: pd.read_html(path)[0]}

dataset_list = [d1,d2,d3]

In [5]:
# test the dicts
d1['load_func'](d1['url'])

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [8]:
# test spitting
d1['url'].split('/')[-1]

'iris.data'

In [10]:
data_info = []
column_names = ['name','source','num_rows', 'num_columns','source_file_name']

for dataset in dataset_list:
    df = dataset['load_func'](dataset['url'])
    source_file_name = dataset['url'].split('/')[-1]
    num_rows, num_cols = df.shape
    data_info.append([dataset['name'],dataset['url'],num_rows,
                      num_cols,source_file_name])
    
summary_df = pd.DataFrame(data_info,columns=column_names)
summary_df

Unnamed: 0,name,source,num_rows,num_columns,source_file_name
0,iris,https://archive.ics.uci.edu/ml/machine-learnin...,149,5,iris.data
1,safi,https://raw.githubusercontent.com/brownsarahm/...,131,74,SAFI.json
2,min_acheivements,https://rhodyprog4ds.github.io/BrownFall20/syl...,10,4,grading.html


In [11]:
summary_df.to_csv('dataset_summary.csv')

# Exploring Safi

In [12]:
d2

{'url': 'https://raw.githubusercontent.com/brownsarahm/python-socialsci-files/master/data/SAFI.json',
 'name': 'safi',
 'load_func': <function __main__.<lambda>(path)>}

Load it into a data frame

In [13]:
safi_df = d2['load_func'](d2['url'])

Show the last 7 rows

In [14]:
safi_df.tail()

Unnamed: 0,C06_rooms,B19_grand_liv,A08_ward,E01_water_use,B18_sp_parents_liv,B16_years_liv,E_yes_group_count,F_liv,_note2,instanceID,...,observation,_note,A12_agr_assoc,G03_no_food_mitigation,F05_money_source_other,gps:Latitude,E_no_group,F14_items_owned_other,F08_emply_lab,_members_count
126,1,yes,ward2,yes,yes,7,2.0,"[{'F11_no_owned': 2, 'F_curr_liv': 'oxen'}, {'...",,uuid:69caea81-a4e5-4e8d-83cd-9c18d8e8d965,...,Ponto geogrÃ¡fico \n\nLatitude: 0512603\nLonge...,,yes,"[rely_less_food, lab_ex_food]",,-19.112194,[],,yes,3
127,3,no,ward2,yes,no,10,4.0,"[{'F11_no_owned': 5, 'F_curr_liv': 'oxen'}, {'...",,uuid:5ccc2e5a-ea90-48b5-8542-69400d5334df,...,O senhor Tome Florindo tem 2 canais de agua e ...,,no,[na],,-19.112157,[],,no,7
128,1,yes,ward2,yes,no,5,2.0,"[{'F11_no_owned': 5, 'F_curr_liv': 'poultry'}]",,uuid:95c11a30-d44f-40c4-8ea8-ec34fca6bbbf,...,Ponto geogrÃ¡fico\n\nLatitude: 0515372\nLonget...,,no,[lab_ex_food],,-19.112271,[],,no,4
129,2,no,ward2,yes,no,17,2.0,"[{'F11_no_owned': 1, 'F_curr_liv': 'oxen'}, {'...",,uuid:ffc83162-ff24-4a87-8709-eff17abc0b3b,...,Ponto geogrÃ¡fico\n\nLatitude: 0512586\nLongit...,,yes,"[rely_less_food, limit_variety, reduce_meals, ...",,-19.112278,[],,yes,7
130,2,no,ward2,no,yes,20,,"[{'F11_no_owned': 3, 'F_curr_liv': 'oxen'}, {'...",,uuid:aa77a0d7-7142-41c8-b494-483a5b68d8a7,...,Ponto geogrÃ¡fico\n\nLatitude: 0512624\nLonget...,,yes,"[rely_less_food, restrict_adults, borrow_food]",,-19.112183,"[{'E04_res_water_field': 'land_far', 'E03_crop...",,no,8


## Two ways to pick out the non numerical columns

In [20]:
non_num_cols = safi_df.columns[safi_df.dtypes =='object']
non_num_cols

Index(['B19_grand_liv', 'A08_ward', 'E01_water_use', 'B18_sp_parents_liv',
       'F_liv', 'instanceID', 'B20_sp_grand_liv', 'F12_poultry',
       'C02_respondent_wall_type', '_remitters', 'E18_months_no_water',
       'F07_use_income', 'E17_no_enough_water', 'F04_need_money', 'A05_end',
       'C04_window_type', 'E21_other_meth', 'F05_money_source', 'A07_district',
       'C03_respondent_floor_type', 'E_yes_group', 'A01_interview_date',
       'B11_remittance_money', 'A04_start', 'D_plots', 'F_items',
       'F10_liv_owned', 'F13_du_look_aftr_cows', 'E26_affect_conflicts',
       'F14_items_owned', 'F06_crops_contr', 'B17_parents_liv',
       'G02_months_lack_food', 'F09_du_labour', 'E22_res_change',
       'E24_resp_assoc', '_members', 'A06_province', 'E20_exper_other',
       'A09_village', 'C01_respondent_roof_type', 'E23_memb_assoc',
       'E25_fees_water', 'C07_other_buildings', 'observation', 'A12_agr_assoc',
       'G03_no_food_mitigation', 'E_no_group', 'F08_emply_lab'],
    

In [23]:
nn_cols = [col for col in safi_df.columns if safi_df[col].dtypes =='object']
nn_cols

['B19_grand_liv',
 'A08_ward',
 'E01_water_use',
 'B18_sp_parents_liv',
 'F_liv',
 'instanceID',
 'B20_sp_grand_liv',
 'F12_poultry',
 'C02_respondent_wall_type',
 '_remitters',
 'E18_months_no_water',
 'F07_use_income',
 'E17_no_enough_water',
 'F04_need_money',
 'A05_end',
 'C04_window_type',
 'E21_other_meth',
 'F05_money_source',
 'A07_district',
 'C03_respondent_floor_type',
 'E_yes_group',
 'A01_interview_date',
 'B11_remittance_money',
 'A04_start',
 'D_plots',
 'F_items',
 'F10_liv_owned',
 'F13_du_look_aftr_cows',
 'E26_affect_conflicts',
 'F14_items_owned',
 'F06_crops_contr',
 'B17_parents_liv',
 'G02_months_lack_food',
 'F09_du_labour',
 'E22_res_change',
 'E24_resp_assoc',
 '_members',
 'A06_province',
 'E20_exper_other',
 'A09_village',
 'C01_respondent_roof_type',
 'E23_memb_assoc',
 'E25_fees_water',
 'C07_other_buildings',
 'observation',
 'A12_agr_assoc',
 'G03_no_food_mitigation',
 'E_no_group',
 'F08_emply_lab']

In [24]:
list(non_num_cols) == nn_cols

True

Sentences about the format of the data

## Exploring Iris

In [25]:
d1

{'url': 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
 'name': 'iris',
 'load_func': <function __main__.<lambda>(path)>}

In [27]:
iris_df = d1['load_func'](d1['url'])

In [28]:
iris_df.head(3)

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa


In [29]:
iris_df.dtypes

5.1            float64
3.5            float64
1.4            float64
0.2            float64
Iris-setosa     object
dtype: object

All of these look like the datatype I expected

## Exploring Minimum achievments

In [31]:
d3

{'url': 'https://rhodyprog4ds.github.io/BrownFall20/syllabus/grading.html',
 'name': 'min_acheivements',
 'load_func': <function __main__.<lambda>(path)>}

In [32]:
min_acheivements_df = d3['load_func'](d3['url'])

In [33]:
min_acheivements_df.columns

MultiIndex([('Unnamed: 0_level_0',       'letter grade'),
            (           'Level 3', 'Unnamed: 1_level_1'),
            (           'Level 2', 'Unnamed: 2_level_1'),
            (           'Level 1', 'Unnamed: 3_level_1')],
           )

In [45]:
min_acheivements_df[[('Level 3', 'Unnamed: 1_level_1'),
            ('Level 2', 'Unnamed: 2_level_1'),
                     ('Level 1', 'Unnamed: 3_level_1')]]

Unnamed: 0_level_0,Level 3,Level 2,Level 1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,15,15,15
1,10,15,15
2,5,15,15
3,0,15,15
4,0,10,15
5,0,5,15
6,0,0,15
7,0,0,10
8,0,0,5
9,0,0,3


In [48]:
min_acheivements_df[[('Level 3', 'Unnamed: 1_level_1'),
            ('Level 2', 'Unnamed: 2_level_1'),
                     ('Level 1', 'Unnamed: 3_level_1')]].loc[:10:2]

Unnamed: 0_level_0,Level 3,Level 2,Level 1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,15,15,15
2,5,15,15
4,0,10,15
6,0,0,15
8,0,0,5


## Scratch

In [49]:
[type(d) for d in d1.items()]

[tuple, tuple, tuple]

In [50]:
opt1 = [char for char in 'abcde']
opt2 = {char:i for i, char in enumerate('abcde')}
opt3 = ('a','b','c','d','e')
opt4 =  'a b c d e'.split(' ')

In [51]:
options = [opt1, opt2, opt3, opt4]

for i,op in enumerate(options):
    print(i+1,': ',op)

['a', 'b', 'c', 'd', 'e']
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}
('a', 'b', 'c', 'd', 'e')
['a', 'b', 'c', 'd', 'e']


In [54]:
options[[type(op)==dict for op in options]]

TypeError: list indices must be integers or slices, not list