<a href="https://colab.research.google.com/github/austinlasseter/colab_intro/blob/master/colab_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Five ways to read your data into a colab notebook

[Keyboard shortcuts for colab](https://medium.com/@tuewithmorris/google-colab-notebooks-keyboard-shortcuts-aa6a008fb91b)

In [0]:
import pandas as pd
import numpy as np

### Method 1. Read data directly from a URL.

In [0]:
# Simplest approach: hand pandas the URL of a CSV and let it fetch the file.
# Source: https://opendata.dc.gov/datasets/bike-trails/data
bikes_url = 'https://opendata.arcgis.com/datasets/e8c2b7ef54fb43d9a2ed1b0b75d0a14d_4.csv'
bikes = pd.read_csv(bikes_url)
bikes.head()

Unnamed: 0,OBJECTID,LENGTH,NAME,STATUS,MAINTENANC,Shape_Length,MILES,ROUTEID
0,1,827.476,National Mall Trails,Open,NPS,252.214952,0.0,
1,2,19225.404,Capital Crescent Trail,Open,,5859.91494,0.0,
2,3,25404.687,Rock Creek Trail,Open,NPS,7743.36424,0.0,
3,4,6201.712,Metropolitan Branch Trail,Open,DDOT,1890.28563,0.0,
4,5,12625.091,Watts Branch Trail,Open,DDOT,3848.135455,0.0,


In [0]:
# header=None tells pandas not to treat the first line as column names;
# names= supplies the column labels instead.
flag_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
column_headers = """
    name landmass zone area population language religion
    bars stripes colours red green blue gold white black orange mainhue
    circles crosses saltires quarters sunstars crescent triangle
    icon animate text topleft botright
""".split()
flags = pd.read_csv(flag_data_url, names=column_headers, header=None)
print(flags.shape)
flags.head()

In [0]:
# Another example of reading a CSV straight from a URL.
# Source: https://opendata.dc.gov/datasets/museums-in-dc/data
url = 'https://opendata.arcgis.com/datasets/2e65fc16edc3481989d2cc17e6f8c533_54.csv'
museums = pd.read_csv(url)
# Fix the seed so the two sampled rows are the same on every run.
museums.sample(2, random_state=42)

Unnamed: 0,X,Y,OBJECTID,NAME,ALT_NAME,LABEL,MAR_MATCHADDRESS,MAR_XCOORD,MAR_YCOORD,MAR_LONGITUDE,MAR_LATITUDE,MARID
80,-77.030866,38.908176,81,MARY BETHUNE COUNCIL HOUSE,BETHUNE MEMORIAL MUSEUM,Mary Bethune Council House,1318 VERMONT AVENUE NW,397323.1,137812.47,-77.030863,38.908168,225385
0,-77.009038,38.889802,1,EXHIBITION HALL AT THE U.S. CAPITOL VISITOR CE...,U.S. CAPITOL VISITOR CENTER,U.S. Capitol Visitor Center,,399216.07,135772.41,-77.009036,38.889794,294394


### Method 2. Read a file that is already in colab.

In [0]:
# Colab comes with some built-in files; this one is read from the sample_data folder.
cali_path = 'sample_data/california_housing_train.csv'
cali = pd.read_csv(cali_path)
cali.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0


In [0]:
# JSON files are read with pd.read_json rather than pd.read_csv.
# (pd.read_csv would split the JSON text on commas and return a meaningless frame.)
anscombe = pd.read_json('sample_data/anscombe.json')
anscombe.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,[
"{""Series"":""I""","""X"":10.0","""Y"":8.04}",
"{""Series"":""I""","""X"":8.0","""Y"":6.95}",
"{""Series"":""I""","""X"":13.0","""Y"":7.58}",
"{""Series"":""I""","""X"":9.0","""Y"":8.81}",
"{""Series"":""I""","""X"":11.0","""Y"":8.33}",


In [0]:
# Find out where on the server this notebook is running.
from pathlib import Path

Path.cwd()  # current working directory


PosixPath('/content')

In [0]:
# Show the parent of the working directory, then build the path to the
# sample_data folder that sits inside the working directory.
home = Path.cwd()
print(home.parent)
data_dir = home / 'sample_data'
data_dir

/


PosixPath('/content/sample_data')

In [0]:
# Show the names of everything inside data_dir.
import os
sample_contents = os.listdir(data_dir)
sample_contents

['anscombe.json',
 'README.md',
 'california_housing_test.csv',
 'mnist_train_small.csv',
 'california_housing_train.csv',
 'mnist_test.csv']

In [0]:
# Keep the folder's file names in a list so one can be picked by position.
# os.listdir returns names in arbitrary order, so sort them to make files[2]
# refer to the same file on every run. (os.listdir already returns a list,
# so no list() wrapper is needed.)
files = sorted(os.listdir(data_dir))
files[2]

'california_housing_test.csv'

In [0]:
# Join the folder and the chosen file name into a full path, then load it with pandas.
file_path = data_dir / files[2]
print(file_path)
df = pd.read_csv(file_path)
df.head()

/content/sample_data/california_housing_test.csv


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


### Method 3. Upload a file to colab.
Reminder: uploaded files are deleted when the runtime is recycled.

In [0]:
# Upload a file through Colab's upload helper.
# You can also use the manual upload GUI over on the left of your screen.
# The module is imported under an alias so it does not overwrite the `files`
# list of file names created earlier; re-running the cell that indexes that
# list would otherwise fail.
from google.colab import files as colab_files
uploaded = colab_files.upload()

Saving abalone.csv to abalone.csv


In [0]:
# Read the uploaded file by name and preview the first rows.
abalone_path = 'abalone.csv'
df = pd.read_csv(abalone_path)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### Method 4. Read a file that's saved on GitHub
Example repository: https://github.com/austinlasseter/dash-virginia-counties

In [0]:
# Raw files hosted on GitHub can be read through their raw.githubusercontent.com URL.
# Source: https://github.com/austinlasseter/dash-virginia-counties/blob/master/resources/acs2017_county_data.csv
url = 'https://raw.githubusercontent.com/austinlasseter/dash-virginia-counties/master/resources/acs2017_county_data.csv'
va = pd.read_csv(url)
print(va.shape)
va.describe()

### Method 5. Read a zip file.

In [0]:
# use the 'bang' for bash shell scripting.
! pwd
! ls

/content
abalone.csv  acs2015_county_data.csv  LoanStats_2018Q4.csv.zip	sample_data


In [0]:
# the 'wget' command reads content from the web.
!wget https://resources.lendingclub.com/LoanStats_2018Q4.csv.zip

--2019-10-31 17:19:57--  https://resources.lendingclub.com/LoanStats_2018Q4.csv.zip
Resolving resources.lendingclub.com (resources.lendingclub.com)... 64.48.1.20
Connecting to resources.lendingclub.com (resources.lendingclub.com)|64.48.1.20|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘LoanStats_2018Q4.csv.zip’

LoanStats_2018Q4.cs     [        <=>         ]  21.62M   425KB/s    in 53s     

2019-10-31 17:20:50 (419 KB/s) - ‘LoanStats_2018Q4.csv.zip’ saved [22667400]



In [0]:
# how do you unzip a .zip file in bash?
!unzip LoanStats_2018Q4.csv.zip
!ls

Archive:  LoanStats_2018Q4.csv.zip
  inflating: LoanStats_2018Q4.csv    
abalone.csv		 LoanStats_2018Q4.csv	   sample_data
acs2015_county_data.csv  LoanStats_2018Q4.csv.zip


In [78]:
# Load the extracted CSV into pandas.
# skiprows=1 drops the first line of the file and skipfooter=2 drops the last two;
# skipfooter is only supported by the python parsing engine.
loans = pd.read_csv(
    'LoanStats_2018Q4.csv',
    engine='python',
    skiprows=1,
    skipfooter=2,
)
loans.shape

(128412, 144)

In [0]:
# Footnote: DataFrames can also be saved to formats other than csv, e.g. pickle.
# Write `loans` to a pickle file and read it straight back.
# NOTE: only unpickle files you trust; loading a pickle can execute arbitrary code.
pickle_path = 'loans2.pkl'
loans.to_pickle(pickle_path)
new = pd.read_pickle(pickle_path)
new.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,Unnamed: 79,Unnamed: 80,Unnamed: 81,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87,Unnamed: 88,Unnamed: 89,Unnamed: 90,Unnamed: 91,Unnamed: 92,Unnamed: 93,Unnamed: 94,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110,Unnamed: 111,Unnamed: 112,Unnamed: 113,Unnamed: 114,Unnamed: 115,Unnamed: 116,Unnamed: 117,Unnamed: 118,Unnamed: 119,Unnamed: 120,Unnamed: 121,Unnamed: 122,Unnamed: 123,Unnamed: 124,Unnamed: 125,Unnamed: 126,Unnamed: 127,Unnamed: 128,Unnamed: 129,Unnamed: 130,Unnamed: 131,Unnamed: 132,Unnamed: 133,Unnamed: 134,Unnamed: 135,Unnamed: 136,Unnamed: 137,Unnamed: 138,Unnamed: 139,Unnamed: 140,Unnamed: 141,Unnamed: 142,Notes offered by Prospectus (https://www.lendingclub.com/info/prospectus.action)
id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardshi
p_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
,,20000,20000,20000,36 months,14.47%,688.13,C,C2,bus driver,4 years,OWN,52000,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,681xx,NE,30.65,1,Jun-1979,2,15,,6,0,15048,73%,22,w,15777.74,15777.74,6104.74,6104.74,4222.26,1882.48,0.0,0.0,0.0,Sep-2019,688.13,Oct-2019,Sep-2019,0,,1,Individual,,,,0,0,33157,0,2,1,1,8,18109,44,1,2,8628,73,20700,1,1,4,3,5526,5175,73,0,0,141,474,10,8,0,10,15,1,15,0,4,4,11,12,8,4,14,4,6,,0,0,2,95,50,0,0,61699,33157,20700,40999,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
,,25000,25000,25000,60 months,16.14%,609.82,C,C4,Production Manager,5 years,MORTGAGE,45000,Not Verified,Dec-2018,Fully Paid,n,,,debt_consolidation,Debt consolidation,703xx,LA,37.09,0,Sep-2003,0,,,7,0,8901,36.8%,21,w,0.00,0.00,26653.1675796436,26653.17,25000.00,1653.17,0.0,0.0,0.0,Apr-2019,24857.33,,May-2019,0,,1,Individual,,,,0,1303,49524,0,2,1,2,7,40623,82,0,0,7830,67,24200,2,0,1,2,7075,10465,43.7,0,0,161,162,45,7,4,96,,7,,0,2,3,3,5,7,5,10,3,7,0,0,0,1,100,33.3,0,0,73683,49524,18600,49483,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
,,26500,26500,26500,60 months,11.31%,580.28,B,B3,Compliance Director,10+ years,MORTGAGE,134000,Source Verified,Dec-2018,Current,n,,,credit_card,Credit card refinancing,747xx,OK,18.91,0,Dec-2006,2,,,17,0,43640,65.4%,37,w,23410.70,23410.70,5197.54,5197.54,3089.30,2108.24,0.0,0.0,0.0,Sep-2019,580.28,Oct-2019,Sep-2019,0,,1,Individual,,,,0,0,381100,3,3,2,2,4,64335,46,1,3,10115,58,66700,2,1,7,6,22418,14577,70.4,0,0,114,144,3,3,4,22,,2,,0,8,10,9,10,7,13,26,10,17,0,0,0,4,100,66.7,0,0,430403,107975,49300,88875,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
,,10400,10400,10400,36 months,12.98%,350.32,B,B5,Program Support Assistant,10+ years,MORTGAGE,56099,Source Verified,Dec-2018,Current,n,,,credit_card,Credit card refinancing,800xx,CO,20.92,0,Jul-2013,2,32,67,8,1,1669,9.8%,10,w,8164.57,8164.57,3141.63,3141.63,2235.43,906.20,0.0,0.0,0.0,Sep-2019,350.32,Oct-2019,Sep-2019,0,,1,Individual,,,,0,0,39564,4,2,1,2,1,37895,92,3,6,725,36,17000,1,0,4,8,4946,15331,9.8,0,0,65,57,1,1,0,1,,0,32,0,3,3,6,6,3,6,7,3,8,0,0,0,4,90,0,1,0,60709,39564,17000,43709,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,


In [0]:
# Saving to csv works too; index=False keeps the row index out of the file,
# so it does not come back as an extra column when the file is read again.
csv_path = 'cali2.csv'
cali.to_csv(csv_path, index=False)
new = pd.read_csv(csv_path)
new.shape

(17000, 9)