Note: If you haven't already done so, be sure to install pandas and matplotlib.

In [None]:
conda install pandas -y
conda install matplotlib -y

# Let's practice reading in datasets

### What's an import?
To use any package in your code, you must first make it accessible by importing it. You can't use anything in Python before it is defined. Some things are built in: for example, the basic types (like `int`, `float`, etc.) can be used whenever you want. But most things you will want to do will need a little more than that. Importing a package (like `pandas`) makes it accessible in the current scope. Since we used Anaconda to obtain Python, many of the most popular packages (like `numpy` and `pandas`) are already installed, and we just have to use `import` to load them for this session.

In [1]:
# Let's import the pandas package.
import pandas as pd

## Practice reading data files into pandas

In [2]:
# Read a CSV file straight from a URL into a DataFrame named `df`.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/python-machine-learning-apps/intro-to-pandas/main/data/flags.csv'
)

In [3]:
# Peek at the top of the table. head() returns the first five rows by default;
# the count is spelled out here.
df.head(n=5)

Unnamed: 0,name,landmass,zone,area,population,language,religion,bars,stripes,colors,...,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
0,Afghanistan,5,1,648,16,10,2,0,3,5,...,0,0,1,0,0,1,0,0,black,green
1,Albania,3,1,29,3,6,6,0,0,3,...,0,0,1,0,0,0,1,0,red,red
2,Algeria,4,1,2388,20,8,2,2,0,3,...,0,0,1,1,0,0,0,0,green,white
3,American-Samoa,6,3,0,0,1,1,0,0,5,...,0,0,0,0,1,1,1,0,blue,red
4,Andorra,3,1,0,0,6,0,3,0,3,...,0,0,0,0,0,0,0,0,blue,red


In [4]:
# Peek at the bottom of the table. tail() returns the last five rows by default;
# the count is spelled out here.
df.tail(n=5)

Unnamed: 0,name,landmass,zone,area,population,language,religion,bars,stripes,colors,...,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
189,Western-Samoa,6,3,3,0,1,1,0,0,3,...,0,1,5,0,0,0,0,0,blue,red
190,Yugoslavia,3,1,256,22,6,6,0,3,4,...,0,0,1,0,0,0,0,0,blue,red
191,Zaire,4,2,905,28,10,5,0,0,4,...,0,0,0,0,0,1,1,0,green,green
192,Zambia,4,2,753,6,10,5,3,0,4,...,0,0,0,0,0,0,1,0,green,brown
193,Zimbabwe,4,2,391,8,10,5,0,7,5,...,0,0,1,0,1,1,1,0,green,green


In [5]:
# Show one randomly chosen row. sample() picks a single row unless told
# otherwise, so each run may display a different one.
df.sample(n=1)

Unnamed: 0,name,landmass,zone,area,population,language,religion,bars,stripes,colors,...,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
117,Morocco,4,4,447,20,8,2,0,0,2,...,0,0,1,0,0,0,0,0,red,red


In [6]:
# How big is my dataset? `shape` is a (number of rows, number of columns) tuple.
df.shape

(194, 30)

In [7]:
# What are the columns? `columns` holds the Index of column labels.
df.columns

Index(['name', 'landmass', 'zone', 'area', 'population', 'language',
       'religion', 'bars', 'stripes', 'colors', 'red', 'green', 'blue', 'gold',
       'white', 'black', 'orange', 'mainhue', 'circles', 'crosses', 'saltires',
       'quarters', 'sunstars', 'crescent', 'triangle', 'icon', 'animate',
       'text', 'topleft', 'botright'],
      dtype='object')

In [8]:
# What type of object is this? type() reports the class of `df`.
type(df)

pandas.core.frame.DataFrame

In [9]:
# Compute summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns and keep them in `results`.
results = df.describe()

# A bare name on the last line of a cell displays its value.
results

Unnamed: 0,landmass,zone,area,population,language,religion,bars,stripes,colors,red,...,circles,crosses,saltires,quarters,sunstars,crescent,triangle,icon,animate,text
count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,...,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0
mean,3.572165,2.21134,700.046392,23.268041,5.340206,2.190722,0.453608,1.551546,3.463918,0.78866,...,0.170103,0.149485,0.092784,0.149485,1.386598,0.056701,0.139175,0.252577,0.201031,0.082474
std,1.553018,1.308274,2170.927932,91.934085,3.496517,2.061167,1.038339,2.328005,1.300154,0.409315,...,0.463075,0.385387,0.290879,0.43586,4.396186,0.231869,0.347025,0.435615,0.401808,0.275798
min,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,1.0,9.0,0.0,2.0,1.0,0.0,0.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,2.0,111.0,4.0,6.0,1.0,0.0,0.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,4.0,471.25,14.0,9.0,4.0,0.0,3.0,4.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.75,0.0,0.0
max,6.0,4.0,22402.0,1008.0,10.0,7.0,5.0,14.0,8.0,1.0,...,4.0,2.0,1.0,4.0,50.0,1.0,1.0,1.0,1.0,1.0


## Now try this with a few other data files from the datasets folder.

In [10]:
# Load a second CSV file, this time keeping only two of its columns.
url = 'https://raw.githubusercontent.com/python-machine-learning-apps/intro-to-pandas/main/data/country-codes.csv'
df1 = pd.read_csv(
    url,
    header=0,               # the first row of the file holds the column names
    usecols=[0, 2],         # keep only the 1st and 3rd columns
    encoding="ISO-8859-1",  # decode the file as ISO-8859-1 (Latin-1) text
)
df1.head()

Unnamed: 0,Country,Alpha-3 code
0,Afghanistan,AFG
1,Albania,ALB
2,Algeria,DZA
3,American Samoa,ASM
4,Andorra,AND


In [11]:
# A TSV file is read with read_csv too; just tell it the fields are
# separated by tabs instead of commas.
url = 'https://raw.githubusercontent.com/python-machine-learning-apps/intro-to-pandas/main/data/eu-govt-bonds.tsv'
df2 = pd.read_csv(
    url,
    sep="\t",
)
df2.head()

Unnamed: 0,"int_rt,geo\time",2019M02,2019M03,2019M04,2019M05,2019M06,2019M07,2019M08,2019M09,2019M10,2019M11,2019M12,2020M01
0,"MCBY,AT",0.45,0.38,0.31,0.24,0.03,-0.1,-0.37,-0.3,-0.2,-0.09,-0.04,-0.09
1,"MCBY,BE",0.69,0.54,0.47,0.41,0.15,0.0,-0.28,-0.24,-0.16,-0.04,0.01,-0.03
2,"MCBY,BG",0.68,0.67,0.5,0.48,0.32,0.43,0.35,0.35,0.25,0.22,0.18,0.15
3,"MCBY,CY",2.0,1.74,1.49,1.34,0.82,0.66,0.44,0.48,0.51,0.58,0.57,0.61
4,"MCBY,CZ",1.76,1.82,1.82,1.86,1.58,1.36,0.99,1.24,1.32,1.47,1.51,1.62


Run this code to install the Excel reader:

In [12]:
! conda install openpyxl -y

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.10.3
  latest version: 22.11.1

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /home/studio-lab-user/.conda/envs/default

  added / updated specs:
    - openpyxl


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    et_xmlfile-1.0.1           |          py_1001          11 KB  conda-forge
    openpyxl-3.0.10            |   py39hb9d737c_2         556 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         567 KB

The following NEW packages will be INSTALLED:

  et_xmlfile         conda-forge/noarch::et_xmlfile-1.0.1-py_1001
  openpyxl           conda-forge/linux-64::openpyxl-3.0.10-py39hb9d737c_2



Downloading and Extracting Packages
openpyx

In [13]:
# Read one sheet of an Excel workbook. Unlike the other examples, this path is
# relative to the notebook, so the data folder must be available locally.
df3 = pd.read_excel(
    '../data/loans_data_dictionary.xlsx',
    sheet_name='rejections',  # which worksheet to load
    header=0,                 # the first row holds the column names
)
# Tip: long text is truncated with "..." when displayed. To see more of each
# description, uncomment the next line before displaying the frame:
# pd.set_option("display.max_colwidth", 100)
df3

Unnamed: 0,RejectStats File,Description
0,Amount Requested,The total amount requested by the borrower
1,Application Date,The date which the borrower applied
2,Loan Title,The loan title provided by the borrower
3,Risk_Score,"For applications prior to November 5, 2013 the..."
4,Debt-To-Income Ratio,A ratio calculated using the borrower’s total ...
5,State,The state provided by the borrower in the loan...
6,Employment Length,Employment length in years. Possible values ar...


In [14]:
# Read a JSON file in which every line is its own JSON record
# (that is what lines=True tells pandas).
url = 'https://raw.githubusercontent.com/python-machine-learning-apps/intro-to-pandas/main/data/yelp.json'
df4 = pd.read_json(
    url,
    lines=True,
)
df4.head()

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw


In [15]:
# read_csv accepts any web address that serves a CSV file,
# such as this open-data portal download link.
url = 'https://opendata.arcgis.com/datasets/2e65fc16edc3481989d2cc17e6f8c533_54.csv'
df5 = pd.read_csv(filepath_or_buffer=url)
df5.head()

Unnamed: 0,X,Y,OBJECTID,NAME,ALT_NAME,LABEL,ADDRESS,XCOORD,YCOORD,LONGITUDE,LATITUDE,MAR_ID,GIS_ID,GLOBALID,CREATOR,CREATED,EDITOR,EDITED
0,-77.051688,38.959925,1,ROCK CREEK PARK NATURE CENTER AND PLANETARIUM,,Rock Creek Park Nature Center and Planetarium,5200 GLOVER ROAD NW,395520.36,143557.89,-77.051686,38.959917,308551,MuseumPt_1,{B54753A4-D08B-48C2-910A-909E869EEC8D},,,,
1,-77.05262,38.943642,2,HILLWOOD MUSEUM,"HILLWOOD ESTATE, MUSEUM AND GARDENS",Hillwood Museum,4155 LINNEAN AVENUE NW,395438.56,141750.35,-77.052617,38.943634,284839,MuseumPt_2,{1DEC08B3-5911-473F-9BE6-92AC141BB6CE},,,,
2,-77.011771,38.941708,3,LINCOLN'S COTTAGE,ARMED FORCES RETIREMENT HOME BUILDING 12,Lincoln's Cottage,LINCOLN''S COTTAGE,398979.76,141534.42,-77.011768,38.9417,292871,MuseumPt_3,{44FD074D-7AC4-4953-A15F-EBA267A9A1A4},,,,
3,-77.086853,38.9391,4,AMERICAN UNIVERSITY MUSEUM AT THE KATZEN ARTS ...,AMERICAN UNIVERSITY MUSEUM,American University Museum at the Katzen Arts ...,3500 NEBRASKA AVENUE NW,392470.36,141248.41,-77.086851,38.939092,223996,MuseumPt_4,{14F57F1A-D67C-4D1F-A4DD-01D7BFF5E4B4},,,,
4,-77.004667,38.937767,5,THE SAINT JOHN PAUL II NATIONAL SHRINE,,The Saint John Paul II National Shrine,3900 HAREWOOD ROAD NE,399595.57,141096.85,-77.004665,38.937759,288031,MuseumPt_5,{6EEBABB9-F216-4D30-A1DF-4555F92F1ED8},,,,


In [16]:
# A data file hosted on GitHub works the same way: point read_csv at the raw URL.
# NOTE: this assigns to `df5` again, replacing the frame loaded in the previous cell.
url = "https://raw.githubusercontent.com/austinlasseter/plotly_dash_tutorial/master/00%20resources/titanic.csv"
df5 = pd.read_csv(
    url,
    index_col=0,  # use the file's first column as the row index
)
df5.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.25,Southampton
1,1,1,female,38.0,71.2833,Cherbourg
2,1,3,female,26.0,7.925,Southampton
3,1,1,female,35.0,53.1,Southampton
4,0,3,male,35.0,8.05,Southampton


In [17]:
# A .tbl file here is plain text with fields separated by the pipe character,
# so read_csv handles it once the separator is given.
url = 'https://raw.githubusercontent.com/python-machine-learning-apps/intro-to-pandas/main/data/user.tbl'
df6 = pd.read_csv(
    url,
    sep='|',
)
df6.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Save your results as a new csv file

In [18]:
# Write the summary table to a CSV file. A bare file name like this one is
# created in the current working directory.
results.to_csv(path_or_buf='my_results.csv')

In [19]:
# Load the file we just wrote. index_col=0 turns the first column
# (the statistic names) back into the row index.
pd.read_csv(
    'my_results.csv',
    index_col=0,
)

Unnamed: 0,landmass,zone,area,population,language,religion,bars,stripes,colors,red,...,circles,crosses,saltires,quarters,sunstars,crescent,triangle,icon,animate,text
count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,...,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0
mean,3.572165,2.21134,700.046392,23.268041,5.340206,2.190722,0.453608,1.551546,3.463918,0.78866,...,0.170103,0.149485,0.092784,0.149485,1.386598,0.056701,0.139175,0.252577,0.201031,0.082474
std,1.553018,1.308274,2170.927932,91.934085,3.496517,2.061167,1.038339,2.328005,1.300154,0.409315,...,0.463075,0.385387,0.290879,0.43586,4.396186,0.231869,0.347025,0.435615,0.401808,0.275798
min,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,1.0,9.0,0.0,2.0,1.0,0.0,0.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,2.0,111.0,4.0,6.0,1.0,0.0,0.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,4.0,471.25,14.0,9.0,4.0,0.0,3.0,4.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.75,0.0,0.0
max,6.0,4.0,22402.0,1008.0,10.0,7.0,5.0,14.0,8.0,1.0,...,4.0,2.0,1.0,4.0,50.0,1.0,1.0,1.0,1.0,1.0
