# Demo: Pandas

## Two important datatypes in Pandas 
* Series (like a vector or array)
* DataFrame (like a 2-D array or Excel spreadsheet)

In [None]:
import pandas as pd

# State populations keyed by state name. The underscores are digit
# separators and group each figure by thousands.
population_dict = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}
# Create a Series from a Python dict: the dict keys become the index
# labels and the dict values become the data.
population = pd.Series(population_dict)
population

In [None]:
# Per-state area figures, keyed by the same state names used for the
# population data.
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
}
# Wrap the dict in a Series (keys -> index labels, values -> data).
area = pd.Series(data=area_dict)
area

In [None]:
# Build a DataFrame out of the two Series (population and area): each
# Series becomes one column, and the rows are labelled by state name.
states = pd.DataFrame(dict(population=population, area=area))
states
# Ending the cell with the bare name 'states' renders the DataFrame
# through the notebook's display() function, which looks nicer than
# print(states). The explicit equivalent would be:
# from IPython.display import display
#
# display(states)

In [None]:
# DataFrames have an index (the row labels) that we can inspect (or change).
# For this DataFrame the labels are the state names.
states.index

In [None]:
# View the column names ('population' and 'area' for this DataFrame)
states.columns

In [None]:
# View a specific column by name (a single column comes back as a Series)
states['area']  # or, with attribute-style access, states.area

In [None]:
# Generate a Boolean Series from an element-wise comparison,
# e.g., Which states have an area > 150,000?
# NOTE(review): this threshold has been described as sq. miles, but the
# area figures look like square kilometres; confirm the unit.
# The companion question (Which states have a population > 20,000,000?)
# is handled by large_pop in the next cell.
large_area = states['area'] > 150_000
large_area

In [None]:
# Boolean Series: True for states whose population exceeds 20,000,000
large_pop = states['population'] > 20_000_000
# We can use the & operator (bitwise AND) to combine conditions
# element-wise; indexing the DataFrame with the combined Boolean Series
# keeps only the rows where both conditions are True.
states[large_area & large_pop]

In [None]:
# We can see that the values in a DataFrame can be viewed as a
# matrix or 2-D NumPy array (one row per state, one column per field).
# to_numpy() is the accessor the pandas documentation recommends in
# place of the older .values attribute; it returns the same array here.
states.to_numpy()

## Reading CSV files into __`pandas`__

In [None]:
# Read data from a CSV file into a DataFrame. The path is relative to the
# current working directory. No header argument is given, so the
# file's first line supplies the column names.
data = pd.read_csv('data/skincancer.csv')

In [None]:
# Show the first n rows (here n = 10; head() defaults to 5)
data.head(10)

In [None]:
# Show the "shape" of the data, i.e., a (rows, columns) tuple
data.shape

## Don't treat first line as header

In [None]:
# Re-read the same file with header=None: the first line is kept as an
# ordinary data row and the columns get default integer labels.
data = pd.read_csv('data/skincancer.csv', header=None)

In [None]:
# Row 0 now holds whatever was on the file's first line
data.head()

In [None]:
# With the first line counted as data, this should report one more row
# than the default read did.
data.shape

## Specify our own headers/column names

In [None]:
# Column labels are assignable as well as readable, so we can replace
# the current labels with descriptive names (one per column, in order).
# NOTE(review): the file was read with header=None, so if its first line
# was itself a header, that line is still present as data row 0. Confirm
# this is intended for the demo.
data.columns = ['State', 'Latitude', 'Mortality', 'Ocean', 'Longitude']

In [None]:
# Check that the new column names show up in the output
data.head()