# Numpy, Pandas, and Matplotlib Example

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (20,10)
plt.rcParams["font.size"] =14

### Practice with numpy

Let's get started with generating some example arrays.

In [3]:
A = np.arange(0,10)
print("A:", A)

B = np.arange(0,100,10)
print("B:", B)

C = np.linspace(0,1,9)
print("C:", C)

np.random.seed(0)
D = np.random.random(10)
print("D:",D)

E = np.random.randint(100, size=(20))
print("E:",E)

F = np.random.random((5,5))
print("F:",F)

A: [0 1 2 3 4 5 6 7 8 9]
B: [ 0 10 20 30 40 50 60 70 80 90]
C: [0.    0.125 0.25  0.375 0.5   0.625 0.75  0.875 1.   ]
D: [0.5488135  0.71518937 0.60276338 0.54488318 0.4236548  0.64589411
 0.43758721 0.891773   0.96366276 0.38344152]
E: [39 87 46 88 81 37 25 77 72  9 20 80 69 79 47 64 82 99 88 49]
F: [[0.46147936 0.78052918 0.11827443 0.63992102 0.14335329]
 [0.94466892 0.52184832 0.41466194 0.26455561 0.77423369]
 [0.45615033 0.56843395 0.0187898  0.6176355  0.61209572]
 [0.616934   0.94374808 0.6818203  0.3595079  0.43703195]
 [0.6976312  0.06022547 0.66676672 0.67063787 0.21038256]]


Let's get some information about the size/shape/dimension of the data we generated.

In [4]:
print("Shape E:", E.shape)
print("Shape F:", F.shape)
print("Size E:", E.size)
print("Size F:", F.size)
print("Ndim E:", E.ndim)
print("Ndim F:", F.ndim)

Shape E: (20,)
Shape F: (5, 5)
Size E: 20
Size F: 25
Ndim E: 1
Ndim F: 2


Let's practice indexing into and slicing the array.

In [9]:
print(A)
print(A[:5]) #up to the fifth index
print(A[-1]) #gets you the first from the end
print(A[:-1]) #from beginning but leave off last one from end
print(A[1:-1]) #leave off first and last
print(A[::2]) #two at a time
print(A[::-1]) #reverse it

[0 1 2 3 4 5 6 7 8 9]
[0 1 2 3 4]
9
[0 1 2 3 4 5 6 7 8]
[1 2 3 4 5 6 7 8]
[0 2 4 6 8]
[9 8 7 6 5 4 3 2 1 0]


Let's now calculate with some data: addition, subtraction, multiplication, division, scalar multiplication, and statistics.

In [14]:
X = np.random.random(10)
Y = np.random.random(10)
print(X)
print(Y)
print('\n')
print(X+Y)
print(X*Y)
print(np.mean(X), np.std(X), np.min(X), np.max(X))

[0.30157482 0.66017354 0.29007761 0.61801543 0.4287687  0.13547406
 0.29828233 0.56996491 0.59087276 0.57432525]
[0.65320082 0.65210327 0.43141844 0.8965466  0.36756187 0.43586493
 0.89192336 0.80619399 0.70388858 0.10022689]


[0.95477564 1.31227681 0.72149604 1.51456202 0.79633057 0.57133899
 1.19020568 1.3761589  1.29476134 0.67455214]
[0.19698892 0.43050132 0.12514483 0.55407963 0.15759903 0.05904839
 0.26604497 0.45950228 0.41590859 0.05756283]
0.44675294022997847 0.17087742394752628 0.13547406422245023 0.660173537492685


### Reading in the data

Our first step is to read in the CSV file that contains the population data we're interested in working with.

In [33]:
# Read in CSV here
pop_df = pd.read_csv("data-USstates-master/data-USstates-master/state-population.csv")
print(pop_df)

     state/region     ages  year   population
0              AL  under18  2012    1117489.0
1              AL    total  2012    4817528.0
2              AL  under18  2010    1130966.0
3              AL    total  2010    4785570.0
4              AL  under18  2011    1125763.0
...           ...      ...   ...          ...
2539          USA    total  2010  309326295.0
2540          USA  under18  2011   73902222.0
2541          USA    total  2011  311582564.0
2542          USA  under18  2012   73708179.0
2543          USA    total  2012  313873685.0

[2544 rows x 4 columns]


We can see information about the data file through both info() and describe().

In [34]:
print(pop_df.info())
print(pop_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2544 entries, 0 to 2543
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state/region  2544 non-null   object 
 1   ages          2544 non-null   object 
 2   year          2544 non-null   int64  
 3   population    2524 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 79.6+ KB
None
              year    population
count  2544.000000  2.524000e+03
mean   2001.500000  6.805558e+06
std       6.923547  2.855014e+07
min    1990.000000  1.013090e+05
25%    1995.750000  7.423805e+05
50%    2001.500000  1.597005e+06
75%    2007.250000  4.547104e+06
max    2013.000000  3.161288e+08


For convenience sake, we want to rename the column state/region to abbreviation.

In [39]:
# Rename column
pop_df.rename(columns={"state/region":"abbreviation"})

Unnamed: 0,abbreviation,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0
...,...,...,...,...
2539,USA,total,2010,309326295.0
2540,USA,under18,2011,73902222.0
2541,USA,total,2011,311582564.0
2542,USA,under18,2012,73708179.0


### Down-selecting the Data

Suppose we want to visualize how the population of Tennessee has changed based on this dataset. First, we want to select only the data associated with Tennessee and sort it based on the year.  We will create a new dataframe with only the Tennessee data. We can then sort by the year.

In [32]:
# Down-select data
tn_df = pop_df[pop_df["abbreviation"] == "TN"]
tn_df.sort_values(by="year", inplace=True)
print(tn_df)

KeyError: 'abbreviation'

### Extracting Numpy Arrays from Dataframes

Suppose we want to plot how the under 18 population and the total population has changed throughout this dataset.  We can extract the year, the under 18 population, and the total population into three separate numpy arrays.

In [31]:
# Extra Numpy Array
years = np.array(tn_df[tn_df["ages"] == "total"]["year"])
under18 = np.array(tn_df[tn_df["ages"] == "under18"]["population"])
total = np.array(tn_df[tn_df["ages"] == "total"]["population"])

print(total)

NameError: name 'tn_df' is not defined

## Plotting a Line Plot

Now we're ready to plot how the populations are changing in a line plot.

In [None]:
# Plot line plot. Don't forget axis labels!
plt.plot(years, total, label="Total")
plt.plot(years, under18, label="Under 18")
plt.xlabel("Years")
plt.ylabel("Population Size")
plt.legend()
plt.show()

### Plotting a Bar Chart

We can visualize this same information via a bar chart.

In [None]:
# Bar chart
plt.bar(years, total, label="Total")
plt.bar(years, under18, label="Under 18")
plt.xlabel("Years")
plt.ylabel("Population Size")
plt.legend()
plt.show()

### Manipulating Data with Numpy

Let's use numpy to calculate the fraction of the total population that is under 18 each year.

In [None]:
# Calculate and plot fraction under 18
frac_under18 [ under18 / total
print(frac_under18)
plt.bar(years, frac_under18)

### Reading multiple CSV files into different data frames

In [37]:
# Read in new CSVs into different data frames
abbrev_df = pd.read_csv("data-USstates-master/data-USstates-master/state-abbrevs.csv")
area_df = pd.read_csv("data-USstates-master/data-USstates-master/state-areas.csv")

### Merging multiple dataframes together

If there are common columns across dataframes, you can merge them together.

In [40]:
# Merge data frames together
cdf = pd.merge(pop_df, abbrev_df)
cdf = pd.merge(cdf, area_df)
print(cdf)

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

### Downselecting based on Multiple Conditions

We can use multiple conditions to downselect in a dataframe. For example, suppose we only want the total population for each state in the year 2013.

In [None]:
# Down-select with multiple conditions
df = cdf[(cdf["ages"] == "total") & (cdf["year"] == 2013)]

### Histograms

Suppose we want to find out what the histogram of state populations looks like in 2013.

In [None]:
# Histogram of state populations in 2013


### Finding Unique Elements

We can use numpy and unique to find the array of unique elements from a numpy array.

In [None]:
# Find unique list of states


### Boxplots

Suppose we want to see how much populations have varied in states in this dataset.  Boxplots can be a convenient plotting tool.  Using the unique list of state abbreviations, we can find all of the populations for each state and create a boxplot.

In [None]:
# Extract populations for each state and create the boxplots


### Scatter Plots

We can visualize the population of the state in 2013 vs. the area of the state using scatter plots.

In [None]:
# Create the scatter plot of population vs. area and annotate the points


### What is a question would you like to answer based on the full combined dataset?