# Table of Contents
 <p><div class="lev1 toc-item"><a href="#The-SciPy-Ecosystem" data-toc-modified-id="The-SciPy-Ecosystem-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>The SciPy Ecosystem</a></div><div class="lev1 toc-item"><a href="#Pandas" data-toc-modified-id="Pandas-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pandas</a></div><div class="lev1 toc-item"><a href="#NumPy" data-toc-modified-id="NumPy-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>NumPy</a></div><div class="lev1 toc-item"><a href="#SciPy" data-toc-modified-id="SciPy-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>SciPy</a></div><div class="lev1 toc-item"><a href="#Matplotlib" data-toc-modified-id="Matplotlib-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Matplotlib</a></div>

# The SciPy Ecosystem

 The "SciPy ecosystem" is a collection of Python packages designed to streamline many of the common tasks involved in data science. We'll begin this workshop by taking a look at four of those packages, focusing on the features that I find most important and occasionally highlighting similarities and differences with how things work in R.

- https://www.scipy.org/

# Pandas
Pandas is the top package for manipulating tabular-form data. The most important aspect of Pandas is that it defines a DataFrame class that comes with many convenient methods that R users are used to.

In [None]:
import pandas as pd

data_url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/anscombe.csv"
z = pd.read_csv(data_url) # read the CSV at data_url into a DataFrame, analogous to "read.csv(data_url)"



## BASICS

z.head() # first rows of the table, analogous to "head(z)"
z.head(20) # first 20 rows, analogous to "head(z, 20)"
z.shape # (number of rows, number of columns), analogous to "dim(z)"
z.columns # column labels, analogous to "names(z)"
z.describe() # per-column summary statistics, analogous to "summary(z)"



### SUBSETTING


## picking out columns by name

z["x"] # a single name gives one column
z[["x", "y"]] # a list of names gives a DataFrame holding just those columns


## picking out the rows that meet a condition

x_above_10 = z["x"] > 10 # one True/False value per row: is that row's x-val > 10?
in_dataset_II = z["dataset"] == "II" # one True/False value per row: is that row in dataset "II"?
x_above_10
z[x_above_10] # keeps only the rows where the mask is True
z[x_above_10 & in_dataset_II] # rows with x-val > 10 AND dataset equal to "II"
z[x_above_10 | in_dataset_II] # rows with x-val > 10 OR dataset equal to "II"



## subsetting by INDEX (integer position): iloc

# - one argument picks row(s); two arguments pick row(s) first, then column(s)

first_row = z.iloc[0]
first_row
list(first_row)
z.iloc[0:4, :] # rows 0 through 3, every column
z.iloc[1:4, 1:3] # rows 1 through 3, columns 1 through 2


## subsetting by NAME: loc

# - again, single argument retrieves row, two arguments specify row(s) then column(s)

z.loc[0] # the first row has 0 as its rowname
# But be careful: index names are preserved when new objects are created by subsetting
zz = z.iloc[3:6, :]
zz
zz.iloc[0] # zz has a row at position 0
# ... but none of its rows is *named* 0, so asking for it by name raises a KeyError.
# The error is the point of the demonstration; catch it so the rest of the cell still runs.
try:
    zz.loc[0]
except KeyError as err:
    print("KeyError:", err)
zz.loc[3] # its name is (still) 3
z.loc[3:6] # also notice that BOTH endpoints are included when using names - very tricky!

z.loc[:4, "x"] # rows named 0 through 4 (inclusive), column "x"
z.loc[:, ["x", "y"]] # same as z[["x", "y"]]



# REMINDER about views vs. copies, and copy()
# zz was created by slicing z, so it may share its underlying data with z.
# Writing through such a slice can be confusing:
original_x = z.loc[3, "x"] # remember the true value so we can restore it afterwards
zz.loc[3, "x"] = -1 # pandas may emit a SettingWithCopyWarning here
zz
z.head() # in older versions of pandas this changed the original DataFrame as well!
# (whether z is affected depends on the pandas version: with Copy-on-Write enabled,
#  which is the default from pandas 3.0, the write only touches zz)
z.loc[3, "x"] = original_x # put the original value back in case it was changed

zz = z.iloc[3:6, :].copy() # to prevent this issue, explicitly make a copy
zz.loc[3, "x"] = -1 # change the copy
z.head() # check the original
# we've verified that this time, we didn't change the original, only the copy

In [None]:
## EXERCISE: Find the sample standard deviation of the y-values in dataset III.
##           (Remember the "describe" function.)

In [None]:
## CALCULATIONS

# Pandas defines a Series object type; each column of a DataFrame is a Series.
# A DataFrame behaves much like a dictionary
# - each key is a variable name and its value is the corresponding column.
type(z[["x", "y"]])
type(z["x"])

# The head, describe, mean, and additional methods listed here work on Series objects
# or can be easily "applied" to the columns or rows of a DataFrame
z["x"].mean()
z["x"].sum()
z["x"].std()
z["x"].min()
z["x"].abs()
z["x"].median()

# apply to every column or to every row
# numeric_only=True skips the text column "dataset"; without it, pandas 2.0 and later
# raise a TypeError rather than silently dropping the non-numeric column
z.mean(numeric_only=True) # analogous to "apply(z[, c("x", "y")], 2, mean)"
z.mean(axis=1, numeric_only=True) # "apply(z[, c("x", "y")], 1, mean)"
# can also use an "apply" method to specify your own function
z[["x", "y"]].apply(sum)
z[["x", "y"]].apply(sum, axis=1)

# easily perform an arithmetic operation to every entry of a Series
z["x"]**2 # "z$x^2" in R
3*z["x"] # "3*z$x" in R
# you can't use this syntax with ordinary Python lists!
# you'd use "list comprehension" instead: [3*i for i in z["x"]]

# multiply two vectors, entry by entry
z["x"]*z["y"]
# dot product of x and y:
sum(z["x"]*z["y"])

# splitting the table up by a categorical variable
# Pandas' "groupby" plays the role of "aggregate" or "split" in R.
z.groupby(z["dataset"]).describe() # summary statistics, one set per group

# correlation matrix computed separately within each group
z.groupby(z["dataset"]).corr()

# Iterating over a groupby yields (group name, rows of that group) pairs
for label, rows in z.groupby("dataset"):
    print(label)
    print(rows)

# Selecting a column first yields (group name, that column's values in the group) pairs
for label, x_values in z.groupby("dataset")["x"]:
    print(label, x_values, sep="\n")

In [None]:
## EXERCISE: Find the dot product of x and y within each of the four datasets.

In [None]:
## PLOTTING

is_dataset_I = z["dataset"] == "I"
z1 = z[is_dataset_I] # only the rows belonging to dataset "I"

z1["x"].plot(kind="hist") # histogram of the x-values
# In Spyder the plot may open in a separate window (look behind Spyder if you can't see it).
# To save a plot shown in the Spyder console, right-click it
# and choose "Save Image As..."
# A plot shown in a pop-up window has its own save button.

z1.plot(kind="scatter", x="x", y="y") # scatterplot of y against x

z.groupby(z["dataset"]).plot(kind="scatter", x="x", y="y") # one scatterplot per dataset

# Pandas' built-in plotting is fairly basic.
# The Matplotlib section below covers more sophisticated plots.

# NumPy
Pandas is built on top of NumPy and uses it internally, but to call NumPy functions directly you still import NumPy yourself.

In [None]:
import numpy as np

## LINEAR REGRESSION

# least-squares fit of a degree-1 polynomial (i.e. a straight line) to dataset "I"
fit = np.polyfit(z1["x"], z1["y"], deg=1)
fit # Careful: coefficients come back as slope, then intercept - the reverse of R's order

# redraw the scatterplot and overlay the least-squares line on the same axes
ax = z1.plot(kind="scatter", x="x", y="y")
ax.plot(z1["x"], fit[0]*z1["x"] + fit[1], color="Red")

In [None]:
## ARRAYS

a = np.array([1, 3, 2]) # a one-dimensional array (a vector) built from a list
a
np.array((1, 3, 2)) # a tuple works just as well - it creates the same array
a[:2] # slicing uses the same syntax as Python lists

b = np.array([a, (4, 5, 6)]) # a two-dimensional array (a matrix); each inner sequence is a row
b
print(b)

b[:, :2] # every row, first two columns
b[0, 1] = 8 # overwrite the entry in row 0, column 1 with 8
print(b)

b[0] # a single index selects whole row(s)
b[-1] # negative indices count from the end, so this is the last row

# arithmetic is carried out entry by entry
b + 1
2 * b
b ** 2
2 * b - b # corresponding entries are subtracted
# (a Pandas Series is not itself a NumPy array, but it supports the same entry-by-entry arithmetic)

b.sum(axis=0) # one total per column
b.sum(axis=1) # one total per row
b.sum() # total of every number in the matrix

# more sophisticated vector and matrix operations
a.mean() # "np.mean(a)" is equivalent
a.var() # note: NumPy divides by n by default (ddof=0), whereas R's var divides by n-1
a.argmax() # position of the largest entry
a.sort() # in-place! this reorders a itself instead of returning a sorted copy
a

np.dot(b[0], b[1]) # dot product of the two rows

b.transpose() # "np.transpose(b)" is equivalent
np.cov(b) # covariance matrix
b @ b.transpose() # matrix multiplication

np.linalg.svd(b) # singular value decomposition



## MATHEMATICAL FUNCTIONS AND CONSTANTS

np.cos(np.pi)
np.exp(np.log(np.sqrt(25)))


# https://www.numpy.org/devdocs/user/quickstart.html


## EXERCISE: Find the variance of each row of b.

# SciPy 

Note that the SciPy package is one of the many packages in the SciPy ecosystem. The terminology can be confusing!

In [None]:
## LINEAR ALGEBRA

from scipy.linalg import inv, svd

svd(b) # singular value decomposition; matches the result of the NumPy function

gram = b @ b.transpose() # square matrix formed from b and its transpose
inv(gram) # inverse matrix

# https://docs.scipy.org/doc/scipy/reference/linalg.html



## PROBABILITY DISTRIBUTIONS

from scipy.stats import norm # functions for Normal distributions
from scipy.stats import f # functions for F distributions

# cdf and inverse cdf
2*norm.cdf(-2) # twice the probability of a standard Normal draw being less than -2
norm.ppf(.975) # a standard Normal draw has probability .975 of being less than this value

1-f.cdf(2, 1, 12) # probability that a draw from f_{1, 12} is greater than 2


# simulating random samples
# Passing random_state fixes the seed, so re-running the cell reproduces the same draws.
# Drop the argument (or change SEED) to get fresh samples.

SEED = 0
normal_draws = norm.rvs(size=10, random_state=SEED) # 10 draws from a standard Normal
normal_draws
f_draws = f.rvs(1, 12, size=10, random_state=SEED) # 10 draws from f_{1, 12}
f_draws

# For more functions and distributions, see
# https://docs.scipy.org/doc/scipy/reference/stats.html

# https://www.tutorialspoint.com/scipy/scipy_introduction.htm

# Matplotlib

In [None]:
import matplotlib.pyplot as plt

zs = z.groupby(z["dataset"])
list(zs) # let's see what we've created: a list of (group name, group DataFrame) pairs

# Create a figure that we will add plots to
fig = plt.figure()
# enumerate(..., start=1) supplies the 1-based subplot position,
# so no counter has to be maintained by hand
for i, (key, zi) in enumerate(zs, start=1):
    fit = np.polyfit(zi["x"], zi["y"], 1) # slope, then intercept, of this group's least-squares line
    p = fig.add_subplot(2, 2, i) # put this plot into position i within a 2x2 grid
    p.scatter(x=zi["x"], y=zi["y"])
    p.plot(zi["x"], fit[0]*zi["x"] + fit[1], color="Red")

fig.suptitle("Anscombe's Quartet")