# Introduction to Data Analysis with Python


<img src="https://www.python.org/static/img/python-logo.png" alt="yogen" style="width: 200px; float: right;"/>
<br>
<br>
<br>
<img src="../assets/yogen-logo.png" alt="yogen" style="width: 200px; float: right;"/>

# Objectives

* Handle linear algebra using `numpy`

* Handle tabular data with `pandas`

# The Python scientific stack: SciPy

Python Main Data Libraries

NumPy: Base N-dimensional array package

SciPy library: Fundamental library for scientific computing

Matplotlib: Comprehensive 2D Plotting

IPython: Enhanced Interactive Console

Sympy: Symbolic mathematics

pandas: Data structures & analysis

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

## `matplotlib`

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

#matplotlib.style.use('ggplot')
%matplotlib inline

In [None]:
import matplotlib as plt2

In [None]:
plt.plot([1,2,3], [4,5,7]);

In [None]:
X = np.linspace(-np.pi, np.pi, 25)
C = np.cos(X)
S = np.sin(X)
plt.plot(X,S,"r.-");
plt.plot(X,C,"b.-");
plt.grid(True)
#plt.show()

In [None]:
plt.scatter(X, C, c='r');
plt.scatter(X, S,c='b');

## `numpy`

Base N-dimensional array package. We can use it to perform array operations. Its arrays are typed and can have any shape and dimension we want.

In [None]:
np.version.version

In [None]:
a_list = [1,2.0,3.1,4.2,-1,8.92]

In [None]:
type(a_list)

In [None]:
a_list = [1, 2.0,3.1,4.2,-1,8.92]
a = np.array(a_list)
a.dtype

In [None]:
a.shape

In [None]:
a.ndim

In [None]:
ej = np.array( [ [1,2+1.j], [3,4]])

In [None]:
ej

In [None]:
a2 = np.array([[1,2,3],[4,5,6],[7,8,9],[10,11,12]])
print(a2)

print(a.shape, a2.shape)
print(a.size,a2.size)

There are a number of convenience functions to create particular special arrays.

In [None]:
np.zeros((4,2))

In [None]:
np.ones((4,3))

In [None]:
np.ones_like(a2)

In [None]:
np.eye(4)

In [None]:
kk = np.empty((4,2))

In [None]:
kk

In [None]:
print(np.zeros(10))
print(np.ones((2,3)))
print(np.ones_like(a2))
print(np.eye(4))
print(np.empty((2,3,2)))

In [None]:
np.arange(1,20,2)

In [None]:
np.linspace(1,20,10)

In [None]:
# arange(start, stop, step) excludes `stop`; linspace(start, stop, num)
# includes it and takes the number of samples instead of a step size.
print(np.arange(10))
print(np.arange(10, 30, 5))
# np.pi is NumPy's own constant. The `np.math` alias of the stdlib math
# module is deprecated and no longer available in recent NumPy releases.
print(np.arange(0, 2 * np.pi, 0.3))
print(np.linspace(0, 2 * np.pi, 15))

A central advantage of numpy over regular python lists, apart from the many convenience functions, is speed.

In [None]:
narray = np.arange(100000)
plist = range(100000)

In [None]:
import math

In [None]:
%%timeit
[math.sqrt(n) for n in plist]

In [None]:
%%timeit
np.sqrt(narray)

In [None]:
np.arange(10000)
#.reshape(100,100)

In [None]:
np.arange(10000).reshape(100,100)

And arrays of random variables.

In [None]:
help(np.random.randn)

In [None]:
np.random.randn(9).reshape(3,3)

In [None]:
np.random.randn(3,3)

In [None]:
np.random.rand(10,2)

#### Exercise

Create a sample of points that follow the equation $Y = AX + B$, where A = 2.5 and B = 20.

Now, plot it as either a cloud of points or a line.

Defining starting variables:

In [None]:
n_elements = 100
A = 2.5  # slope from the exercise statement (Y = AX + B)
B = 20  # intercept from the exercise statement
origin_list = [0,10,-10]  # values iterated over by the plotting cells that follow
X1 = np.random.rand(n_elements)  # unsorted uniform samples in [0, 1)
X2 = np.linspace(0,1,n_elements)  # evenly spaced grid on [0, 1]
#plt.show()

Looping through my process:

In [None]:
for ori in origin_list:
    plt.plot(X1,2.5*X1+B+ori)
    #plt.plot(X2,A*X2+B,'r')

In [None]:
for slope in origin_list:
    plt.plot(X1,slope*X1+B)

In [None]:
X = np.random.rand(20)
Y = 2.5 * X + 20

plt.plot(X, Y)
plt.scatter(X, Y, c='blue')
plt.show()

#### Exercise

Represent the logistic, or sigmoid, function between -20 and 20. Remember its formula is:

$$\displaystyle S(x)={\frac {1}{1+e^{-x}}}={\frac {e^{x}}{e^{x}+1}}$$

_Hint_: you will need an X and a Y to plot against it. 

_Hint_: check out the function np.exp

In [None]:
def sigmoid(X):
    """Evaluate the logistic function 1 / (1 + e^-x) element-wise.

    Parameters
    ----------
    X : scalar, list, tuple or array
        Input value(s). Lists and tuples are converted to an ndarray first,
        because unary minus is not defined for them.

    Returns
    -------
    Same kind of object NumPy returns for ``np.exp(-X)``: a NumPy scalar for
    scalar input, an array for array input. Values lie between 0 and 1.
    """
    if isinstance(X, (list, tuple)):
        X = np.asarray(X)
    return 1.0 / (1.0 + np.exp(-X))

In [None]:
N = 100
L = 20
x = np.linspace(-L,L,N)
plt.plot(x,sigmoid(x),'b.')

### Indexing

![Python slicing](https://infohost.nmt.edu/tcc/help/pubs/python/web/fig/slicing.png)

In [None]:
print(a)
print(a2)

In [None]:
print(a[2],a2[0])

In [None]:
a2[1][0] 

In [None]:
a2[1,0]

### Slicing

In [None]:
a

In [None]:
a[:4]

In [None]:
a2[:]

In [None]:
a2[:2,:2]

In [None]:
1>3

In [None]:
a2 > 3

In [None]:
cond = a2>3

In [None]:
a2[cond]

In [None]:
a2[cond & (a2 % 2 == 0)]

In [None]:
evens = a2 % 2 == 0
gt_3 = a2 > 3
even_and_gt_3 = evens & gt_3

In [None]:
a2[evens]

In [None]:
a2[gt_3]

In [None]:
a2[even_and_gt_3]

In [None]:
a

In [None]:
condA=[True,False,False,True,True,False]

In [None]:
a[condA]

### Careful with copying!

In [None]:
b = a2[1]

In [None]:
b

In [None]:
b[1] = -1

In [None]:
b

In [None]:
a2

In [None]:
b=a2[1]

b[1]=-1
a2

In [None]:
b[:]=3

In [None]:
a2 

In [None]:
c=a2[1].copy()
c

In [None]:
c[:]=[-1000,-1000,-1000]

In [None]:
print(c)
print(a2)

### Element wise operations

In [None]:
print(a2)
a2*a2 

In [None]:
a2 -a2

In [None]:
1/a2

In [None]:
a2**2

In [None]:
np.sin(a2)

### Matrix operations

In [None]:
d = np.array([1.0,1.0,2.0,3.0,0])

In [None]:
1+1+4+9

In [None]:
d.dot(d)

In [None]:
a.dot(a)

In [None]:
a2.dot(np.arange(3))

In [None]:
a2

In [None]:
a2.transpose()

In [None]:
a2.T

In [None]:
print(a2.sum())
print(a2.cumsum())
print(a2.cumprod())

### `ndarray` vs matrix

In [None]:
# np.asmatrix views a2 as the 2-D `matrix` subclass, for which `*` means
# matrix product. The short alias `np.mat` was removed in NumPy 2.0.
a3 = np.asmatrix(a2)
a3

In [None]:
a3.T

In [None]:
a2*a2

In [None]:
a3.T*a3

In [None]:
a3[:2,:2].I

In [None]:
a3[:2,:2].I*a3[:2,:2]

In [None]:
# np.matrix exposes .T and .I but has no `det` attribute; the determinant
# is computed by numpy.linalg.
np.linalg.det(a3[:2,:2])

### Linear Algebra

[NumPy linear algebra routines (`numpy.linalg`)](https://numpy.org/doc/stable/reference/routines.linalg.html)

In [None]:
from numpy import linalg as la

help(la)

In [None]:
la.det(a3[:2,:2])

In [None]:
np.linalg.det(a3[:2,:2])

### Trace, determinant and inverse

In [None]:
np.trace(np.eye(3))

In [None]:
print(a2)
print(np.trace(a2))
print(np.trace(a2,offset=1))
print(np.trace(a2,offset=-1))

In [None]:
a = np.array([[1, 2], [3, 4]])
print(a)
print(a.shape)
la.det(a)

In [None]:
a = np.array([ [[1, 2], [3, 4]], [[1, 2], [2, 1]], [[1, 3], [3, 1]] ])
a

In [None]:
print(a.shape)
print(la.det(a))

In [None]:
la.inv(np.array([[1,2],[3,4]]))

In [None]:
la.inv(np.array([[1,2],[3,4]])).dot(np.array([[1,2],[3,4]]))

#### Exercise

In a chicken and rabbit farm, there are 35 heads and 94 legs. How many chickens and how many rabbits do we have?



Remember:

$$A \cdot X = b$$

$$A^{-1} \cdot A \cdot X = I \cdot X = A^{-1} \cdot b$$

$$X = A^{-1} \cdot b$$

In [None]:
a2

In [None]:
A = np.array([[1,1],[2,4]])

In [None]:
b = np.array([35,94])

In [None]:
np.linalg.solve(A,b)

In [None]:
X = np.linalg.inv(A).dot(b)
X

### Norm of a vector and a matrix

We can also calculate norms: the order-2 norm is the Euclidean norm, i.e. the length of a vector.

In [None]:
la.norm(np.arange(3))

In [None]:
math.sqrt(0**2 + 1 ** 2 + 2 ** 2)

In [None]:
a = np.array([[1, 2], [3, 4]])
help(la.norm)

In [None]:
print(a)
print(la.norm(a))
print(la.norm(a,2))
print(la.norm(a,'fro'))
print(la.norm(a.reshape(4),2))
print(la.norm(a,1))
print(la.norm(a,np.inf))

In [None]:
b = np.arange(1,4)
print(b)
print(la.norm(b))
print(la.norm(b,2))
print(la.norm(b,1))
print(la.norm(b,np.inf))

#### Exercise

In a chicken and rabbit farm, there are 35 heads and 94 legs. How many chickens and how many rabbits do we have?



Remember: for any invertible matrix $A$

$$A \cdot X = B$$

$$A^{-1} \cdot A \cdot X = I \cdot X = A^{-1} \cdot B$$

$$X = A^{-1} \cdot B$$

\* The language I've used to represent this formula is [$\LaTeX$](https://www.latex-project.org/). It's used to typeset all kinds of things, from CVs to scientific articles to books. You can find a quick introduction [here](https://www.cs.princeton.edu/courses/archive/spr10/cos433/Latex/latex-guide.pdf). Almost everything you need to know to write equations in the notebook is on pages 4, 5 and 6 of that pdf.

Example:

$A^x A_i A^{-1}$

In [None]:
# Unknowns are X = [chickens, rabbits]: the first row counts legs
# (2 per chicken, 4 per rabbit), the second row counts heads.
# np.asmatrix replaces the `np.mat` alias, which was removed in NumPy 2.0.
A = np.asmatrix([[2,4], [1,1]])
B = np.asmatrix([94,35]).T  # column vector: 94 legs, 35 heads

# X = A^-1 . B; on np.matrix objects, .I is the matrix inverse.
X = A.I.dot(B)
X

## A Linear Regression example with numpy

Now, we are ready to implement our own linear regression example. 

In linear regression, our hypothesis function $h_\theta$ is:

$$h_\theta(x) = \theta_0 + \theta_1x$$

And, as we are doing regression, our cost function is: 

$$J(\theta_0,\theta_1) = \frac{1}{m}\sum_{i=1}^m(\hat{y}_i-y_i)^2 = \frac{1}{m}\sum_{i=1}^m(h_\theta(x_i)-y_i)^2 $$

### Generate dummy data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

theta_0 = 2
theta_1 = 5

X = (np.random.randn(100) + 1) * 50
jitter = 50 * np.random.randn(100)
Y = theta_0 + theta_1 * X + jitter

plt.scatter (X, Y)

That is the spread that we will try to approximate with our line.

### Write the cost function

In [None]:
def cost_function(X, Y):
    """Build the mean-squared-error cost for the line theta_0 + theta_1 * x.

    Parameters
    ----------
    X, Y : array-like of equal length
        Observed inputs and targets. They are converted to float arrays
        once, so plain Python lists work as well as ndarrays.

    Returns
    -------
    callable
        ``J(thetas)``, where ``thetas`` is a 2-item sequence
        ``[theta_0, theta_1]``; it returns the mean of the squared residuals.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    def J(thetas):
        residuals = thetas[0] + thetas[1] * X - Y
        # np.mean keeps the reduction inside NumPy instead of looping over
        # the elements one by one with the builtin sum().
        return np.mean(residuals ** 2)

    return J

J = cost_function(X, Y)
J([theta_0,theta_1])

In [None]:
from scipy.optimize import fmin

fmin(J, [0,0])

### Gradient descent

Remember, we have to descend in the direction of the steepest gradient. For that, we need to know what direction the gradient points!

### Partial differentials of the cost function

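$$\frac{\partial J}{\partial\theta_0} = \frac{2}{m}\sum_{i=1}^m(h_\theta(x_i)-y_i)$$

$$\frac{\partial J}{\partial\theta_1} = \frac{2}{m}\sum_{i=1}^m(h_\theta(x_i)-y_i) \cdot x_i$$

The code below drops the constant factor 2: it only rescales the gradient, so it can be absorbed into the learning rate.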

In [None]:
def derivative_theta_0(X, Y):
    """Build the gradient component of the cost with respect to theta_0.

    Parameters
    ----------
    X, Y : array-like of equal length
        Observed inputs and targets; converted to float arrays once.

    Returns
    -------
    callable
        ``g(theta_0, theta_1)`` returning the mean residual
        ``mean(theta_0 + theta_1 * X - Y)``. This equals the partial
        derivative of the mean squared error up to a constant factor of 2.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    def gradient(theta_0, theta_1):
        # Vectorised mean instead of the builtin sum() over array elements.
        return np.mean(theta_0 + theta_1 * X - Y)

    return gradient

def derivative_theta_1(X, Y):
    """Build the gradient component of the cost with respect to theta_1.

    Parameters
    ----------
    X, Y : array-like of equal length
        Observed inputs and targets; converted to float arrays once.

    Returns
    -------
    callable
        ``g(theta_0, theta_1)`` returning
        ``mean((theta_0 + theta_1 * X - Y) * X)``. This equals the partial
        derivative of the mean squared error up to a constant factor of 2.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    def gradient(theta_0, theta_1):
        # Vectorised mean instead of the builtin sum() over array elements.
        return np.mean((theta_0 + theta_1 * X - Y) * X)

    return gradient

In [None]:
J_prime_0 = derivative_theta_0(X, Y)
J_prime_1 = derivative_theta_1(X, Y)

### Implementing gradient descent

Now we are ready to implement the actual gradient descent. Remember, the algorithm is:

- Initialize variables

- Compute cost function

- Compute gradients

- Update variables: subtract the gradient times the learning rate (alpha) from each variable

- Repeat until convergence: cost at iteration n-1 $\sim$ cost at iteration n

In [None]:
# Start from a random guess for both parameters.
theta_0 = np.random.randn()
theta_1 = np.random.randn()
J_prime_0 = derivative_theta_0(X, Y)
J_prime_1 = derivative_theta_1(X, Y)
convergence_criterion = 1e-1  # stop once the cost changes by less than this
converged = False
alpha = 1e-4  # learning rate (same value as the former 10e-5 literal)
max_iterations = 100  # upper bound in case the criterion is never met
trace = []  # one [theta_0, theta_1] pair per iteration, for plotting

for _ in range(max_iterations):
    trace.append([theta_0, theta_1])

    J_0 = J([theta_0, theta_1])

    # Evaluate both gradient components before changing either parameter,
    # so that the update is simultaneous.
    diff_theta_0 = J_prime_0(theta_0, theta_1)
    diff_theta_1 = J_prime_1(theta_0, theta_1)

    theta_0 = theta_0 - alpha * diff_theta_0
    theta_1 = theta_1 - alpha * diff_theta_1

    J_1 = J([theta_0, theta_1])

    converged = abs(J_0 - J_1) < convergence_criterion
    if converged:
        # "Repeat until convergence": act on the flag instead of always
        # running the full iteration budget.
        break

In [None]:
len(trace)

In [None]:
trace[-10:]

In [None]:
plt.plot(trace)

In [None]:
delta = 10
xs = np.arange(-1000, 1000, delta)
ys = np.arange(-1000, 1000, delta)

x, y = np.meshgrid(xs, ys)
zs =  np.array([J([t0, t1]) for t0 in xs for t1 in ys]).reshape(len(xs), len(ys)).T

m = plt.contour(x, y, zs)
plt.clabel(m, inline=1, fontsize=10)


In [None]:
delta = 1
xs = np.arange(-10, 10, delta)
ys = np.arange(-10, 10, delta)

x, y = np.meshgrid(xs, ys)
zs =  np.array([J([t0, t1]) for t0 in xs for t1 in ys]).reshape(len(xs), len(ys)).T

m = plt.contour(x, y, zs)
plt.clabel(m, inline=1, fontsize=10)

x_steps = [step[0] for step in trace]
y_steps = [step[1] for step in trace]
plt.scatter(x_steps, y_steps)

In [None]:
def make_line(theta_0, theta_1):
    """Sample the line theta_0 + theta_1 * x on a fixed grid.

    The grid is ``np.linspace(-100, 250)`` (NumPy's default of 50 points).
    Returns the pair ``(xs, ys)``, ready to be unpacked into ``plt.plot``.
    """
    abscissas = np.linspace(-100, 250)
    return abscissas, theta_0 + theta_1 * abscissas

lines = [plt.plot(*make_line(*thetas), zorder = 1) for thetas in trace]
plt.scatter (X, Y, zorder = 2)

#### Exercise for home

Fix our linear regression so that it can find $\theta_0$ in a reasonable length of time. What can be the problem?

## `pandas`

### Getting started with pandas

In [None]:
import pandas as pd
import numpy as np

### `pandas` data structures

### Series

The base pandas abstraction. You can think of it as the love child of a numpy array and a dictionary.

In [None]:
s = pd.Series([4, 7, -5, 3])
s

If we provide an index, pandas will use it. If not, it will automatically create one.

In [None]:
print(s.index)
print(s.values)

In [None]:
list('ifneurh')

In [None]:
s2 = pd.Series([1, 2, 4.5, 7, 2, 23, 15], index=['i', 'f', 'n', 'e', 'u', 'r', 'h'])
s2

In [None]:
s2['r']

In [None]:
s2 > 3

In [None]:
s2[s2>3]

In [None]:
evens = s2 % 2 == 0

In [None]:
s2[evens]

In [None]:
s2 * 2

In [None]:
np.exp(s2)

In [None]:
'f' in s2

In [None]:
clase = pd.Series([34, 22, 45, 72], index=['Toni', 'Fulanito', 'Menganito', 'Victor'])

In [None]:
clase[clase==22].index

We can create Series from dictionaries:

In [None]:
sdata = {'B' : 3e6, 'M': 6e6, 'P': 1.2e5, 'V': 7e5}

s3 = pd.Series(sdata)
s3

In [None]:
increase = {'M': 4e5, 'B' : 2e5, 'Z': -2e4}

s4 = pd.Series(increase)

And here is where the magic happens: numpy arrays only identify their contents by position. In contrast, pandas knows their "name" and will align them based on their indexes:

In [None]:
s3.values

In [None]:
s4.values

In [None]:
s3.values + s4.values

In [None]:
s3 + s4

In [None]:
s3.name = 'population_2000'
s3.index.name = 'province'

In [None]:
s3

### DataFrame

This is the object you'll work with most of the time. It represents a table of _m_ observations x _n_ variables. Each variable, or column, is a Series.

In [None]:
dfdata = {
    'province' : ['M', 'M', 'M', 'B', 'B'],
    'population': [1.5e6, 2e6, 3e6, 5e5, 1.5e6],
    'year' : [1900, 1950, 2000, 1900, 2000]   
}

df = pd.DataFrame(dfdata)
df

In [None]:
df2 = pd.DataFrame(dfdata, columns=['province','population', 'year', 'debt'])
df2

In [None]:
df2.index

In [None]:
df2.columns

In [None]:
df2[['population','province']]

In [None]:
df2.population

In [None]:
df2['2nd_language']=list('EEFFG')

In [None]:
df2['2nd_language'] = np.nan

In [None]:
df2.index = [list('EEFFG')]

In [None]:
df2

In [None]:
df2['2nd_language']

In [None]:
# Attribute-style column access only works when the column name is a valid
# Python identifier. '2nd_language' starts with a digit, so this line is a
# SyntaxError; use df2['2nd_language'] instead.
df2.2nd_language

In [None]:
# df2['abs']

In [None]:
df2.index = list('abcde')

In [None]:
df2

In [None]:
df2.loc['a']

In [None]:
df2['debt'] = 10
df2

In [None]:
df2['debt'] = [1,0,2,.5,.7]
df2

In [None]:
df2['capital'] = df2['province'] == 'M'
df2

In [None]:
df2.T

In [None]:
df2

In [None]:
df2.describe()

In [None]:
df2.describe().T

### Index objects

Indexes are immutable.

In [None]:
df2.index[1] = 'x'

In [None]:
df2.index[1]

In [None]:
df2.iloc[2:]

### Dropping entries from an axis

In [None]:
s5 = pd.Series(np.arange(5), list('jduvk'))
s5

In [None]:
s6 = s5.drop(['d','k'])
s6

In [None]:
s5

In [None]:
s5.drop(['d','k'],inplace=False)

In [None]:
s5

By default, `drop()` doesn't modify the original Series; it returns a modified copy. We can change that with the argument `inplace`.

In [None]:
s5

In [None]:
s6['u'] = 7
s5

In [None]:
df2

In [None]:
df2.drop('c')

In [None]:
df2

In [None]:
df2.drop('c', axis=0)

In [None]:
df2.drop('2nd_language', axis=1)

In [None]:
df3 = df2.drop('2nd_language', axis=1)

In [None]:
df3

In [None]:
df4 = df3

In [None]:
df4.drop(['a','b'],inplace=True)

In [None]:
df4

In [None]:
df3

In [None]:
df3 = df2.copy()
df3

In [None]:
df3.drop('capital', axis=1, inplace=True)
df3

In [None]:
df2

#========================================================================

# Toni covered the material up to HERE on 16/11/2019

### Indexing, selection, and filtering

The key here is that we can build boolean Series that we can use to index the original Series or DataFrame. Those booleans can be combined with bitwise boolean operators (&, |, ~) to get filters that are as complex as we need. 

In [None]:
s3

In [None]:
s3[['V', 'M']]

In [None]:
s3[2:]

In [None]:
s3['P':'V']

In [None]:
s3 > 1e06

In [None]:
s3[s3>1e06]

In [None]:
df3

In [None]:
df3[df3['year'] > 1950]

In [None]:
df3[(df3['year'] > 1900) & (df3['debt'] > 1)]

In [None]:
recent = df3['year'] > 1900
indebted = df3['debt'] > 1

df3[recent & indebted]

In [None]:
df3[df3['year'] > 1900][df3['debt'] > 1]

### Function application and mapping

Function application and mapping allows us to modify the elements of a DataFrame (columns with apply or elements with applymap) without for loops. This way we are not constrained to the functions already implemented by pandas or numpy.

In [None]:
df3

In [None]:
np.sqrt(df3['population'])

In [None]:
df4 = pd.DataFrame(np.random.randn(4,3) * 17 + 15, columns=list('bde'), index=list('BMPZ'))
df4

In [None]:
np.abs(df4)

This is a typical use case for lambdas (anonymous functions)

In [None]:
df4.apply(lambda series: series.max() - series.min())

In [None]:
df4.applymap(lambda element: element % 10 )

In [None]:
df4.apply(lambda series: series.max() - series.min(), axis=1)

In [None]:
def f(series):
    """Summarise a Series by its two extremes.

    Returns a two-element Series labelled 'max' and 'min' (in that order)
    holding the largest and the smallest value of ``series``.
    """
    highest = series.max()
    lowest = series.min()
    return pd.Series([highest, lowest], index=['max', 'min'])

df4.apply(f)

In [None]:
for item in df4.items():
    print(item)

In [None]:
# DataFrame.iteritems() was removed in pandas 2.0; items() is the drop-in
# replacement and yields the same (column_name, Series) pairs.
for item in df4.items():
    print(item)

In [None]:
map(f, [1,2])

In [None]:
def format_2digits(number):
    """Render ``number`` as text with exactly two decimal places."""
    two_decimals = '%.2f'
    return two_decimals % number

In [None]:
df4.applymap(format_2digits)

### Sorting and ranking

In [None]:
df4.sort_index(ascending=False)

In [None]:
df4.sort_index(ascending=False, axis=1)

In [None]:
df4.sort_values(by='e')

In [None]:
df4.sort_values(by=['e','b'])

In [None]:
s1 = pd.Series([2,3,8,4,3,2,1], index=list('abcdefg'))
s1

In [None]:
s1.sort_values()

rank() returns the positions of the elements of the Series in its sorted version. If there are ties, it will take averages.

In [None]:
s1.rank()

In [None]:
pd.Series([1,1,1]).rank()

In [None]:
s2 = pd.Series([30,10,20], index=list('abc'))
s2

In [None]:
s2.rank()

In [None]:
help(s2.rank)

#### Exercise

Write a function that takes a Series and returns its top 10% of records (in this case, the top earners). Test it with this Series:

```python
salaries = pd.Series([150000, 90000, 120000,30000,10000,5000,40000, 50000, 80000, 35000, 27000,14000, 28000, 22000,25000])
```

In [None]:
salaries = pd.Series([150000, 90000, 120000,30000,10000,5000,40000, 50000, 80000, 35000, 27000,14000, 28000, 22000,25000])

In [None]:
def top_earners(serie):
    """Return the highest-valued ~10% of the entries of ``serie``.

    Parameters
    ----------
    serie : pandas.Series
        Values to rank (salaries in the exercise).

    Returns
    -------
    pandas.Series
        The ``round(len(serie) / 10)`` largest entries, in ascending order
        and keeping their original index labels. Empty when that count
        rounds to zero.
    """
    number_to_extract = round(len(serie) / 10)
    # Sort the argument itself, not the notebook-level `salaries` variable.
    # tail(0) is empty, whereas the slice [-0:] would return every entry.
    return serie.sort_values().tail(number_to_extract)

top_earners(salaries)

In [None]:
def top_earners(serie, percentile=0.9):
    """Keep the entries whose percentile rank is strictly above ``percentile``.

    ``serie.rank(pct=True)`` expresses each entry's rank as a fraction of
    the Series length; entries above the threshold are returned in their
    original order and with their original index labels.
    """
    return serie.loc[serie.rank(pct=True).gt(percentile)]

print(top_earners(salaries))
print(top_earners(salaries, .8))

## Summarizing and computing descriptive statistics

In [None]:
x = pd.Series([1.2, np.nan, 4, np.nan, 9], index=list('abcde'))
y = pd.Series([5, 3, 7, np.nan, 14], index=list('abcde'))

df = pd.DataFrame([x, y], index=['x','y']).T
df

In [None]:
df.sum()

As with many methods, we can use them in the direction perpendicular to their default.

In [None]:
df.sum(axis=1)

In [None]:
pd.__version__

In [None]:
df.sum(axis=1, skipna=False)

In [None]:
df.mean()

In [None]:
df.mean(axis=1)

In [None]:
df.cumsum()


In [None]:
df.std()

In [None]:
df.describe()

In [None]:
df['x'].sum()

In [None]:
df['x'].describe()

### Unique values, value counts, and membership

In [None]:
s7 = pd.Series(list('gtcaaagcttcga'))
s7

In [None]:
s7.unique()

In [None]:
s7.value_counts()

In [None]:
puric_bases = ['a','g']
s7.isin(puric_bases)

In [None]:
s7[s7.isin(puric_bases)]

## Handling missing data

In [None]:
string_data = pd.Series(['Ma', 'Lu', 'Ca', 'Va', np.nan])
string_data

In [None]:
string_data[string_data!=np.nan]

This is weird... but it has some really good reasons. You can find explanations [here](https://stackoverflow.com/questions/10034149/why-is-nan-not-equal-to-nan) and [here](https://stackoverflow.com/questions/1565164/what-is-the-rationale-for-all-comparisons-returning-false-for-ieee754-nan-values)

In [None]:
np.nan == np.nan

In [None]:
string_data[~string_data.isnull()]

### Filtering out missing data

In [None]:
string_data[string_data.notnull()]

In [None]:
df5 = pd.DataFrame([[1,2,3], 
                    [np.nan, 8, 7], 
                    [4, np.nan, 90], 
                    [67,42,53]], 
                   columns=list('abc'))
df5

In [None]:
df5[df5['a'].notnull()]

In [None]:
df5.notnull()

any() and all() are functions of boolean Series. They reduce the Series to a single boolean value by applying repeatedly the operators "or" and "and", respectively.

In [None]:
df5.notnull().any()

In [None]:
df5.notnull().all()

In [None]:
df5.isnull().any()

In [None]:
df5.dropna()

In [None]:
df5

In [None]:
df5.dropna(axis=1)

In [None]:
array = np.random.randn(8,3) * 20 + 100

df6 = pd.DataFrame(array, columns=list('xyz'), index=list('abcdefgh'))
df6.iloc[2:5, 1] = np.nan
df6.iloc[1:3, 2] = np.nan
df6

The `thresh` argument specifies the minimum number of non-null values required to keep a row (or a column, with `axis=1`).

In [None]:
df6.dropna(thresh=2)

In [None]:
df6.dropna(thresh=2, axis=1)

In [None]:
df6.dropna(thresh=6, axis=1)

### Filling in missing data

In [None]:
df6.fillna(0)

In [None]:
df6.fillna({'x' : 100, 'y' : 50, 'z' : 20})

In [None]:
df6

In [None]:
# ffill() propagates the last valid observation forward. It replaces the
# deprecated fillna(method='ffill') spelling.
df6.ffill()

In [None]:
df6.fillna(df6.median())

In [None]:
df6.median()

# Additional References

[Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do)

[What is SciPy?](https://www.scipy.org/)

[How can SciPy be fast if it is written in an interpreted language like Python?](https://www.scipy.org/scipylib/faq.html#how-can-scipy-be-fast-if-it-is-written-in-an-interpreted-language-like-python)

[What is the difference between NumPy and SciPy?](https://www.scipy.org/scipylib/faq.html#what-is-the-difference-between-numpy-and-scipy)

[Linear Algebra for AI](https://github.com/fastai/fastai/blob/master/tutorials/linalg_pytorch.ipynb)