# This is the Python Code for Chapter 2 "Statistical Learning"

## 2.3.1 Basic Commands

In [None]:
import numpy as np  # for calculation purpose, let use np.array 
import random # for the random 

# Build a 1-D NumPy array and display it.
x = np.array([1, 3, 2, 5])
# print() as a function call: the bare `print x` statement is a SyntaxError on Python 3.
print(x)

In [None]:
# x is a NumPy array, while y is deliberately left as a plain Python list.
x = np.array([1, 6, 2])
# print() as a function call: the bare `print x` statement is a SyntaxError on Python 3.
print(x)
y = [1, 4, 3]

### use len() to find length of a vector

In [None]:
# Length of the NumPy array x (number of elements along its first axis).
len(x)  

In [None]:
# len() works the same way on the plain Python list y.
len(y)

In [None]:
# x is a NumPy array and y a plain list (they were defined differently),
# but NumPy still performs the addition element-wise.
print(x + y)
# Rebind y as a NumPy array so that both operands have the same type from here on.
y = np.array([1, 4, 3])

In [None]:
# IPython magic (the bare form relies on automagic): list the variables in the interactive namespace.
whos

In [None]:
# Remove the name x from the namespace (IPython alternative: %reset_selective x).
del x # reset_selective x

In [None]:
# List the namespace again with the explicit magic prefix; x should no longer be listed.
%whos

In [None]:
# IPython help syntax: a trailing '?' shows the documentation of the %reset magic.
reset?

In [None]:
# A 2x2 "matrix" written as a nested Python list (a list of rows).
x = [[1, 2], [3, 4]]
# print() as a function call: the bare `print x` statement is a SyntaxError on Python 3.
print(x)

In [None]:
# Flat 4-element array; the next cell reshapes it into a 2x2 matrix.
x = np.array([1, 2, 3, 4])

In [None]:
# Reshape x into 2 rows and 2 columns; NumPy fills the result row by row by default.
x = np.reshape(x, (2, 2))
# print() as a function call: the bare `print x` statement is a SyntaxError on Python 3.
print(x)

In [None]:
# Element-wise square root, followed by two equivalent ways of squaring.
# Each result is printed explicitly: a notebook cell only echoes its last bare
# expression (and a plain script echoes none), so the first two results were
# silently discarded before.
print(np.sqrt(x))

print(x**2)
print(np.square(x))

In [None]:
# Parameters of the standard normal distribution used for x.
mu, sigma = 0, 1
# 50 draws from N(mu, sigma).
x = np.random.normal(mu, sigma, 50)
# y is x shifted by noise centred at 50 with a small spread (0.1), so x and y
# are strongly correlated.
y = x + np.random.normal(50, 0.1, 50)
# print() as a function call (Python 3); both arrays are shown, separated by a space.
print(x, y)

In [None]:
# Correlation-coefficient matrix of the two variables x and y.
np.corrcoef(x, y) 

### Above will return the correlation matrix 

In [None]:
# Pick the off-diagonal entry [0, 1]: the correlation between x and y as a single number.
np.corrcoef(x, y)[0,1]

In [None]:
import random 
# Seed BOTH generators. random.seed() only affects the standard-library
# `random` module; the np.random.normal() draws used in this notebook come from
# NumPy's own global generator, which has to be seeded separately for the
# results to be reproducible.
random.seed(2333)
np.random.seed(2333)

In [None]:
# This draw is reproducible across runs only when NumPy's generator has been
# seeded beforehand (np.random.seed); seeding the stdlib `random` module does not affect it.
np.random.normal(mu, sigma, 50)

In [None]:
# 100 draws from a normal distribution with mean mu and standard deviation sigma.
y = np.random.normal(mu, sigma, 100)

In [None]:
# Sample mean of y; print() as a function call so the cell also runs on Python 3.
print(np.mean(y))

In [None]:
# Variance of y (np.var divides by N by default, i.e. ddof=0);
# print() as a function call so the cell also runs on Python 3.
print(np.var(y))

In [None]:
# The standard deviation is the square root of the variance, so both lines
# print the same value. print() as a function call so the cell runs on Python 3.
print(np.sqrt(np.var(y)))
print(np.std(y))

### if we raise the number of samples to a larger number, the mean and std will be closer to (0, 1)

In [None]:
# Same distribution, but with 5000 draws instead of 100.
y = np.random.normal(mu, sigma, 5000)
# print() as a function call: the bare print statement is a SyntaxError on Python 3.
print(np.mean(y))
print(np.std(y))

## 2.3.2 Graphics

In [None]:
import numpy as np  # for calculation purpose, let use np.array 
import random # for the random 

# Two independent samples of 100 standard-normal draws each.
x = np.random.normal(loc=0, scale=1, size=100)
y = np.random.normal(loc=0, scale=1, size=100)

# matplotlib is the most widely used plotting library in Python;
# matplotlib.pyplot is its command-style interface that works like MATLAB.
import matplotlib.pyplot as plt

# Scatter plot of y against x; the format string 'bo' draws blue circle markers
# (see plt.plot? for the other options).
plt.plot(x, y, 'bo')
plt.title("Plot of X vs Y")
plt.xlabel("this is the x-axis")
plt.ylabel("this is the y-axis")
# Save the current figure to a PDF file, then display it.
plt.savefig('Figure.pdf')
plt.show()


In [None]:
# np.arange excludes the right end of the range, so this yields the integers 1 to 10.
x = np.arange(1, 11)
# print() as a function call: the bare `print x` statement is a SyntaxError on Python 3.
print(x)



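### note: with a non-integer step, np.arange can give unexpected results because of floating-point rounding (the right endpoint may end up included); compare np.arange(0.2, 0.6, 0.4) with np.arange(0.2, 1.6, 1.4). np.linspace is the safer choice in that case.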

In [None]:
# The constant pi lives in the math module, so that module has to be imported first.
import math
# 50 evenly spaced points from -pi to pi (np.linspace includes both endpoints by default).
x = np.linspace(-math.pi, math.pi, num=50)
# print() as a function call: the bare `print x` statement is a SyntaxError on Python 3.
print(x)

In [None]:
import matplotlib.cm as cm
import matplotlib.mlab as mlab
# Use the same grid values on the y-axis (y is another name for the x array, not a copy).
y = x
# Coordinate matrices for evaluating a function on every (x, y) grid point.
X, Y = np.meshgrid(x,y)

In [None]:
# IPython magic (the bare form relies on automagic): list the variables defined so far.
whos

In [None]:
# Evaluate f(x, y) = cos(y) / (1 + x^2) on the whole grid at once.
f = np.cos(Y) / (1 + X**2)
# Contour plot of f over the grid; the returned contour set is kept in CS.
CS = plt.contour(X, Y, f)
plt.show()


### same as above, use plt.contour? to explore the options

In [None]:
# Half the difference between f and its transpose (f.T is the same as f.transpose()).
fa = 0.5 * (f - f.transpose())
# Heatmap of fa; `extent` labels the axes with the data range instead of pixel indices.
plt.imshow(fa, extent=(x[0], x[-1], y[0], y[-1]))
plt.show()

### I think imshow looks nicer for a heatmap; use 'extent=' to fix the ranges of the x and y axes

In [None]:
from mpl_toolkits.mplot3d import axes3d
# New figure holding a single subplot with a 3-D projection.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
# Draw fa as a wireframe surface over the (X, Y) grid.
ax.plot_wireframe(X, Y, fa)
plt.show()

## 2.3.3 Indexing Data

In [None]:
# 4x4 matrix holding 1..16. NumPy fills row by row, so the transpose gives
# the column-by-column layout (first column is 1, 2, 3, 4).
A = np.arange(1, 17, 1).reshape(4, 4).transpose()
# print() as a function call: the bare `print A` statement is a SyntaxError on Python 3.
print(A)

In [None]:
# Zero-based indexing: row index 2, column index 3 (the third row, fourth column).
A[2, 3]

### We tried the same index as the book, but we got a different number. The reason is that R starts indexing from 1 (Matlab too), but Python starts indexing from 0. To select the same number (10) as the book did, we reduce each index by 1

In [None]:
# Row index 1, column index 2: the second row, third column in 1-based terms.
A[1, 2]

### to select a submatrix, need the non-singleton dimension of your indexing array to be aligned with the axis you're indexing into, e.g. for an n x m 2D subarray: A[n by 1 array,1 by m array]

In [None]:
# A column of row indices ([[0], [2]]) is broadcast against a row of column
# indices ([1, 3]), selecting rows 0 and 2 at columns 1 and 3 (a 2x2 result).
A[[[0],[2]], [1,3]]

In [None]:
# Slices are written start:stop:step with stop excluded, so this selects the
# contiguous block of rows 0-2 and columns 1-3.
A[0:3:1, 1:4:1]

In [None]:
# Rows 0 and 1, all columns.
A[0:2,:]

In [None]:
# All rows, columns 0 and 1.
A[:,0:2]

### The last two examples include either no index for the columns or no index for the rows. These indicate that Python should include all columns or all rows, respectively

In [None]:
# The first row, returned as a 1-D array.
A[0,:]

### '-' sign has a different meaning in Python. This means index from the end, -1 means the last element 

In [None]:
# Negative indices count from the end: last row, last column.
A[-1, -1] 

### There are quite a few ways to let Python keep all rows except certain index. Here boolean was used.

In [None]:
# Boolean row mask of length 4: True keeps a row, False drops it.
# Rows 0 and 2 are the ones marked for exclusion.
ind = np.array([False, True, False, True])

In [None]:
# Display the boolean mask.
ind

In [None]:
# Keep only the rows where ind is True, with all columns.
A[ind,:]

In [None]:
# Same selection: a boolean mask given on its own applies to the first axis (rows).
A[ind]

In [None]:
# Dimensions of A as a (rows, columns) tuple.
A.shape

## 2.3.4 Loading Data

### In Python, Pandas is a commonly used module for reading a file into a data frame. I downloaded Auto.csv from the book website. First, take a look at the csv file: it has a header row, and missing values are marked by '?'.

In [None]:
import pandas as pd 
# Load the Auto data set into a DataFrame.
Auto = pd.read_csv(
    'data/Auto.csv',
    header=0,         # the first line of the file holds the column names
    na_values=['?'],  # '?' entries are read in as missing values (NaN)
)

### check one record with missing value, and make sure the missing value is correctly imported 

In [None]:
# Record at integer position 32 (zero-based); inspect it to confirm that a '?' entry was read in as NaN.
Auto.iloc[32]

### Use the same function as in ndarray to find out the dimension of the data frame 

In [None]:
# (rows, columns) of the data frame, same attribute as on a NumPy array.
Auto.shape

In [None]:
# Slicing a DataFrame with [:4] selects its first four rows.
Auto[:4]

In [None]:
# Position-based selection: first four rows, first two columns.
Auto.iloc[:4, :2]

In [None]:
# Iterating over a DataFrame yields its column labels, so this lists the variable names.
list(Auto)

### Use .isnull and .sum to find out how many NaNs there are in each variable

In [None]:
# isnull() flags the missing cells; sum() then counts them per column.
Auto.isnull().sum()

### after the previous steps, there are 397 obs in the data and only 5 with missing values. We can just drop the ones with missing values  

In [None]:
# Drop every row (axis=0) that contains at least one missing value (how='any');
# these are the defaults of dropna(), spelled out here for clarity.
Auto = Auto.dropna(axis=0, how='any')

In [None]:
# Dimensions after dropping the rows with missing values.
Auto.shape

## 2.3.5 Additional Graphical and Numerical Summaries

### Refer to a column of a data frame by name using a '.'. See the options of plt.plot for more.

In [None]:
# Scatter of mpg against cylinders; 'ro' draws red circle markers.
# Columns are picked with bracket access, which is equivalent to Auto.cylinders / Auto.mpg.
plt.plot(Auto['cylinders'], Auto['mpg'], 'ro')
plt.show()

### Use .hist to get the histogram of certain variables. column = to specify which variable

In [None]:
# Histograms of the two selected variables, one subplot per column.
Auto[['cylinders', 'mpg']].hist()
plt.show()

### Use the .describe() to get a summary of the data frame. Use .describe ( include = 'all' ) for mix types, use describe(include = [np.number]) for numerical columns, use describe(include = ['O']) for objects.

In [None]:
# Summary statistics of the data frame (for mixed-type frames the default covers the numeric columns).
Auto.describe()

### We can change type of certain variable(s). Here changed the cylinders into categorical variable 

In [None]:
# Convert cylinders from a numeric column to a categorical one, in place on Auto.
Auto['cylinders'] = Auto.cylinders.astype('category')

In [None]:
# Re-run the default summary after the dtype change to see how the categorical column is treated.
Auto.describe()

In [None]:
# include='all' summarises every column regardless of dtype.
Auto.describe(include= 'all')