# 2.3 Lab: Introduction to R

## 2.3.1 Basic Commands

In [None]:
# best practice is to have all the modules imported at the top of the file, but for this one, I will import them when we need them
import numpy as np  # for calculation purpose, let us use np.array 
import random # for the random number generation

# create a one-dimensional NumPy array holding four integers
x = np.array((1, 3, 2, 5))
# print displays the contents of the array
print(x)

In [None]:
# x is a NumPy array while y deliberately stays a plain Python list
x, y = np.array([1, 6, 2]), [1, 4, 3]
# show both, one per line
print(x, y, sep="\n")

In [None]:
# use len() to find the length of a vector; for a NumPy array it returns the
# number of elements along the first axis
len(x)

In [None]:
len(y)

In [None]:
# x is a NumPy array and y is a plain Python list; NumPy converts the list on
# the fly, so + performs element-wise addition rather than list concatenation
print(x + y)

In [None]:
# The whos function allows us to look at a list of all of the objects, such as data and functions, that we have saved so far
%whos

In [None]:
# delete the variable x from the session with Python's del statement
# (IPython's %reset_selective magic is a regex-based alternative)
del x

In [None]:
%whos

In [None]:
# read the description of a function 
%whos?

In [None]:
# a matrix can be written as a nested list (a list of rows);
# note that this is a plain Python list, not a NumPy array
x = [[1,2],[3, 4]]
print(x)

In [None]:
# a one-dimensional array can be turned into a matrix by reshaping it
x = np.array([1, 2, 3, 4])
print(x)
# the 2 x 2 result is filled row by row (NumPy's default C order)
x = x.reshape(2, 2)
print(x)

In [None]:
# then we can use the matrix to do some calculations
# a notebook cell only displays the value of its last expression, so the
# earlier results are printed explicitly to make all three visible
print(np.sqrt(x))  # element-wise square root
print(x**2)        # element-wise square via the power operator
np.square(x)       # the same squares via the NumPy function

In [None]:
# draw random samples from normal distributions with np.random
mu, sigma = 0, 1
x = np.random.normal(loc=mu, scale=sigma, size=5)
# y is x plus noise centred at 20 with standard deviation 0.1
y = x + np.random.normal(loc=20, scale=0.1, size=5)
print(x, y, sep="\n")


In [None]:
# more calculation: np.corrcoef returns the correlation matrix of x and y
np.corrcoef(x, y) 

In [None]:
# the call above returns the full correlation matrix; the off-diagonal entry
# [0, 1] is the correlation coefficient between x and y
np.corrcoef(x, y)[0,1]

In [None]:
# seed NumPy's global random number generator so that every time we run the
# code we get the same result; the samples in this notebook are drawn with
# np.random, which the standard-library random.seed() does not affect
np.random.seed(2333)

In [None]:
# once NumPy's global generator has been seeded, this draw produces the same
# numbers every time the notebook is run from the top
np.random.normal(mu, sigma, 5) 

In [None]:
# increase num_samples to watch the empirical moments converge to the
# theoretical ones of the distribution
mu, sigma = 0, 1
num_samples = 10
x = np.random.normal(mu, sigma, num_samples)
# mean, variance, and the standard deviation computed two ways (square root of
# the variance and np.std); NumPy's var/std default to ddof=0
print(x.mean(), x.var(), np.sqrt(x.var()), x.std(), sep="\n")

## 2.3.2 Graphics

In [None]:
import numpy as np  # for calculation purpose, let use np.array 
import random # for the random 

# two independent samples of 100 standard-normal draws, one per plot axis
x = np.random.normal(0, 1, 100)
y = np.random.normal(0, 1, 100)

# in python, matplotlib is the most used library for plot 
# matplotlib.pyplot is a collection of command style functions that make matplotlib work like MATLAB.
import matplotlib.pyplot as plt


# scatter the points as blue circles ('bo'); use plt.plot? to look at more options
plt.plot(x, y, 'bo')
plt.title("Plot of X vs Y")
plt.xlabel("this is the x-axis")
plt.ylabel("this is the y-axis")
# plt.savefig writes the current figure to a file; it is called before
# plt.show() so the figure is saved while it still holds the plot
plt.savefig('Figure.pdf')
plt.show()


In [None]:
# np.arange follows Python's half-open convention: the right end of the
# range specification (11) is excluded, so this yields 1 through 10
x = np.arange(1, 11, 1)
print(x)

In [None]:
# note: np.arange can give unexpected results with non-integer steps. Its
# length is computed as ceil((stop - start) / step) in floating point, so
# rounding error can add an element and make the stop value appear in the
# result; compare np.arange(0.2, 0.6, 0.4) with np.arange(0.2, 1.6, 1.4).
# np.linspace is the safer choice when the step is fractional.
print(np.arange(0.2, 0.6, 0.4))
print(np.arange(0.2, 1.6, 1.4))

In [None]:
# the math module has to be loaded before math.pi can be used
import math
# 50 evenly spaced points from -pi to pi, both end points included
x = np.linspace(start=-math.pi, stop=math.pi, num=50)
print(x)

In [None]:
import matplotlib.cm as cm
import matplotlib.mlab as mlab
# y is another name for the same array as x, so the grid is square;
# np.meshgrid expands the two coordinate vectors into 2-D arrays X and Y,
# where X varies along columns and Y along rows (default 'xy' indexing)
y = x
X, Y = np.meshgrid(x,y)

In [None]:
%whos

In [None]:
# evaluate f = cos(Y) / (1 + X^2) on the grid and draw its contour lines;
# as before, plt.contour? lists the available options
f = np.cos(Y) / (1 + X * X)
CS = plt.contour(X, Y, f)
plt.show()

In [None]:
# I think imshow looks nicer for a heatmap; 'extent=' fixes the x and y axes
# fa is the antisymmetric part of f (f.T is the transpose; np.transpose(f) also works)
fa = (f - f.T)/2
# origin='lower' puts row 0 of fa (which corresponds to y[0]) at the bottom,
# so the image agrees with the (left, right, bottom, top) extent; with the
# default origin='upper' the rows would be drawn top-down against those labels
plt.imshow(fa, extent=(x[0], x[-1], y[0], y[-1]), origin='lower')
plt.show()

In [None]:
from mpl_toolkits.mplot3d import axes3d
# draw fa as a 3-D wireframe on a single set of axes filling the figure
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_wireframe(X, Y, fa)
plt.show()

## 2.3.3 Indexing Data 
Here we use a NumPy array. If the data is stored in a different structure, the methods below may not work.

In [None]:
# the numbers 1..16 laid out row by row and then transposed, so that the
# matrix is effectively filled column by column as in the book
A = np.arange(1, 17).reshape(4, 4).T
print(A)

In [None]:
# one thing to note here is that in Python the index starts from 0, not 1,
# so A[2, 3] is the third row and fourth column
print(A[2, 3])

In [None]:
# the previous cell used the same indices as the book but returned a different
# number, because R (like Matlab) starts the index from 1 while Python starts
# from 0; to select the same number (10) as the book we reduce each index by 1
print(A[1, 2])

In [None]:
# to select a submatrix, the index arrays must be shaped so that they broadcast
# against each other: an n x 1 array of row indices and a 1 x m array of column
# indices give an n x m result; np.ix_ builds exactly that pair from two lists
A[np.ix_([0, 2], [1, 3])]

In [None]:
# this is another way to do that: the slice start:stop:step with 0:3:2 picks
# rows 0 and 2, and 1:4:2 picks columns 1 and 3
A[0:3:2, 1:4:2] 

In [None]:
# select all columns in those two rows (rows 0 and 2)
A[0:3:2,:]

In [None]:
# select all rows in those two columns (columns 1 and 3)
A[:, 1:4:2] 

In [None]:
# the last two examples leave either the row or the column position as a bare
# ':', which tells Python to include all rows or all columns, respectively;
# here row 0 is selected together with all of its columns
A[0,:]

In [None]:
# the '-' sign has a different meaning in Python than in R: a negative index
# counts from the end, so -1 refers to the last element
A[-1, -1] 

In [None]:
# rows can also be kept or dropped with a boolean mask: start with every row
# selected, then switch off the rows we want to exclude (0 and 2)
ind = np.ones(4, dtype=bool)
ind[[0, 2]] = False
print(ind)

In [None]:
# keep only the rows whose mask entry is True, with all of their columns
A[ind,:]

In [None]:
# if we do not say whether the mask is for rows or columns, it is applied
# to the rows by default
A[ind]

In [None]:
# we use .shape to get the shape of the matrix 
A.shape

## 2.3.4 Loading Data

In Python, pandas is a commonly used module for reading a file into a data frame. I downloaded Auto.csv from the book's website. First, take a look at the CSV file: it has a header row, and missing values are marked by '?'.

In [None]:
import pandas as pd 
# the first line of the file holds the column names (header=0) and every '?'
# entry is read as a missing value
Auto = pd.read_csv('data/Auto.csv', header=0, na_values='?')

In [None]:
# we could use .head to see the first few rows (default = 5) of the data 
Auto.head()

In [None]:
# check one record with a missing value, and make sure the missing value is correctly imported.
# Here we use .iloc to select the row by position, which is different from the indexing method above;
# the reason is that Auto is a pandas DataFrame, while the indexing method above was for a NumPy array
Auto.iloc[32]

In [None]:
# Use the same .shape function as in ndarray to find out the dimension of the data frame 
Auto.shape

In [None]:
# an alternative way to select the first 4 rows. 
Auto[:4]

In [None]:
# an alternative way to select the first 4 rows and first 2 columns.
Auto.iloc[:4, :2]

In [None]:
# we can use list to find the column names or use .columns
print(list(Auto))
print(Auto.columns)

In [None]:
# use .isnull() and .sum() to count how many NaNs there are in each variable
Auto.isnull().sum()

In [None]:
# from the previous steps, there are 397 observations in the data and only 5 of
# them have missing values, so we can simply drop those rows; the shape is
# printed before and after to confirm how many rows were removed
print(Auto.shape)
Auto = Auto.dropna()
print(Auto.shape)

## 2.3.5 Additional Graphical and Numerical Summaries

In [None]:
# a column of a data frame can be referred to by name using '.';
# 'ro' draws red circles (see the options of plt.plot for more)
plt.plot(Auto.cylinders, Auto.mpg, 'ro')
plt.show()

In [None]:
# use .hist to draw histograms of certain variables; column= specifies which ones
Auto.hist(column = ['cylinders', 'mpg'])
plt.show()

In [None]:
# use .describe() to get a summary of the data frame:
#   .describe(include='all')        for mixed types
#   .describe(include=[np.number])  for numerical columns
#   .describe(include=['O'])        for object columns
Auto.describe()

In [None]:
# we can change the type of certain variable(s); here cylinders is converted
# into a categorical variable
Auto['cylinders'] = Auto['cylinders'].astype('category')

In [None]:
Auto.describe()

In [None]:
Auto.describe(include= 'all')

In [None]:
# End of Chapter 2