In [2]:
# %load common.py
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import matplotlib.pyplot as plt

import statsmodels.api as sm

import seaborn as sns

def ols(y, x):
    """Fit an ordinary least squares regression of ``y`` on ``x``.

    A constant column is added to ``x`` via ``sm.add_constant`` before the
    model is built, so the fit includes an intercept term.

    Parameters
    ----------
    y : response values, passed to ``sm.OLS`` as the endogenous variable.
    x : predictor values; the constant is added to these.

    Returns
    -------
    The object returned by ``sm.OLS(...).fit()``.
    """
    design = sm.add_constant(x)
    model = sm.OLS(y, design)
    return model.fit()

def load_autos_dataset():
    """Load ``./data/Auto.csv`` indexed by the ``name`` column.

    Cells containing ``'?'`` are read as missing values, and every row that
    has at least one missing value is dropped before returning.

    Returns
    -------
    pandas.DataFrame with no missing values.
    """
    return pd.read_csv(
        './data/Auto.csv',
        index_col='name',
        na_values='?',
    ).dropna()

def load_boston_dataset():
    """Fetch the ``Boston`` dataset of the R ``MASS`` package via statsmodels.

    Caching is enabled in the ``get_rdataset`` call.

    Returns
    -------
    The ``data`` attribute of the object returned by ``get_rdataset``.
    """
    boston = sm.datasets.get_rdataset('Boston', package='MASS', cache=True)
    return boston.data

def load_carseats_dataset():
    """Load ``./data/Carseats.csv``, using its first column as the index.

    Returns
    -------
    pandas.DataFrame read from the CSV file.
    """
    carseats = pd.read_csv('./data/Carseats.csv', index_col=0)
    return carseats


  from pandas.core import datetools


In [3]:
def get_usarrests_dataset():
    """Fetch the R ``USArrests`` dataset via statsmodels.

    Only the dataset name is passed to ``get_rdataset``; every other argument
    is left at its default.

    Returns
    -------
    The ``data`` attribute of the object returned by ``get_rdataset``.
    """
    return sm.datasets.get_rdataset('USArrests').data

In [4]:
# Load the USArrests data once; later cells read from `usarrests`.
usarrests = get_usarrests_dataset()

In [6]:
# List the variables available in the dataset.
usarrests.columns

Index(['Murder', 'Assault', 'UrbanPop', 'Rape'], dtype='object')

In [7]:
# Summary statistics per column, to compare the scales of the variables.
usarrests.describe()

Unnamed: 0,Murder,Assault,UrbanPop,Rape
count,50.0,50.0,50.0,50.0
mean,7.788,170.76,65.54,21.232
std,4.35551,83.337661,14.474763,9.366385
min,0.8,45.0,32.0,7.3
25%,4.075,109.0,54.5,15.075
50%,7.25,159.0,66.0,20.1
75%,11.25,249.0,77.75,26.175
max,17.4,337.0,91.0,46.0


In [10]:
# Per-column variance. The columns differ by orders of magnitude (see output),
# which is why the data is standardized before PCA is fitted.
usarrests.var(axis=0)

Murder        18.970465
Assault     6945.165714
UrbanPop     209.518776
Rape          87.729159
dtype: float64

In [32]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from sklearn.decomposition import PCA
# No n_components is given, so the estimator uses its default (n_components=None,
# as shown in the repr printed after pca.fit).
pca = PCA()

In [19]:
# Standardize each column: subtract the column mean, then divide by the
# column standard deviation (pandas' default .std()).
usarrests_std = usarrests.sub(usarrests.mean()).div(usarrests.std())

In [29]:
# Sanity check: after centering, every column mean should be ~0
# (only floating-point noise remains).
usarrests_std.mean(axis=0)

Murder      1.543210e-16
Assault     1.115774e-16
UrbanPop   -4.307665e-16
Rape        8.704149e-16
dtype: float64

In [30]:
# Sanity check: after scaling, every column should have standard deviation 1.
usarrests_std.std()

Murder      1.0
Assault     1.0
UrbanPop    1.0
Rape        1.0
dtype: float64

In [33]:
# Fit PCA on the standardized data rather than on the raw `usarrests` frame.
pca.fit(usarrests_std)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [34]:
# Loadings of the fitted components: a 4x4 array here (see output).
# NOTE(review): rows should be components and columns the input variables
# in `usarrests.columns` order — confirm against the scikit-learn PCA docs.
pca.components_

array([[ 0.53589947,  0.58318363,  0.27819087,  0.54343209],
       [ 0.41818087,  0.1879856 , -0.87280619, -0.16731864],
       [-0.34123273, -0.26814843, -0.37801579,  0.81777791],
       [ 0.6492278 , -0.74340748,  0.13387773,  0.08902432]])

In [35]:
# Variance attributed to each fitted component, in the same order as
# the rows of pca.components_.
pca.explained_variance_

array([ 2.48024158,  0.98976515,  0.35656318,  0.17343009])

In [36]:
# Fraction of the total variance attributed to each component
# (the values in the output sum to 1).
pca.explained_variance_ratio_

array([ 0.62006039,  0.24744129,  0.0891408 ,  0.04335752])

In [45]:
# Draw a 2x25 array of standard-normal samples.
# The legacy global RNG is seeded first so that Restart & Run All reproduces
# exactly the same draws; the seed value itself is arbitrary.
np.random.seed(0)
x = np.random.normal(size=(2, 25))

In [46]:
# Inspect the freshly drawn samples.
x

array([[-0.59999448, -1.16758941, -0.78688966,  1.35359972, -1.05258981,
        -1.24798481, -0.39144593, -0.5175452 ,  0.53139393, -1.23670896,
         0.86967154, -1.42442979, -0.44962706, -0.01443289, -0.18236743,
         0.83654455,  0.58899987, -0.66213923, -0.68059788, -0.09527533,
         0.27971584, -1.24626772, -0.2694203 , -1.63076827,  0.17196682],
       [ 0.63141818,  0.15201701,  1.89715525, -1.7054607 ,  0.43082858,
         0.51048398, -0.37576741, -1.1832589 ,  0.3302341 ,  0.36762636,
        -1.05531466, -0.14726889,  0.99355395,  1.00649879, -0.13391217,
        -0.3507843 , -0.38800363,  0.56847732, -0.29525985,  1.20628059,
        -0.72503123, -1.11024226,  1.06030585,  1.55845171, -1.36360491]])

In [50]:
# Shift the two rows apart, in place: row 0 by +3 and row 1 by -4.
# NOTE: this mutates `x`, so running this cell again shifts the rows a second
# time. Re-run the cell that creates `x` before re-running this one.
x[0] += 3
x[1] -= 4

In [51]:
# Inspect `x` again after the in-place row shifts.
x

array([[ 2.40000552,  1.83241059,  2.21311034,  4.35359972,  1.94741019,
         1.75201519,  2.60855407,  2.4824548 ,  3.53139393,  1.76329104,
         3.86967154,  1.57557021,  2.55037294,  2.98556711,  2.81763257,
         3.83654455,  3.58899987,  2.33786077,  2.31940212,  2.90472467,
         3.27971584,  1.75373228,  2.7305797 ,  1.36923173,  3.17196682],
       [-3.36858182, -3.84798299, -2.10284475, -5.7054607 , -3.56917142,
        -3.48951602, -4.37576741, -5.1832589 , -3.6697659 , -3.63237364,
        -5.05531466, -4.14726889, -3.00644605, -2.99350121, -4.13391217,
        -4.3507843 , -4.38800363, -3.43152268, -4.29525985, -2.79371941,
        -4.72503123, -5.11024226, -2.93969415, -2.44154829, -5.36360491]])