# Computing - Chapter 03 - (Python)

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Read the raw segmentation data set from the data directory and preview
# its first five rows.
DATA_PATH = "../data"
segDataRaw = pd.read_csv(filepath_or_buffer=DATA_PATH + "/segmentationOriginal.csv")
segDataRaw.head(n=5)

Unnamed: 0,Cell,Case,Class,AngleCh1,AngleStatusCh1,AreaCh1,AreaStatusCh1,AvgIntenCh1,AvgIntenCh2,AvgIntenCh3,...,VarIntenCh1,VarIntenCh3,VarIntenCh4,VarIntenStatusCh1,VarIntenStatusCh3,VarIntenStatusCh4,WidthCh1,WidthStatusCh1,XCentroid,YCentroid
0,207827637,Test,PS,143.247705,1,185,0,15.711864,3.954802,9.548023,...,12.474676,7.609035,2.7141,0,2,2,10.642974,2,42,14
1,207932307,Train,PS,133.752037,0,819,1,31.923274,205.878517,69.91688,...,18.809225,56.715352,118.388139,0,0,0,32.161261,1,215,347
2,207932463,Train,WS,106.646387,0,431,0,28.038835,115.315534,63.941748,...,17.295643,37.671053,49.470524,0,0,0,21.185525,0,371,252
3,207932470,Train,PS,69.150325,0,298,0,19.45614,101.294737,28.217544,...,13.818968,30.005643,24.749537,0,0,2,13.39283,0,487,295
4,207932455,Test,PS,2.887837,2,285,0,24.275735,111.415441,20.474265,...,15.407972,20.504288,45.450457,0,0,0,13.198561,0,283,159


In [7]:
# Keep only the rows whose Case column equals "Train".
segData = segDataRaw.loc[segDataRaw["Case"].eq("Train")]

In [8]:
# Hold on to the Cell, Class and Case columns under their own names.
cellId, cellClass, case = (segData[name] for name in ("Cell", "Class", "Case"))

In [9]:
# Remove the Cell, Class and Case columns from the working frame.
segData = segData.drop(labels=["Cell", "Class", "Case"], axis="columns")

In [10]:
# Collect every column whose name contains "Status", then drop them all at once.
statusColumns = [name for name in segData.columns if "Status" in name]
segData = segData.drop(labels=statusColumns, axis="columns")

## Transformations

### Skewness

In [11]:
# Skewness of a single predictor: AngleCh1.
from scipy.stats import skew
skew(segData["AngleCh1"])

-0.024298630435426738

In [12]:
# Compute the skewness of every column (axis=0 applies skew column by column),
# then show the first five (column name, skewness) pairs.
skewValues = segData.apply(skew, axis=0)
# A plain loop replaces the list comprehension: printing is a side effect, so
# there is no result list worth building (or suppressing with a trailing ';').
for nameAndSkew in list(zip(segData.columns, skewValues))[0:5]:
    print(nameAndSkew)

('AngleCh1', -0.024298630435426738)
('AreaCh1', 3.5303544460710095)
('AvgIntenCh1', 2.9635898861967189)
('AvgIntenCh2', 0.84942278590563092)
('AvgIntenCh3', 2.2056202512609824)


### Box-Cox Transformation

In [31]:
# Estimate a Box-Cox transformation for AreaCh1 and report summary figures.
from scipy.stats import boxcox

areaCh1 = segData["AreaCh1"]
boxCoxTrans, lmbda = boxcox(areaCh1)

# Summary of the untransformed predictor.
print(areaCh1.describe())
largestOverSmallest = areaCh1.max() / areaCh1.min()
print('Largest/Smallest: %.1f' % largestOverSmallest)
print('Sample Skewness: %.2f' % areaCh1.skew())

# The lambda returned by boxcox and the first few transformed values.
print('Estimated Lambda: %f' % lmbda)
print('example transformed values: %s ...' % (boxCoxTrans[0:5],))

count    1009.000000
mean      325.125867
std       216.555276
min       150.000000
25%       194.000000
50%       256.000000
75%       376.000000
max      2186.000000
Name: AreaCh1, dtype: float64
Largest/Smallest: 14.6
Sample Skewness: 3.54
Estimated Lambda: -0.855869
example transformed values: [ 1.16465199  1.16190479  1.15949121  1.15825373  1.15832111] ...


In [32]:
# Show the first raw AreaCh1 values, for comparison with a manual
# transformation that uses the lambda value generated above.
segData["AreaCh1"].head(n=5)

1     819
2     431
3     298
11    256
14    258
Name: AreaCh1, dtype: int64

In [33]:
import math

# Compare the first value produced by boxcox with the same value computed by hand.
print("Automatically generated transformation: %f" % boxCoxTrans[0])
# boxCoxTrans[0] is positional, so the raw value is taken by position as well.
# The label-based lookup AreaCh1[1] only lined up with it because the first
# training row happens to carry index label 1.
firstArea = segData.AreaCh1.iloc[0]
# Manual transformation: (x ** lambda - 1) / lambda
manuallyGenerated = (math.pow(firstArea, lmbda) - 1) / lmbda
print("Manually generated transformation: %f" % manuallyGenerated)

Automatically generated transformation: 1.164652
Manually generated transformation: 1.164652


### Scaling

In [None]:
import sklearn.preprocessing

# Fit a StandardScaler on the predictors and apply it to the same data.
scaler = sklearn.preprocessing.StandardScaler().fit(X=segData)
scaledData = scaler.transform(X=segData)

# Peek at the first five fitted mean_ and scale_ entries.
print(scaler.mean_[:5])
print(scaler.scale_[:5])

# TODO: zip columns, scaler mean/std, pd mean/std

### PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with default settings on the scaled data.
pca = PCA()
pcaObject = pca.fit(scaledData)

# Express each explained-variance ratio as a percentage and show the first three.
percentVariance = list(pcaObject.explained_variance_ratio_ * 100)
percentVariance[:3]

In [None]:
# Preview the transformed data: first five rows of the first five components.
pcaTransform = DataFrame(
    pca.transform(scaledData)[0:5, 0:5],
    columns=["PC1", "PC2", "PC3", "PC4", "PC5"],
)
pcaTransform

In [None]:
# Preview the transposed components_ matrix: first six rows, first five columns.
pcaComponents = DataFrame(
    pcaObject.components_.T[0:6, 0:5],
    columns=["PC1", "PC2", "PC3", "PC4", "PC5"],
)
# Display the rows labelled with the first six column names of segData
# (the labelled frame is shown, not stored back into pcaComponents).
pcaComponents.set_index(segData.columns[0:6])

### Spatial Sign

See ... (**TODO**)