In [86]:
from io import StringIO
import pandas as pd
import numpy as np
pd.set_option('display.float_format', lambda x: '{:.2f}'.format(x))
%precision 2

'%.2f'

In [87]:
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
0.0,11.0,12.0,'''

In [88]:
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


###### Print the count of null / np.nan values per row and per column

In [89]:
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [90]:
df.isnull().sum(axis=1)

0    0
1    1
2    1
dtype: int64

##### Show the underlying numpy array

In [91]:
df.values

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,  nan,   8.],
       [  0.,  11.,  12.,  nan]])

##### Drop missing values (1) per row and (2) per column

In [92]:
df.dropna()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [93]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


##### What are the effects of the optional parameters of `.dropna()`?

##### Answer:

##### Use [sklearn.preprocessing.Imputer](http://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values) to replace the missing values by the column mean.

In [94]:
from sklearn.preprocessing import Imputer

In [95]:
Imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

In [96]:
imp.fit(df)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [97]:
df2 = imp.transform(df)

In [98]:
df2

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [  0. ,  11. ,  12. ,   6. ]])

In [99]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


##### Use [sklearn.preprocessing.scale](http://scikit-learn.org/stable/modules/preprocessing.html#standardization-or-mean-removal-and-variance-scaling) to standardize the data.

In [100]:
from sklearn import preprocessing

In [101]:
X_train = df2
X_scaled = preprocessing.scale(X_train)

In [115]:
X_scaled


array([[-0.46, -1.18, -1.22, -1.22],
       [ 1.39, -0.09,  0.  ,  1.22],
       [-0.93,  1.27,  1.22,  0.  ]])

##### Show mean and standard deviation after the last transformation.

In [157]:
X_scaled.mean()

0.00

In [159]:
X_scaled.std()

1.00

##### Create a DataFrame with 3 columns with labels 'y', 'x1' and 'x2', and 100 rows of random integers in [-20, 80].

In [106]:
data = np.random.randint(-20, 80, size=(100,3))
columns = ['y', 'x1', 'x2']
df = pd.DataFrame(data=data)
df.columns = columns
df.columns


Index(['y', 'x1', 'x2'], dtype='object')

In [107]:
df.head()

Unnamed: 0,y,x1,x2
0,10,47,14
1,13,59,35
2,54,-4,2
3,31,5,59
4,40,28,-10


##### Use [sklearn.crossvalidation](http://scikit-learn.org/stable/modules/cross_validation.html) to split the DataFrame into a train set with 80 rows and a test set with 20 rows

In [108]:
from sklearn import cross_validation
X = df[['x1', 'x2']]
y = df.y
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)



In [110]:
print (X_train.shape)
print (X_test.shape)
print (y_train.shape)
print (y_test.shape)

(80, 2)
(20, 2)
(80,)
(20,)


##### Use [sklearn.preprocessing.StandardScaler](http://scikit-learn.org/stable/modules/preprocessing.html#standardization-or-mean-removal-and-variance-scaling) to standardize both train and test data

In [128]:
from sklearn.preprocessing import StandardScaler

In [146]:
scaler = StandardScaler()
scaler.fit(X_train, X_test)
scaler.transform(X_train, X_test)


array([[ 1.15,  0.35],
       [ 0.46, -0.49],
       [ 0.02,  0.06],
       [-0.31,  0.73],
       [-0.31, -0.24],
       [-1.23, -0.45],
       [-0.2 ,  0.31],
       [ 1.  , -0.03],
       [ 1.11,  0.27],
       [ 1.88,  1.49],
       [ 0.27, -1.42],
       [-0.28, -0.4 ],
       [ 1.37,  1.32],
       [-1.15,  0.69],
       [ 1.48, -0.15],
       [ 1.66,  1.24],
       [-0.79,  2.08],
       [ 0.86,  0.82],
       [-1.63,  1.62],
       [-1.37, -0.49],
       [ 0.13,  0.73],
       [-1.34, -1.88],
       [ 0.86, -0.11],
       [-0.35,  0.65],
       [-1.12,  0.19],
       [-1.3 ,  0.4 ],
       [-0.79, -0.83],
       [ 1.26,  0.27],
       [-0.31, -0.45],
       [-1.23,  0.27],
       [ 0.86,  1.45],
       [-0.93, -1.29],
       [ 0.02, -1.54],
       [-0.82,  0.02],
       [-0.02, -1.88],
       [-0.02,  1.74],
       [ 1.81, -1.16],
       [-0.13, -0.91],
       [-0.9 , -1.79],
       [ 0.09, -0.19],
       [-1.59,  0.9 ],
       [-1.08,  0.44],
       [ 0.27, -1.2 ],
       [ 0.

In [147]:
scaler.fit(y_train, y_test)
scaler.transform(y_train, y_test)



array([-0.55, -0.25,  0.99,  1.81,  0.73, -1.04, -0.51,  0.95, -0.44,
       -1.26, -0.59, -1.45,  1.36,  1.44, -1.68,  1.44, -1.45, -0.85,
       -0.85,  1.03,  1.14, -1.56,  1.51, -0.06, -0.4 , -0.25, -0.36,
       -0.44, -0.17, -0.7 ,  0.99,  1.63,  0.46,  0.13, -0.62, -1.41,
       -0.74, -1.3 , -0.81,  0.84, -0.77,  0.01,  0.16,  0.05, -1.07,
       -0.25, -0.36,  1.1 , -0.59,  1.18, -0.66, -0.1 , -1.19, -0.55,
        0.99,  1.36,  0.76, -0.74,  0.8 , -0.66,  0.61,  0.61, -1.34,
        0.13, -1.68,  1.81,  0.99, -0.06,  0.05,  1.89, -1.26,  0.88,
        1.63, -0.7 ,  1.36, -1.11, -0.4 , -0.17,  1.66, -1.07])

##### Show mean and standard deviations for all subsets

In [149]:
scaler.transform(X_train, X_test).mean()

-0.02

In [150]:
scaler.transform(y_train, y_test).std()



1.00

##### What's the difference between StandardScaler and scale, and why does it matter? 

##### Answer:

##### Create a new column for the below DataFrame that translates the sizes into suitable integer values using `.map()`.

In [162]:
df = pd.DataFrame([
['green', 'M', 10.1, 'class1'],
['red', 'L', 13.5, 'class2'],
['blue', 'XL', 15.3, 'class1'],
['orange', 'L', 13.6, 'class2']], 
columns=['color', 'size', 'price', 'class'])
df

Unnamed: 0,color,size,price,class
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1
3,orange,L,13.6,class2


In [163]:
size_to_integer = { 'M' : 2,
                    'L' : 3,
                    'XL' : 4}

In [164]:
df['size_1'] = df['size'].map(size_to_integer)

In [165]:
df

Unnamed: 0,color,size,price,class,size_1
0,green,M,10.1,class1,2
1,red,L,13.5,class2,3
2,blue,XL,15.3,class1,4
3,orange,L,13.6,class2,3
