In [1]:
import pandas as pd

## Guiding principles

The package `sklearn` has the vast majority of simple and moderately complex techinques compiled into one place, already optimized for speed and disrtiuted computing.

Let's load in a dataset so that we can compare several techniques. Included in your download of `sklearn` are several datasets for practice, so let's focus on the Boston housing proce data:

In [2]:
from sklearn.datasets import load_boston
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)  #the data starts off as a 2d numpy array
targets = pd.DataFrame(boston.target)   #the targets start off as a numpy array
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


Personally, I like to have a cell where I do all of my data cleaning and separate it from my import step:

In [3]:
boston.DESCR



In [4]:
# The CHAS column indicates whether the property borders the Charles river
df.CHAS = df.CHAS.astype('bool')
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,False,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,False,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,False,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,False,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,False,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


Let's give ourselves a training and test split:

In [5]:
train_x = df[df.index < 402]
train_y = targets[targets.index < 402]  
print(train_x.shape)
print(train_y.shape)

test_x = df[df.index >= 402]
test_y = targets[targets.index >= 402]  
print(test_x.shape)
print(test_y.shape)


(402, 13)
(402, 1)
(104, 13)
(104, 1)


All of the data lines up and we're using all 506 rows without any randomization or sampling

## Now, let's do some machine learning

In [6]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()    # Parameters passed to the model would go here
model.fit(train_x, train_y)
preds = model.predict(test_x)

We needed to pass in a one-column dataframe to the model to get it to run - `sklearn` is peppered with places where it expects 1d arrays versus 2d arrays which have only one column. Let's make it a Series for our use and to get a measure of fit

In [7]:
preds = pd.DataFrame(preds).iloc[:,0]
test_y = pd.DataFrame(test_y).reset_index().iloc[:,1]

rmse = ((test_y - preds)**2).mean()
rmse

34.74785187584871

That's a pretty poor result, given that min(targets) is 5 and max(targets) is 50 - we're off by a lot! Let's look at the model:

In [8]:
pd.DataFrame(model.coef_, columns = train_x.columns)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.200476,0.044246,0.053886,1.814954,-14.973226,4.790741,0.002981,-1.298891,0.472508,-0.015506,-0.810878,-0.001861,-0.529321


Let's cross validate this instead:

In [9]:
from sklearn.linear_model import ElasticNetCV
model = ElasticNetCV()    # the default is for 100 values of alpha
model.fit(train_x, train_y.iloc[:,0])
preds = model.predict(test_x)

preds = pd.DataFrame(preds).iloc[:,0]

rmse = ((test_y - preds)**2).mean()
rmse

20.461496359476996

In [10]:
pd.DataFrame(model.coef_.reshape(1,-1), columns = train_x.columns) #reshape takes our 1d array and transposes it

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.11181,0.054688,-0.0,0.0,-0.0,1.536801,0.025337,-0.874584,0.428578,-0.019975,-0.723803,0.009126,-0.804585


Now, let's look at some cross validated ridge regression:

In [11]:
from sklearn.linear_model import Ridge
model = Ridge()   
model.fit(train_x, train_y.iloc[:,0])
preds = model.predict(test_x)

preds = pd.DataFrame(preds).iloc[:,0]

rmse = ((test_y - preds)**2).mean()
rmse

32.44853185269181

In [12]:
pd.DataFrame(model.coef_.reshape(1,-1), columns = train_x.columns) #reshape takes our 1d array and transposes it

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.196403,0.04561,0.022446,1.73557,-7.891178,4.807516,-0.002397,-1.206595,0.455834,-0.016502,-0.735022,0.000463,-0.54011


## Hardier ML methods

In [13]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()    
model.fit(train_x, train_y.iloc[:,0].values)
preds = model.predict(test_x)

preds = pd.DataFrame(preds).iloc[:,0]

rmse = ((test_y - preds)**2).mean()
rmse

16.655170192307693

The default is for RandomForestRegressor to grow 10 trees, let's grow more than that

In [14]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10000)    
model.fit(train_x, train_y.iloc[:,0].values)
preds = model.predict(test_x)

preds = pd.DataFrame(preds).iloc[:,0]

rmse = ((test_y - preds)**2).mean()
rmse

18.081233938670167

There's some improvement, but not a lot. Sad! Let's train a neural network

## Neural Networks
currently still working on this one, keras is being obstinate

In [15]:
str(train_x.shape)

'(402, 13)'

In [None]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(13, input_dim=13, init='normal', activation='relu'))
	model.add(Dense(1, init='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

In [None]:
train_x.shape

In [None]:
seed = 7
numpy.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0)



kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, train_x, train_y.iloc[:,0], cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))


In [17]:
# generate dummy data
import numpy as np
#data = np.random.random((1000, 784))
data = train_x
#labels = np.random.randint(2, size=(1000, 1))
labels = train_y

from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential()
model.add(Dense(1, input_shape=data.shape, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])



# train the model, iterating on the data in batches
# of 32 samples
model.fit(data, labels, nb_epoch=10, batch_size=32)

ValueError: Error when checking model input: expected dense_input_1 to have 3 dimensions, but got array with shape (402, 13)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential()
model.add(Dense(32, batch_input_shape=(None, 13)))
model.add(Activation("relu"))
model.add(Dense(output_dim=32))
model.add(Activation("softmax"))

model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

from keras.optimizers import SGD
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True))

model.fit(train_x, train_y, nb_epoch=5, batch_size=32)

In [None]:
model = Sequential()
model.add(Dense(32, input_dim=32)
model.add(Activation("relu"))
model.add(Dense(output_dim=402))
model.add(Activation("softmax"))

model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

from keras.optimizers import SGD
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True))

model.fit(train_x, train_y, nb_epoch=5, batch_size=32)

loss_and_metrics = model.evaluate(test_x, test_y, batch_size=32)

classes = model.predict_classes(test_x, batch_size=32)
proba = model.predict_proba(test_x, batch_size=32)

In [None]:
import numpy as np 
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils

from keras.datasets import mnist
 
# Load pre-shuffled MNIST data into train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()


In [None]:
X_train

In [None]:
from matplotlib import pyplot as plt
plt.imshow(X_train[0])
plt.show()