# NumPy

### Chapter 2: Numpy

In [None]:
# focus on lists and dictionaries as they are most frequently used datatypes. 
# NumPy utilizes "ndarray" object, limitation it can handle only kind of data
# %timeit

In [1]:
import numpy as np
import time
import timeit
#import pyfits
import numpy.random as rand
import matplotlib.pyplot as mpl
import scipy.cluster.hierarchy as hy
import scipy.ndimage as ndimage
import skimage.morphology as morph
import skimage.exposure as skie
from skimage import filters as skie

from scipy import stats
from scipy.stats import norm, geom
from scipy.cluster import vq
from scipy.spatial import distance
from scipy.spatial.distance import pdist, squareform
from scipy.misc import imread, imsave
from scipy.sparse.linalg import eigsh
from scipy.linalg import eigh
import scipy.sparse


from scipy.optimize import curve_fit, fsolve
from scipy.interpolate import interp1d, UnivariateSpline, griddata
from scipy.interpolate import SmoothBivariateSpline as SBS
from scipy.integrate import quad, trapz

from mpl_toolkits.mplot3d  import Axes3D

from glob import glob

from sklearn import linear_model
from sklearn.datasets.samples_generator import make_regression
from sklearn.cluster import DBSCAN

In [None]:
# create an array with 10^7 elements
arr = np.arange(1e7)

# converting ndarray to list
larr = arr.tolist()

# Lists by default can't boradcast so a function is coded to emulate what an ndarray can do
def list_times(alist, scalar):
    for i, val in enumerate(alist):
        alist[i] = val *scalar
    return alist

# Using Python's %timeit to 
%timeit arr * 1.1

# using def
%timeit list_times(larr, 1.1)

In [None]:
# Create a 3D numpy array
arr = np.zeros((3,3,3))

# Trying to convert an array to matrix but it would fail
try:
    mat = np.matrix(arr)
except ValueError as e:
    print('Error: '+str(e))

In [None]:
#Array creation and data typing
#Create a list then wrap it as np array
alist = [1,2,3]
arr = np.array(list)

#Create an array with np elements
arr = np.zeros(5)

#Create and array going from 0 to 100
arr = np.zeros(100)

# or array from 10 to 100
arr = np.arange(10,100)

# 100 steps from 0 to 1
arr = np.linspace(0,1,100)

# logspace
arr = np.logspace(0,1,100, base = 10.0)

# Create an 5 X 5 array of zeros
arr = np.zeros((5,5))

# create a 5X5X5 cube of 1's
cube = np.zeros((5,5,5)).astype(int) + 1
cube = np.ones((5,5,5)).astype(np.float16)

# Array of zero integers
arr = np.zeros(2, dtype = int)
#Array of zero floats
arr = np.zeros(2, dtype = np.float32)

#Creating an array with elements from 0 to 999
arr1d = np.arange(1000)

#Now reshaping the array
arr3d = arr1d.reshape((10,10,10))

#or
arr3d = np.reshape(arr1d, (10,10,10))

#Inversely we can flatten arrays
arr4d = np.zeros((10,10,10,10))
arr1d = arr4d.ravel() #flattening the array'

In [None]:
# Record Arrays
# rec arrays
recarr = np.zeros((2,), dtype =('i4,f4,a10'))

col1 = np.arange(2) + 1
col2 = np.arange(2, dtype =np.float32)
col3 = ['Hello', 'World']

# Here we create a list of tuples that is identical to previous add to list
toadd = zip(col1, col2, col3)

#assigning values to recarr
recarr[:] =  toadd

recarr

#assigning names to each column which are now by default called  'fo', 'f1', 'f2'
recarr.dtype.names = ('Integers','Floats', 'Strings')

recarr['Integers']

In [None]:
## Indexing and slicing
alist = [[1,2],[3,4]]
alist[0][1]

#Converting the list defined above into an array
arr = np.array(alist)

#To return (0,1) element we use...
arr[0,1]

#Now to access last column
arr[:,1]

#Accessing bottom row
arr[1,:]

#Creating an array
arr = np.arange(5)

#Creating the index array
index = np.where(arr > 2)
print(index)

#Creating the indexarray
new_arr =  arr[index]

# remove/delete
new_arr = np.delete(arr, index)

index = arr > 2
print(index)
new_arr = arr[index]

~ index ## inverts the array

In [None]:
#Creating an image
img1 = np.zeros((20,20)) + 3
img1[4:-4,4:-4] = 6
img1[7:-7,7:-7] = 9

#filter out all values larger than 2 and less than 6
index1 = img1 > 2
index2 = img1 < 6
compound_index = index1 & index2

# The compound statement can alternatively be written as
compound_index = (img1 > 3) & (img1 < 7)
img2 = np.copy(img1)
img2[compound_index] = 0

#Making boolean arrays even more complex
index3 = img1 == 9
index4 = (index1 & index2) | index3
img3 = np.copy(img1)
img3[index4] = 0

In [None]:
#Creating a 100-element array with random values from a standard nromal ditbn or a gaussian distrbn 
#sigma is 1 and mean 0
a = rand.randn(100)

index = a > 0.2
b = a[index]

b = b ** 2 - 2

a[index]  =b

In [None]:
coeffs = np.array([[3,6,-5],[1,-3,2],[5,-1,4]])
values = np.array([[12] ,[-2], [10]])

coeff_mat = np.matrix(coeffs)
valu_mat = np.matrix(values)
coeff_matI = coeff_mat.getI()
ans = np.dot(coeff_matI,valu_mat)
ans

In [None]:
a= np.array([[3,6,-5],[1,-3,2],[5,-1,4]])
b = np.array([12 ,-2, 10])
#b= np.array([[12] ,[-2], [10]])

#Solving the variables
x = np.linalg.inv(a).dot(b) ##<<-----------------optimiized format 
print(x)


# SciPy

## Chapter 3: SciPy

In [None]:
# Page 17 - 41

In [None]:
#Data Modelling and Fitting


#Creating a function to model and create data
def func(x,a,b):
    return a*x + b

#Generating a clean data
x = np.linspace(0,10,100)
y = func(x,1,2)

#Adding noise to the data
yn = y + 0.9*np.random.normal(size= len(x))

#Executing the curve fit of noisy data
popt, pcov = curve_fit(func, x, yn)

#popt retunrs the best fit values for parameters of the given model (func)
print(popt)

In [None]:
#Gaussian progile, a non-linear function
def func(x,a,b,c):
    return a*np.exp(-(x-b)**2/(2*c**2))

#Generating clean data
x = np.linspace(0,10,100)
y = func(x,1,5,2)

#Adding noise to the data
yn = y+0.2*np.random.normal(size = len(x))

#Executing the curve fit of noisy data
popt, pcov = curve_fit(func, x, yn)

#popt retunrs the best fit values for parameters of the given model (func)
print(popt)

In [None]:
#Two Gaussian model
def func(x,a0,b0,c0,a1,b1,c1):
    return a0*np.exp(-(x-b0)**2/(2*c0**2))\
                + a1*np.exp(-(x-b1)**2/(2*c1**2))\

#Generating clean data
x = np.linspace(0,20,200)
y = func(x,1,3,1,-2,15,0.5)

#Adding noise to the data
yn = y+0.2*np.random.normal(size = len(x))

#Since we are fitting a more complex function, providing guesses for the fitting will lead to better results
guesses = [1,3,1,1,15,1]

#Executing curve_fit on noisy data
popt, pcov = curve_fit(func, x, yn, p0 = guesses)

print(popt)

In [None]:
#Defining function to simplify intersection solution
def findIntersection(func1, func2, x0):
    return fsolve(lambda x: func1(x) - func2(x),  x0)

#Defining functions that will intersect
funky = lambda x : np.cos(x/5) * np.sin(x/2)
line = lambda x : 0.01 * x  - 0.5

#Defining range and getting solutions on intersection points
x = np.linspace(0,45,10000)
result = findIntersection(funky, line, [15,20,30,35,40,45])

#Printing out results
print(result, line(result))

In [None]:
#Interpolation
#Setting up fake data
x = np.linspace(0,10 * np.pi, 20)
y = np.cos(x)

#Interploating data
f1 = interp1d(x, y, kind = 'linear')
fq = interp1d(x, y, kind = 'quadratic')

#x.mean and x.max are used to make sure we do not go beyond the boundaties of the data for the interpolation
xint = np.linspace(x.min(), x.max(), 1000)
yintl = f1(xint)
yintq = fq(xint)

In [None]:
sample = 30
x = np.linspace(1,10*np.pi,sample)
y = np.cos(x) + np.log10(x) + np.random.randn(sample)/10

#Interpolating the data
f = UnivariateSpline(x, y, s = 1)

#x.in and x.max are used to make sure we do not go beyond the boundaries of the data for the interpolation
xint = np.linspace(x.min(), x.max(), 1000)
yint = f(xint)

In [None]:
ripple = lambda x,y: np.sqrt(x**2 + y**2) + np.sin(x**2 + y**2)

#Generating griddled data 
grid_x, grid_y = np.mgrid[0:5:1000j, 0:5:100j]

#Generating sample that interpolation function will see
xy = np.random.rand(1000,2)
sample = ripple(xy[:,0]*5, xy[:,1]*5)

#Interpolating data with a cubic
grid_z0 = griddata(xy*5, sample, (grid_x, grid_y), method = 'cubic')

In [None]:
x, y = xy[:,0], xy[:,1]
sample = ripple(xy[:,0]*5, xy[:,1]*5)

#Interpolating the same data generted above using SBS
fit = SBS(x * 5, y * 5, sample, s = 0.01, kx =4, ky=4)
interp = fit(np.linspace(0,5,1000), np.linspace(0,5,1000))

In [None]:
#Integration
# Analytic Integration
func = lambda x: np.cos(np.exp(x)) **2

#Integrating function with upper and lower solution
solution = quad(func, 0, 3)
print(solution)

In [None]:
# Numerical Integration
x = np.sort(np.random.randn(150) * 4 + 4).clip(0,5)
func = lambda x: np.sin(x)* np.cos(x**2) + 1
y = func(x)

#Integrating the function with upper and lower limits of 0 and 5 respectively
fsolution = quad(func, 0, 5)
dsolution = trapz(y, x = x)
print('fsolution = ' + str(fsolution[0]))
print('dsolution =' + str(dsolution))
print('The difference is '+ str(np.abs(fsolution[0] - dsolution)))

In [None]:
#Statistics: 
#Construct a random array with 1000 elements
x = np.random.randn(1000)
#Calcluating the several of built in methods
mean = x.mean()
std = x.std()
var = x.var()
print(mean, std, var)

In [None]:
x = np.linspace(-5,5,1000)
# Here we set up parameters for the normal distribution where loc is the mean and scale is the standard deviation
dist = norm(loc =0, scale =1)

#Retrieving norms PDF and CDF
pdf = dist.pdf(x)
cdf = dist.cdf(x)

#Here we draw out 500 random variables
sample = dist.rvs(500)

In [None]:
p = 0.5
dist = geom(p)

#Set up simple range
x = np.linspace(0,5,1000)

#Retrieving geom's PMF & CDF
pmf = dist.pmf(x)
cdf = dist.cdf(x)

#Here we draw out 500 random values
sample = dist.rvs(500)

In [None]:
#Generating a normal distribution sample with 100 elements
sample =  np.random.randn(100)

#normal tests the null hypothesis
out = stats.normaltest(sample)
print('normaltest output')
print('Z-score = ' + str(out[0]))
print('Z-score = ' + str(out[1]))

#ks test is the Kolmogrov-Smirnove test for goodness of fit
# Here the sample is being tested against the normal distribution
# D is the KS statistinc and the closer it is to 0 the better it is
out = stats.kstest(sample,'norm')
print('\nkstest output for the Normal distribution')
print('D = '+str(out[0]))
print('P-value = '+str(out[1]))


#Similarly, this can be easily tested against other distributions like wald distribution
out = stats.kstest(sample,'wald')
print('\nkstest output for the Wald distribution')
print('D = '+str(out[0]))
print('P-value = '+str(out[1]))

In [None]:
out = stats.hmean(sample[sample > 0])
print("Harmonic mean = "+ str(out))

#This mean, where values below -1 and above 1 qre removed from mean calculation
out = stats.tmean(sample, limits=(-1,1))
print('\n Trimmed mean = '+ str(out))

In [None]:
#Calculating skewness of sample
out = stats.skew(sample)
print('\nSkewness = '+str(out))

#Addtionally there is handy summary function called describe which gives a quick look at data
out = stats.describe(sample)
print('\nSize = ' + str(out[0]))
print('Min = '+ str(out[1][0]))
print('Max = '+ str(out[1][1]))
print('Mean = '+ str(out[2]))
print('Variance = '+ str(out[3]))
print('Skewness = '+ str(out[4]))
print('Skewness = '+ str(out[5]))

In [None]:
#Spatial and Clustering Analysis: useful for organizing discovered information
# vector quantization, hierarchical clustering
#Creating data:
c1 = np.random.randn(100,2) + 5
c2 = np.random.randn(30,2) - 5
c3 = np.random.randn(50,2)

#pooling all the data into one 180X2 array
data = np.vstack([c1,c2,c3])

#Calculating the cluster centroids and variance from kmeans
centroids, variance = vq.kmeans(data,3)

#The identified variable contains the information we need to separate the points in clusters based on vq function
identified, distance = vq.vq(data, centroids)

#Retrieving coordinates for points in each vq identified core
vqc1 = data[identified ==0]
vqc2 = data[identified ==2]
vqc3 = data[identified ==3]

In [None]:
#Hierarchical Clustering
# Creating a cluster of clusters function
def clusters(number = 20, cnumber = 5, csize = 10):
    #Note that the way the clusters are positioned in Gaussian randomness:
    rnum = np.random.rand(cnumber,2)
    rn = rnum[:,0]*number
    rn = rn.astype(int)
    rn[np.where(rn < 5)] = 5
    rn[np.where(rn > number/2.)] = round(number / 2., 0)
    ra = rnum[:,1]*2.9
    ra[np.where(ra < 1.5)] = 1.5
    cls = np.random.randn(number, 3)*csize
    #Random multipliers for central point of cluster
    rxyz = np.random.randn(number-1,3)
    for i in xrange(cnumber, -1):
        tmp = np.randomrandn(rn[i+1],3)
        x = tmp[:,0] + (rxyz[i,0]*csize)
        y = tmp[:,0] + (rxyz[i,1]*csize)
        y = tmp[:,0] + (rxyz[i,2]*csize)
        tmp = np.column_stack([x,y,z])
        cls = np.vstack([cls,tmp])
    return cls

In [None]:
#Generate  cluster of clusters
cls = clusters()
D = pdist(cls[:,0:2])
D = squareform(D)

In [None]:
#Compute and plot first dendogram.
fig = mpl.figure(figsize=(8,8))
ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
Y1 = hy.linkage(D, method = 'complete')
cutoff = 0.3 * np.max(Y1[:,2])
Z1 = hy.dendrogram(Y1, orientation = 'right', color_threshold = cutoff)
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

In [None]:
ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
Y2 = hy.linkage(D, method = 'average')
cutoff = 0.3 * np.max(Y2[:,2])
Z2 = hy.dendrogram(Y2, color_threshold = cutoff)
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

In [None]:
#Plot distance matrix
ax3 = fig.add_axes([0.3,0.1,0.6,0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1,:]
D = D[:,idx2]
ax3.matshow(D, aspect = 'auto', origin = 'lower', cmap = mpl.cm.YlGnBu)
ax3.xaxis.set_visible(False)
ax3.yaxis.set_visible(False)

#Plot colorbar
fig.savefig("cluster_hy_01.pdf", bbox = 'tight')

In [None]:
#Save imports and cluster fruntion from previos example

def group(data, index):
    number = np.unique(index)
    groups = []
    for i in number:
        groups.append(data[index==i])
    return groups

# Creating a cluster of clusters
cls = clusters()

#calculating the linkage matrix
Y = hy.linkage(cls[:,0:2], method ='complete')

#Here we use the fclsuter function to pull out a collection of flat clusters from the hierarchical dats structure.
#Note that we are using the same cutoff values as in the previous example for the dendogram using the 'complete' method

cutoff = 0.3 * np.max(Y[:,2])
index = hy.fcluster(Y,cutoff,'distance')

groups = group(cls, index)

#Plotting clusters
fig = mpl.figure(figsize =(6,6))
ax = fig.add_subplot(111)
colors = ['r','c','b','g','orange','k','y','gray']
for i, g in enumerate(groups):
    i = np.mod(i, len(colors))
    ax.scatter(g[:,0], g[:,1], c= colors[i], edgecolor = 'none', s =50)
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)

    fig.savefig("cluster_hy_02.pdf", bbox = 'tight')

In [None]:
#Signal and Image processing:
# Getting the list of files in the directory
files = glob('imagery/WestAfrica-Alps_2017-12_imagery/**.jpg')
# Opening up the first image for loop
im1 = imread(files[0]).astype(np.float32)
# Starting loop and continue co-adding new images
for i in xrange(1, len(files)):
    print i
    im1 += imread(files[i]).astype(np.float32)
# Saving img
imsave('stacked_image.jpg', im1)

In [None]:
# This function allows us to place in the
# brightest pixels per x and y position between
# two images. It is similar to PIL's
# ImageChop.Lighter function.
def chop_lighter(image1, image2):
    s1 = np.sum(image1, axis=2)
    s2 = np.sum(image2, axis=2)
    
    index = s1 < s2
    image1[index, 0] = image2[index, 0]
    image1[index, 1] = image2[index, 1]
    image1[index, 2] = image2[index, 2]
    return image1

# Getting the list of files in the directory
#files = glob('space/*.JPG')
# Opening up the first image for looping
im1 = imread(files[0]).astype(np.float32)
im2 = np.copy(im1)

# Starting loop
for i in xrange(1, len(files)):
    print i
    im = imread(files[i]).astype(np.float32)
    # Same before
    im1 += im
    # im2 shows star trails better
    im2 = chop_lighter(im2, im)

# Saving image with slight tweaking on the combination
# of the two images to show star trails with the
# co-added image.
imsave('stacked_image_r2.jpg', im1/im1.max() + im2/im2.max()*0.2)

In [None]:
#Sparse Matrices
N = 3000
# Creating a random sparse matrix
m = scipy.sparse.rand(N, N)
# Creating an array clone of it
a = m.toarray()
print('The numpy array data size: ' + str(a.nbytes) + ' bytes')
print('The sparse matrix data size: ' + str(m.data.nbytes) + ' bytes')
# Non-sparse
t0 = time.time()

res1 = eigh(a)
dt = str(np.round(time.time() - t0, 3)) + ' seconds'
print('Non-sparse operation takes ' + dt)
# Sparse
t0 = time.time()
res2 = eigsh(m)
dt = str(np.round(time.time() - t0, 3)) + ' seconds'
print('Sparse operation takes ' + dt)


# SciKit

### Chapter 4: SciKit: Taking Scipy One Step Further 

### Scikit Image

In [None]:
#Page 43 - 53
## Scikit-Image
#  Scikit-image has fortunately taken on
#  the task of going a step further to provide more advanced functions that we may
#  need for scientific research. These advanced and high-level modules include color
#  space conversion, image intensity adjustment algorithms, feature detections, filters for
#  sharpening and denoising, read/write capabilities, and more

In [None]:
# Generating data points with a non-uniform background
x = np.random.uniform(low=0, high=100, size=20).astype(int)
y = np.random.uniform(low=0, high=100, size=20).astype(int)
# Creating image with non-uniform background
func = lambda x, y: x**2 + y**2
grid_x, grid_y = np.mgrid[-1:1:100j, -2:2:100j]
bkg = func(grid_x, grid_y)
bkg = bkg / np.max(bkg)
# Creating points
clean = np.zeros((100,100))
clean[(x,y)] += 5
clean = ndimage.gaussian_filter(clean, 3)
clean = clean / np.max(clean)
# Combining both the non-uniform background
# and points
fimg = bkg + clean
fimg = fimg / np.max(fimg)
# Defining minimum neighboring size of objects
block_size = 3
# Adaptive threshold function which returns image
# map of structures that are different relative to
# background
adaptive_cut = skif.threshold_adaptive(fimg, block_size, offset=0)

# Global threshold
global_thresh = skif.threshold_otsu(fimg)
global_cut = fimg > global_thresh
# Creating figure to highlight difference between
# adaptive and global threshold methods
fig = mpl.figure(figsize=(8, 4))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
ax1 = fig.add_subplot(131)
ax1.imshow(fimg)
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = fig.add_subplot(132)
ax2.imshow(global_cut)
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
ax3 = fig.add_subplot(133)
ax3.imshow(adaptive_cut)
ax3.xaxis.set_visible(False)
ax3.yaxis.set_visible(False)
fig.savefig('scikit_image_f01.pdf', bbox_inches='tight')

In [None]:
#Local Maxima Page 45
# Generating data points with a non-uniform background
x = np.random.uniform(low=0, high=200, size=20).astype(int)
y = np.random.uniform(low=0, high=400, size=20).astype(int)
# Creating image with non-uniform background
func = lambda x, y: np.cos(x)+ np.sin(y)
grid_x, grid_y = np.mgrid[0:12:200j, 0:24:400j]
bkg = func(grid_x, grid_y)
bkg = bkg / np.max(bkg)


# Creating points
clean = np.zeros((200,400))
clean[(x,y)] += 5
clean = ndimage.gaussian_filter(clean, 3)
clean = clean / np.max(clean)
# Combining both the non-uniform background
# and points
fimg = bkg + clean
fimg = fimg / np.max(fimg)
# Calculating local maxima
lm1 = morph.local_maxima(fimg)
x1, y1 = np.where(lm1.T == True)
# Creating figure to show local maximum detection
# rate success
fig = mpl.figure(figsize=(8, 4))
ax = fig.add_subplot(111)
ax.imshow(fimg)
ax.scatter(x1, y1, s=100, facecolor='none', edgecolor='#009999')
ax.set_xlim(0,400)
ax.set_ylim(0,200)
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
fig.savefig('scikit_image_f02.pdf', bbox_inches='tight')

In [None]:
# Loading astronomy image from an infrared space telescope
img = pyfits.getdata('stellar_cluster.fits')[500:1500, 500:1500]
# Prep file scikit-image environment and plotting
limg = np.arcsinh(img)
limg = limg / limg.max()
low = np.percentile(limg, 0.25)
high = np.percentile(limg, 99.5)
opt_img = skie.exposure.rescale_intensity(limg, in_range=(low, high))
# Calculating local maxima and filtering out noise
lm = morph.is_local_maximum(limg)
x1, y1 = np.where(lm.T == True)
v = limg[(y1, x1)]
lim = 0.5
x2, y2 = x1[v > lim], y1[v > lim]
# Creating figure to show local maximum detection
# rate success
fig = mpl.figure(figsize=(8,4))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
ax1 = fig.add_subplot(121)
ax1.imshow(opt_img)
ax1.set_xlim(0, img.shape[1])
ax1.set_ylim(0, img.shape[0])
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = fig.add_subplot(122)
ax2.imshow(opt_img)
ax2.scatter(x2, y2, s=80, facecolor='none', edgecolor='#FF7400')
ax2.set_xlim(0, img.shape[1])
ax2.set_ylim(0, img.shape[0])
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

In [None]:
# Scikit-Learn
# Linear Regression
# Generating synthetic data for training and testing
X, y = make_regression(n_samples=100, n_features=2, n_informative=1,\
random_state=0, noise=50)
# X and y are values for 3D space. We first need to train
# the machine, so we split X and y into X_train, X_test,
# y_train, and y_test. The *_train data will be given to the
# model to train it.
X_train, X_test = X[:80], X[-20:]
y_train, y_test = y[:80], y[-20:]

# Creating instance of model
regr = linear_model.LinearRegression()
# Training the model
regr.fit(X_train, y_train)
# Printing the coefficients
print(regr.coef_)
# [-10.25691752 90.5463984 ]
# Predicting y-value based on training
X11 = np.array([1.2, 4])
X1 = X11.reshape(1,-1) # <<-- correction step
print(regr.predict(X1))
# 350.860363861
# With the *_test data we can see how the result matches
# the data the model was trained with.
# It should be a good match as the *_train and *_test
# data come from the same sample. Output: 1 is perfect
# prediction and anything lower is worse.
print(regr.score(X_test, y_test))
# 0.949827492261
fig = mpl.figure(figsize=(8, 5))
ax = fig.add_subplot(111, projection='3d')
# ax = Axes3D(fig)
# Data
ax.scatter(X_train[:,0], X_train[:,1], y_train, facecolor='#00CC00')
ax.scatter(X_test[:,0], X_test[:,1], y_test, facecolor='#FF7800')
# Function with coefficient variables
coef = regr.coef_
line = lambda x1, x2: coef[0] * x1 + coef[1] * x2
grid_x1, grid_x2 = np.mgrid[-2:2:10j, -2:2:10j]
ax.plot_surface(grid_x1, grid_x2, line(grid_x1, grid_x2),
alpha=0.1, color='k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
ax.zaxis.set_visible(False)
fig.savefig('scikit_learn_regression.pdf', bbox='tight')

In [4]:
# Clustering:
# Creating data
c1 = np.random.randn(100, 2) + 5
c2 = np.random.randn(50, 2)
# Creating a uniformly distributed background
u1 = np.random.uniform(low=-10, high=10, size=100)
u2 = np.random.uniform(low=-10, high=10, size=100)
c3 = np.column_stack([u1, u2])
# Pooling all the data into one 150 x 2 array
data = np.vstack([c1, c2, c3])
# Calculating the cluster with DBSCAN function.
# db.labels_ is an array with identifiers to the
# different clusters in the data.
db = DBSCAN().fit(data, eps=0.95, min_samples=10)
labels = db.labels_
# Retrieving coordinates for points in each
# identified core. There are two clusters
# denoted as 0 and 1 and the noise is denoted
# as -1. Here we split the data based on which
# component they belong to.
dbc1 = data[labels == 0]
dbc2 = data[labels == 1]
noise = data[labels == -1]
# Setting up plot details
x1, x2 = -12, 12
y1, y2 = -12, 12
fig = mpl.figure()
fig.subplots_adjust(hspace=0.1, wspace=0.1)
ax1 = fig.add_subplot(121, aspect='equal')
ax1.scatter(c1[:,0], c1[:,1], lw=0.5, color='#00CC00')
ax1.scatter(c2[:,0], c2[:,1], lw=0.5, color='#028E9B')
ax1.scatter(c3[:,0], c3[:,1], lw=0.5, color='#FF7800')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax1.set_xlim(x1, x2)
ax1.set_ylim(y1, y2)
ax1.text(-11, 10, 'Original')
ax2 = fig.add_subplot(122, aspect='equal')
ax2.scatter(dbc1[:,0], dbc1[:,1], lw=0.5, color='#00CC00')
ax2.scatter(dbc2[:,0], dbc2[:,1], lw=0.5, color='#028E9B')
ax2.scatter(noise[:,0], noise[:,1], lw=0.5, color='#FF7800')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
ax2.set_xlim(x1, x2)
ax2.set_ylim(y1, y2)
ax2.text(-11, 10, 'DBSCAN identified')
fig.savefig('scikit_learn_clusters.pdf', bbox_inches='tight')

TypeError: fit() got an unexpected keyword argument 'eps'

In [None]:
DBSCAN().fit