In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
clrs = np.array(['#003057', '#EAAA00', '#4B8B9B', '#B3A369', '#377117', '#1879DB', '#8E8B76', '#F5D580', '#002233', '#808080'])

Dataset used in the course

In [None]:
# perovskite_data
df = pd.read_csv('data/perovskite_data.csv')

In [None]:
# Dow impurity data: keep only the timestamp and the impurity measurement.
df = pd.read_excel('data/impurity_dataset-training.xlsx')
# Build the time-indexed frame in a single chain so it is an independent object
# rather than a slice of `df`. Assigning into a slice with .loc triggers
# SettingWithCopyWarning and may leave the 'Date' column un-converted.
dow_df = (
    df[['Date', 'y:Impurity']]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

In [None]:
# Select data based on dates. .loc makes the row (index) slice explicit, and the
# result is stored: a bare expression in the middle of a cell is silently discarded.
dow_window = dow_df.loc['01/01/2016 05:00:00':'01/01/2016 12:00:00']
display(dow_window.head())
dow_df.plot();  # plot the full time series

In [None]:
# Data can be "filtered" using logical statements:
# comparing a column with a scalar gives a boolean Series with one entry per row.
bools = df['x1:Primary Column Reflux Flow'] > 350
print(bools)
# Indexing the frame with the boolean Series keeps only the rows where it is True.
df_filtered = df[bools]
df_filtered.head(3)

In [None]:
# get correlation matrix for the impurity data and find highly correlated features
import seaborn as sns

fig, ax = plt.subplots(figsize = (17, 15), dpi = 150)

# Restrict to numeric columns first: the frame also carries a 'Date' column, and
# pandas >= 2.0 raises in DataFrame.corr() instead of silently dropping
# non-numeric columns.
corr = df.select_dtypes(include = 'number').corr()
sns.heatmap(corr, ax = ax);

# Boolean mask: True for features whose correlation with the averaged composition exceeds 0.95
corr["Avg_Delta_Composition Primary Column"] > 0.95

In [None]:
# Remove a column with DataFrame.drop(); `columns=` is the keyword form of axis=1.
# drop() returns a new frame, so df_dropped_obs itself is left unchanged.
df_no_avg_delta = df_dropped_obs.drop(columns = ['Avg_Delta_Composition Primary Column'])

In [None]:
# Remove outliers from a single feature with a z-score cutoff
z_cutoff = 3
xi = df_dropped_obs["x3:Input to Primary Column Bed 3 Flow"].copy()
mu, stdev = np.mean(xi), np.std(xi)

# z-score: distance of each observation from the mean, in standard deviations
zi = (xi - mu)/stdev
xi_nooutliers = xi[np.abs(zi) < z_cutoff]
print(f'Observations before removing outliers: {xi.shape[0]}')
print(f'Observations after removing outliers: {xi_nooutliers.shape[0]}')

In [None]:
import pandas as pd

df = pd.read_excel('data/impurity_dataset-training.xlsx')
def is_real_and_finite(x):
    """Return True when ``x`` is a real, finite number and False otherwise.

    Values that numpy cannot interpret numerically (for example text entries in a
    spreadsheet cell) are reported as False instead of raising, so the function
    can be mapped over a mixed-type DataFrame.
    """
    try:
        return bool(np.isreal(x)) and bool(np.isfinite(x))
    except (TypeError, AttributeError):
        # np.isfinite (and, for some inputs, np.isreal) raises on non-numeric
        # objects such as strings; those are by definition not real numbers.
        return False

feature_frame = df[df.columns[1:]] #drop the first column (date)
all_data = feature_frame.values
# DataFrame.applymap was renamed DataFrame.map in pandas 2.1 and the old name is
# deprecated; use whichever this pandas version provides.
elementwise = feature_frame.map if hasattr(pd.DataFrame, 'map') else feature_frame.applymap
numeric_map = elementwise(is_real_and_finite)
real_rows = numeric_map.all(axis=1).copy().values #True if all values in a row are real numbers
X_dow = np.array(all_data[real_rows,:-5], dtype='float') #drop the last 5 cols that are not inputs
y_dow = np.array(all_data[real_rows,-3], dtype='float') #output: third column from the end
y_dow = y_dow.reshape(-1,1) #column vector
print(X_dow.shape, y_dow.shape)

Hierarchical data format 


In [None]:
#! pip install h5py
from pathlib import Path

import h5py
import numpy as np

hdf5_path = Path("data/impurity_data.hdf5")
# Remove a stale copy in pure Python: unlike `! rm` this works on every platform
# and does not print an error when the file does not exist yet.
if hdf5_path.exists():
    hdf5_path.unlink()

f = h5py.File(str(hdf5_path), "w") #<- the "w" argument tells h5py to create a new file. "w" stands for "write"
# NOTE(review): X_dow_numbers is not defined in the cells shown here (the cleaned
# input matrix built above is called X_dow) -- confirm which array is intended.
dset = f.create_dataset("training", X_dow_numbers.shape)
dset[:, :] = X_dow_numbers

In [None]:
# Close the HDF5 file handle that was opened for writing.
f.close()
# NOTE(review): f2 is not created in any cell shown here; this line raises
# NameError unless a second file was opened elsewhere -- confirm.
f2.close()

Online data access

In [None]:
import requests

# A timeout keeps the cell from hanging indefinitely if the server does not answer,
# and raise_for_status() turns HTTP errors (404, 503, ...) into an exception
# instead of silently handing back an error page.
page = requests.get('https://pubchem.ncbi.nlm.nih.gov/compound/Ethanol', timeout = 30)
page.raise_for_status()

JSON file

In [None]:
import json

# JSON text is UTF-8; stating the encoding keeps the read independent of the
# platform default. A dedicated handle name also avoids rebinding `f`.
with open('data/ethanol.json', encoding = 'utf-8') as json_file:
    etoh = json.load(json_file)

SMILES string

In [None]:
# Walk the nested record by position to reach the two values of interest.
# NOTE(review): the hard-coded list indices depend on the exact layout of
# data/ethanol.json; a differently ordered record would silently yield other fields.
SMILES = etoh['Record']['Section'][2]['Section'][1]['Section'][3]['Information'][0]['Value']['StringWithMarkup'][0]['String']  # alternative key seen in some records: ['StringValue']
MW = etoh['Record']['Section'][3]['Section'][0]['Section'][0]['Information'][0]['Value']['Number'][0]
print('SMILES: {}'.format(SMILES))
print('Molecular Weight: {}'.format(MW))

In [None]:
# extract the same information with significantly less effort:
# NOTE(review): etoh_simple is not loaded in any cell shown here; presumably it is the
# parsed content of the ethanol_simple.json file mentioned below -- confirm.
SMILES = etoh_simple['PC_Compounds'][0]['props'][18]['value']['sval']  # string-valued property
MW = etoh_simple['PC_Compounds'][0]['props'][17]['value']['fval']  # float-valued property
print('SMILES: {}'.format(SMILES))
print('Molecular Weight: {}'.format(MW))

In [None]:
#Use the ethanol_simple.json file as input. You will need both bonds and atoms information. 
#Note that element refers to the atomic number (e.g. hydrogen is 1).

print(etoh_simple['PC_Compounds'][0]['bonds'])
print(etoh_simple['PC_Compounds'][0]['atoms'])

Application Programming Interfaces (APIs)

RESTful APIs

Python API

In [None]:
import pubchempy as pcpy
#help(pcpy)

In [None]:
compounds = pcpy.get_compounds('Ethanol','name')
print(compounds)
etoh = compounds[0]
first_bond = etoh.bonds[0]
print(first_bond.aid2)
# Bond.aid1 / Bond.aid2 are PubChem atom IDs, which start at 1, whereas etoh.atoms
# is a 0-indexed list. Look atoms up by ID rather than using the ID as a list
# position (which returns the neighbouring atom, or overruns the list).
atoms_by_id = {atom.aid: atom for atom in etoh.atoms}
print(atoms_by_id[first_bond.aid1].element)
print(atoms_by_id[first_bond.aid2].element)

We can also use the PubChemPy API to ask for specific attributes with the `get_properties` method:

In [None]:
p = pcpy.get_properties('CanonicalSMILES', 'ethanol', 'name')
print(p)

In [None]:
#CO2 data
import statsmodels.api as api

sm_data = api.datasets.co2.load_pandas()
co2_df = sm_data.data
co2_df.plot();

In [None]:
# Handwritten-digits dataset bundled with scikit-learn (load_digits).
# Note: this is not the full MNIST set; the `_mnist` suffix on the variables is kept
# because later cells refer to these names.
from sklearn.datasets import load_digits

digits = load_digits()
print("Digits data shape: {}".format(digits.data.shape))
print("Digits output shape: {}".format(digits.target.shape))
X_mnist = np.array(digits.data)  # feature matrix, one row per sample
y_mnist = np.array(digits.target)  # class label of each sample

Regression models 

General Linear Regression

In [None]:
import pandas as pd
from matplotlib import pyplot as plt

# Load the ethanol IR spectrum and fit one peak with polynomial (Vandermonde) features.
df = pd.read_csv('data/ethanol_IR.csv')
x_all = df['wavenumber [cm^-1]'].values
y_all = df['absorbance'].values

# Restrict the fit to a 100-point window of the spectrum
x_peak = x_all[475:575]
y_peak = y_all[475:575]

m = 20  # second argument passed to vandermonde()

x_peak = x_peak.reshape(-1, 1) #create a column vector
# NOTE(review): vandermonde() is not defined in the cells shown here -- confirm it is
# defined before this cell runs.
X_vdm = vandermonde(x_peak, m) #generate Vandermonde matrix
# Normal equations: (X^T X) w = X^T y
b_m = np.dot(X_vdm.T, y_peak) #generate b vector with new features
A_m = np.dot(X_vdm.T, X_vdm) #generate A matrix with new features
w_m = np.linalg.solve(A_m, b_m) #solve Ax=b with new features

yhat_m = np.dot(X_vdm, w_m) #compute predictions
SSE_m = np.sum((y_peak - yhat_m)**2) #compute sum of squared errors
print('Sum of Squared Errors: {}'.format(SSE_m))

Linear Regression

In [None]:
A = X.T@X
b = X.T@y
w_lsr = np.linalg.solve(A,b)

yhat = X@w_lsr
print('Weights from least-squares regression: {}'.format(w_lsr))
print('Original weights to generate data: {}'.format(w))

In [None]:
# Accuracy metrics
# r2 = (SST - SSE)/SST

In [None]:
# Add intercept
# NOTE(review): this adds 1 to every entry of the last column of X, in place. It does
# not append a column of ones, and re-running the cell shifts the column again --
# confirm this is the intended way to introduce an intercept.

X[:,-1] += 1

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=False) #create a linear regression model instance (no intercept needed)
model.fit(X, y_peak) #fit the model
r2 = model.score(X, y_peak) #get the "score", which is equivalent to r^2

yhat = model.predict(X) #create the model prediction

fig, ax = plt.subplots()
ax.plot(x_peak, y_peak, '.')
ax.plot(x_peak, yhat, 'o', markerfacecolor='none')
ax.set_xlabel('wavenumber [$cm^{-1}$]')
ax.set_ylabel('absorbance')
ax.set_title('IR spectra data')
ax.legend(['Original Data', 'Linear Regression'])
print('r^2 = {}'.format(r2))

non-linear regression 

In [None]:
# Automatic Differentiation
# NOTE: this import rebinds the name `np` to autograd's numpy wrapper for every cell
# that runs afterwards, until another cell imports numpy as np again.
import autograd.numpy as np   # autograd has its own "version" of numpy that must be used
from autograd import grad # the "grad" function provides derivatives

def g(lamda, x=x, y=y, m=2):
    """Loss as a function of the parameter vector ``lamda`` only.

    ``x``, ``y`` and ``m`` are forwarded unchanged to ``gaussian_loss``; the defaults
    for ``x`` and ``y`` are the global values at the time this cell is executed.
    """
    return gaussian_loss(lamda, x, y, m)

diff_g = grad(g)  # function returning the derivative of g with respect to its first argument
print(g(lamda))
print(diff_g(lamda))
diff_g

In [None]:
#gradient descent
bad_guess = [0.1, 1.0, 0.5, 0.3, 0.1, 0.4]
better_guess = [0.35, 0.75, 0.21, 0.52, 0.53, 0.11]
guess = bad_guess

N_iter = 1000
h = 0.1
for i in range(N_iter):
    guess = guess - h*np.array(diff_g(guess))

In [None]:
# optimization with scipy
from scipy.optimize import minimize

result = minimize(g, bad_guess, method='BFGS')

# result.x holds the parameters found by the optimizer, so it is the regression
# result; lamda is the reference parameter vector it is compared against.
# (The two labels were previously attached to the wrong values.)
print('Actual Input: {}'.format(str(lamda)))
print('Regression Result: {}'.format(str(result.x)))

Kernel Regression

In [None]:
# Manual implementation of the RBF function

In [None]:
def rbf(x_train, x_test=None, gamma=1):
    """Evaluate Gaussian radial basis functions centred on the training points.

    Parameters
    ----------
    x_train : array-like, shape (M,) or (M, d)
        Centres of the basis functions (one basis function per training point).
    x_test : array-like, shape (N,) or (N, d), optional
        Points at which the basis functions are evaluated. Defaults to ``x_train``.
    gamma : float
        Width parameter; each entry is ``exp(-gamma * ||x_test[i] - x_train[j]||**2)``.

    Returns
    -------
    ndarray, shape (N, M)
        Row ``i`` holds the value of every basis function at ``x_test[i]``.
    """
    if x_test is None:
        x_test = x_train
    centers = np.asarray(x_train, dtype=float)
    points = np.asarray(x_test, dtype=float)
    # Treat 1-D input as one feature per sample so that plain vectors and column
    # vectors are handled identically.
    if centers.ndim == 1:
        centers = centers[:, None]
    if points.ndim == 1:
        points = points[:, None]
    # Broadcasting builds all N x M pairwise squared distances at once, replacing
    # the Python double loop.
    sq_dist = np.sum((points[:, None, :] - centers[None, :, :])**2, axis=-1)
    return np.exp(-gamma*sq_dist)

sigma = 100
gamma = 1./(2*sigma**2)  # Gaussian width expressed as the gamma used by rbf()
x_test = np.linspace(min(x_peak), max(x_peak), 300)
X_rbf = rbf(x_peak, x_test=x_test, gamma=gamma)

fig, ax = plt.subplots()
ax.plot(x_test, X_rbf[:,50], '-')  # the basis function centred on training point 50
ax.set_xlabel('wavenumber [$cm^{-1}$]')
ax.set_ylabel('absorbance')
# Raw string: '\s' is an invalid escape sequence in a normal string literal and
# triggers a warning on recent Python versions. The rendered title is unchanged.
ax.set_title(r'rbf basis $\sigma$ = {}'.format(str(sigma)));

In [None]:
# Features: every training point evaluated against every basis function
X_train = rbf(x_peak, gamma=gamma)

model_rbf = LinearRegression() #create a linear regression model instance
model_rbf.fit(X_train, y_peak) #fit the model
r2 = model_rbf.score(X_train, y_peak) #get the "score", which is equivalent to r^2
print('r^2 = {}'.format(r2))

# Same basis functions (centred on x_peak), evaluated on the dense x_test grid
X_test = rbf(x_peak, x_test=x_test, gamma=gamma)

yhat_rbf = model_rbf.predict(X_test) #create the model prediction

fig, ax = plt.subplots()
ax.plot(x_peak, y_peak, 'o')
ax.plot(x_test, yhat_rbf, '-', markerfacecolor='none')
ax.set_xlabel('wavenumber [$cm^{-1}$]')
ax.set_ylabel('absorbance')
# Raw string avoids the invalid '\s' escape; the rendered title is unchanged.
ax.set_title(r'kernel regression $\sigma$ = {}'.format(str(sigma)))
# The second curve is the kernel-regression prediction, so label it as such.
ax.legend(['Original Data', 'Kernel Regression']);

In [None]:
spacing = 3
sigma = 10
gamma = 1./(2*sigma**2)

x_train = x_peak[::spacing]
y_train = y_peak[::spacing]

X_train = rbf(x_train, gamma=gamma)

model_rbf = LinearRegression() #create a linear regression model instance
model_rbf.fit(X_train, y_train) #fit the model

r2 = model_rbf.score(X_train, y_train)

Cross validation (test train split)

In [None]:
from sklearn.model_selection import train_test_split
np.random.seed(0)

x_train, x_test, y_train, y_test = train_test_split(x_peak, y_peak, test_size=0.4)


k-fold Cross Validation

In [None]:
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)
sigma = 100
gamma = 1. / 2 / sigma**2

fig, ax = plt.subplots()
ax.plot(x_peak, y_peak, '-o', markerfacecolor='none')

r2_test = []

for train_index, test_index in kf.split(x_peak):
    x_train, x_test = x_peak[train_index], x_peak[test_index]
    y_train, y_test = y_peak[train_index], y_peak[test_index]
    
    X_train = rbf(x_train, gamma=gamma)

    model_rbf = LinearRegression() #create a linear regression model instance
    model_rbf.fit(X_train, y_train) #fit the model
    r2 = model_rbf.score(X_train, y_train) #get the "score", which is equivalent to r^2
    print('r^2 training = {}'.format(r2))

    X_test = rbf(x_train, x_test=x_test, gamma=gamma)

    yhat_rbf = model_rbf.predict(X_test) #create the model prediction

    r2 = model_rbf.score(X_test, y_test) #get the "score", which is equivalent to r^2
    print('r^2 testing = {}'.format(r2))
    r2_test.append(r2)
    

Resampling Bootstrapping

In [None]:
from numpy.random import choice #<- draws random samples from a sequence

def bootstrap_linregress(x_all, y_all, N):
    """Fit a straight line to N bootstrap resamples of the data.

    Each resample draws ``len(x_all)`` indices with replacement and fits
    ``y = m*x + b`` to the selected points.

    Returns
    -------
    (m_list, b_list) : two lists of length N with the slope and intercept of
    every resample.
    """
    n_points = len(x_all)
    slopes, intercepts = [], []
    for _ in range(N):
        picks = choice(range(n_points), size=n_points, replace=True)
        x_boot = [x_all[idx] for idx in picks]
        y_boot = [y_all[idx] for idx in picks]
        if np.std(x_boot) > 0:
            slope, intercept = np.polyfit(x_boot, y_boot, deg=1)
        else:
            # Every resampled x is identical, so no slope can be fitted:
            # fall back to a flat line through the mean of y.
            slope, intercept = 0, np.mean(y_boot)
        slopes.append(slope)
        intercepts.append(intercept)
    return slopes, intercepts

Gaussian Process Regression

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

x_peak = x_peak.reshape(-1, 1)
y_peak = y_peak.reshape(-1, 1)
x_train, x_test, y_train, y_test = train_test_split(x_peak, y_peak, test_size = 0.4)

gpr = GaussianProcessRegressor(kernel = RBF(1), alpha = 0.000005)

gpr.fit(x_train, y_train)

y_gpr, y_std = gpr.predict(x_peak, return_std = True)

Complexity Optimization 

In [None]:
#BIC 
def BIC(y, yhat, k):
    """Bayesian information criterion of a fit, ``n*log(sigma**2) + k*log(n)``.

    Parameters
    ----------
    y, yhat : array-like
        Observed and predicted values. Both are flattened first, so a column
        vector ``(n, 1)`` and a flat vector ``(n,)`` can be mixed safely.
    k : int
        Number of model parameters.

    Raises
    ------
    ValueError
        If ``y`` and ``yhat`` do not contain the same number of values.
    """
    y_flat = np.ravel(y)
    yhat_flat = np.ravel(yhat)
    if y_flat.shape != yhat_flat.shape:
        raise ValueError('y and yhat must contain the same number of values')
    # Without flattening, (n, 1) - (n,) silently broadcasts to an n x n matrix
    # and the error estimate is meaningless.
    err = y_flat - yhat_flat
    sigma = np.std(np.real(err))  # standard deviation of the residuals
    n = len(y_flat)
    B = n*np.log(sigma**2) + k*np.log(n)
    return B

from sklearn.linear_model import LinearRegression

def polynomial_features(x, N):
    """Return the matrix of polynomial features ``x**0, x**1, ..., x**(N-1)``.

    Parameters
    ----------
    x : array-like
        Input values; flattened to 1-D, so a column vector ``(n, 1)`` is accepted.
    N : int
        Number of polynomial features (the highest power is ``N - 1``).

    Returns
    -------
    ndarray, shape (n, N)
        Rows correspond to data points, columns to powers of ``x``.
    """
    # Flatten first: with an (n, 1) input the stacked result would be 3-D.
    x = np.ravel(x)
    # range(0, N) yields the N exponents 0 .. N-1; stacking gives (N, n), so
    # transpose to put the features in columns.
    return np.array([x**k for k in range(0,N)]).T

N = 40
X_poly = polynomial_features(x_peak, N)

LR_poly = LinearRegression() #create a linear regression model instance
LR_poly.fit(X_poly, y_peak) #fit the model
yhat_poly = LR_poly.predict(X_poly)

BIC_poly = BIC(y_peak, yhat_poly, N)

Regularization

In [None]:
from sklearn.kernel_ridge import KernelRidge
sigma = 10
gamma = 1./(2*sigma**2)

alpha = 0.1

KRR = KernelRidge(alpha=alpha, kernel='rbf', gamma=gamma)
x_peak = x_peak.reshape(-1,1) #we need to convert these to columns
y_peak = y_peak.reshape(-1,1)

KRR.fit(x_peak, y_peak)

x_predict = np.linspace(min(x_peak), max(x_peak), 300) #create prediction data
yhat_KRR = KRR.predict(x_predict)

r2_test = KRR.score(x_test, y_test)

coeffs= KRR.dual_coef_

In [None]:
# LASSO

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics.pairwise import rbf_kernel  # used below; not imported before this point in the notebook

sigma = 10
gamma = 1./(2*sigma**2)

alpha = 1e-4  # strength of the L1 penalty

# Build the training features with the same kernel and the same gamma that are used
# for the prediction features below; otherwise the two feature sets do not match.
# NOTE(review): assumes x_train is the (n, 1) training split created earlier -- confirm.
X_train = rbf_kernel(x_train, x_train, gamma=gamma)

LASSO = Lasso(alpha=alpha)
LASSO.fit(X_train, y_train)
print('The number of coefficients: {}'.format(len(LASSO.coef_)))

x_predict = np.linspace(min(x_peak), max(x_peak), 300) #create prediction data
X_predict = rbf_kernel(x_predict, x_train, gamma=gamma)

yhat_LASSO = LASSO.predict(X_predict)


coeffs = LASSO.coef_

In [None]:
# Keep one entry per coefficient that is NOT numerically zero, then count them
nonzero = [is_zero for is_zero in np.isclose(coeffs, 0) if not is_zero]
print('Total number of non-zero parameters: {}'.format(len(nonzero)))

Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

sigmas = np.array([5, 10, 15, 20, 25, 30,35, 40])
gammas = 1./(2*sigmas**2)

alphas = np.array([1e-9, 1e-5, 1e-4,1e-3, 1e-2,1e-1, 1])

parameter_ranges = {'alpha':alphas, 'gamma':gammas}

KRR = KernelRidge(kernel='rbf')

KRR_search = GridSearchCV(KRR, parameter_ranges, cv=3)
KRR_search.fit(x_train,y_train)
KRR_search.best_estimator_, KRR_search.best_score_

yhat_KRR = KRR_search.best_estimator_.predict(x_predict)

bootstrap

In [None]:
from numpy.random import choice #<- randomly select items from a list

# NOTE: this definition repeats the bootstrap_linregress defined in the
# "Resampling Bootstrapping" section above; the later definition replaces the earlier one.
def bootstrap_linregress(x_all, y_all, N):
    """Fit a line to N bootstrap resamples; return (list of slopes, list of intercepts)."""
    m_list = []
    b_list = []
    for n in range(N):
        # draw len(x_all) indices with replacement
        subset = choice(range(len(x_all)), size=len(x_all), replace=True)
        xprime = [x_all[j] for j in subset]
        yprime = [y_all[j] for j in subset]
        if np.std(xprime) > 0:
            m, b = np.polyfit(xprime, yprime, deg=1)
        else:
            # all resampled x values are equal: use a flat line through mean(y)
            m = 0
            b = np.mean(yprime)
        
        m_list.append(m)
        b_list.append(b)
    return m_list, b_list

High Dimensional Regression

In [None]:
#feature visualization histogram
print('X dimensions: {}'.format(X.shape))
print('Feature names: {}'.format(x_names))
N = X.shape[-1]
# Choose a near-square grid that is guaranteed to hold all N panels: a fixed
# n x (n+1) grid is too small for some N (e.g. N = 8 gives 2 x 3 = 6 axes).
n = max(1, int(np.sqrt(N)))
n_cols = int(np.ceil(N / n))
fig, axes = plt.subplots(n, n_cols, figsize = (6*n_cols, 6*n), squeeze = False)
ax_list = axes.ravel()
for i in range(N):
    ax_list[i].hist(X[:,i])
    ax_list[i].set_xlabel(x_names[i])
# Hide the panels that are left over when the grid has more cells than features
for unused_ax in ax_list[N:]:
    unused_ax.set_visible(False)

In [None]:
# feature correlations
# NOTE: np.cov returns the covariance matrix, which equals the correlation matrix
# only when every feature has unit variance; np.corrcoef gives correlations directly.
covar = np.cov(X.T)  # transpose so that each row passed to np.cov is one feature
fig,ax = plt.subplots()
c = ax.imshow(covar)
fig.colorbar(c);

Scaling Features and Outputs

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# The scaler must be fitted before it can transform (calling transform() first raises
# NotFittedError). Fit on the training data only and reuse those statistics for the
# test data, so no information from the test set leaks into the scaling.
train_std = sc.fit_transform(X_train)
test_std = sc.transform(X_test)

In [None]:
from sklearn import preprocessing
import numpy as np

# One-shot standardization: zero mean and unit variance per column
X_scaled = preprocessing.scale(X_train)
# Reusable alternative: a fitted scaler can apply the same transformation to new data
scaler = preprocessing.StandardScaler().fit(X_train)
scaler

In [None]:
X_scaled = (X - X.mean(axis=0))/X.std(axis=0)
print("Minimum: {}, Maximum: {}".format(X.min(), X.max()))
print("Minimum scaled: {}, Maximum scaled: {}".format(X_scaled.min(), X_scaled.max()))

Dimensionality Reduction 
Forward selection 

In [None]:
# forward selection
# Repeatedly pick the single remaining feature with the highest one-feature r^2.
N_features = 40
X_subset = X_scaled.copy()
x_names_subset = np.copy(x_names)
new_X = []
new_X_names = []

# Never ask for more features than exist: once X_subset runs out of columns the
# candidate list is empty and r2_list[-1] raises IndexError.
n_select = min(N_features, X_subset.shape[1])

while len(new_X) < n_select:
    r2_list = []
    for j in range(X_subset.shape[1]):
        model = LinearRegression() #create a linear regression model instance
        xj = X_subset[:,j].reshape(-1,1)
        model.fit(xj, y) #fit the model
        r2 = model.score(xj, y) #get the "score", which is equivalent to r^2
        r2_list.append([r2, j])
    r2_list.sort() #sort lowest to highest
    r2_max, j_max = r2_list[-1] #select highest r2 value
    new_X.append(X_subset[:,j_max].copy())
    new_X_names.append(x_names_subset[j_max])
    # remove the chosen feature so it cannot be selected again
    x_names_subset = np.delete(x_names_subset, j_max)
    X_subset = np.delete(X_subset, j_max, axis=1)
    
print('The {} most linearly correlated features are: {}'.format(len(new_X_names), new_X_names))

new_X = np.array(new_X).T #rows = data points, columns = selected features

In [None]:
# Principal component regression
PCvals, PCvecs = eigvals, eigvecs
total_variance = np.sum(np.real(PCvals))
explained_variance = np.real(PCvals)/total_variance

PC_projection = np.dot(X_scaled, PCvecs)
print(PC_projection.shape)

corr_PCs = np.corrcoef(PC_projection.T)


y = np.array(all_data[real_rows, -3], dtype = 'float')
y = y.reshape(-1, 1)
model = LinearRegression() #create a linear regression model instance
model.fit(PC_projection, y) #fit the model
r2 = model.score(PC_projection, y) #get the "score", which is equivalent to r^2

Classification models

ROC Receiver Operating Characteristic (ROC) curves

RandomForestClassifier

In [None]:
from sklearn.metrics import roc_curve
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

svc = SVC()
sgd = SGDClassifier()
rf = RandomForestClassifier()

sgd.fit(X_blob2, y_blob2)
y_sgd = sgd.predict(X_blob2)

rf.fit(X_blob2, y_blob2)
y_rf = rf.predict(X_blob2)
# An ROC curve is traced by sweeping a threshold over a continuous score. Hard class
# predictions only give a single operating point, so use the predicted probability
# of the positive class (second column of predict_proba).
# NOTE(review): assumes y_blob2 is a binary label vector -- confirm.
y_rf_score = rf.predict_proba(X_blob2)[:, 1]

fpr, tpr, threshold = roc_curve(y_blob2, y_rf_score)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot(fpr, fpr, '#C0C0C0')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC AUC');

Multiclass classification 

In [None]:
from sklearn.datasets import make_blobs  # not imported anywhere earlier in the cells shown

np.random.seed(1)
# NOTE(review): `noisiness` is not defined in the cells shown here -- confirm it is set
# before this cell runs.
X_mc, y_mc = make_blobs(n_samples = 200, centers = 3, cluster_std = 0.5*noisiness, n_features = 2)

model = SVC(kernel = 'linear', C = 1, decision_function_shape = 'ovr')

model.fit(X_mc, y_mc)
y_mc_hat = model.predict(X_mc)

fig, axes = plt.subplots(1, 2, figsize = (15, 6))
axes[0].scatter(X_mc[:, 0], X_mc[:, 1], c = clrs[y_mc])

# Grid covering the data (with a margin of 1) on which the decision regions are evaluated
x_min, x_max = X_mc[:, 0].min() - 1, X_mc[:, 0].max() + 1
y_min, y_max = X_mc[:, 1].min() - 1, X_mc[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[1].contourf(xx, yy, Z, alpha = 0.4)
axes[1].scatter(X_mc[:, 0], X_mc[:, 1], c = clrs[y_mc_hat])
axes[0].set_title('Original Data')
axes[1].set_title('Prediction');

Multiclass classification

In [None]:
def max_cost(w, X, y):
    """Sum of max(0, -y * (X_intercept . w)) over all data points.

    A point contributes only when the sign of its linear score disagrees with its
    label ``y``; the contribution grows with the size of the score.
    """
    scores = np.dot(add_intercept(X), w)
    penalties = np.maximum(0, -y*scores)
    return sum(penalties)

print(max_cost(w, X, y))

In [None]:
def n_wrong(w, X = X, y = y):
    """Count the points whose linear score has the opposite sign to the label ``y``.

    sign(-y * score) is +1 for such a point and is clipped to 0 otherwise, so the
    sum is a count.
    """
    scores = np.dot(add_intercept(X), w)
    wrong_flags = np.maximum(0, np.sign(-y*scores))
    return sum(wrong_flags)

print(n_wrong(w,X,y))

In [None]:
from scipy.optimize import minimize

result = minimize(n_wrong, w)

w_count = result.x
print(n_wrong(w_count))

Discussion: What are some differences between these two loss functions?
The max cost function measures how far misclassified points lie from the discrimination line, while the counting loss function only reports the number of misclassifications.

Generalized Linear Models

Perceptron loss function

In [None]:
# max loss function 
def max_cost(w, X=X, y=y):
    """Perceptron ("max") loss: sum of max(0, -y * (X_intercept . w)) over all points.

    ``X`` and ``y`` default to the global data at the time this cell is executed.
    """
    scores = np.dot(add_intercept(X), w)
    penalties = np.maximum(0, -y*scores)
    return sum(penalties)

print(max_cost(w,X,y))

In [None]:
#perceptron 
from scipy.optimize import minimize

result = minimize(max_cost, w)
w_perceptron = result.x
result

In [None]:
# plot perceptron

prediction = linear_classifier(X, w_perceptron)

fig, axes = plt.subplots(1, 2, figsize = (15, 6))
axes[0].scatter(X[:, 0], X[:, 1], c = clrs[y_blob + 1])
axes[1].scatter(X[:, 0], X[:, 1], c = clrs[prediction + 1])

#plot line
m = -w_perceptron[1] / w_perceptron[2]
b = -w_perceptron[0] / w_perceptron[2]
axes[1].plot(X[:, 0], m*X[:, 0] + b, ls = '-')

axes[0].set_title('Original Data')
axes[1].set_title('Prediction');

Logistic regression
 implementation of softmax 

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Use dedicated names for the iris data: rebinding the notebook-wide X and y here
# would silently change the data used by the loss-function cells that follow.
X_iris, y_iris = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X_iris, y_iris)
print(clf.predict(X_iris[:2, :]))
# expected output: array([0, 0])

print(clf.predict_proba(X_iris[:2, :]))
# expected output:
#array([[9.8...e-01, 1.8...e-02, 1.4...e-08],
#       [9.7...e-01, 2.8...e-02, ...e-08]])
clf.score(X_iris, y_iris)

In [None]:
def softmax_cost(w, X = X, y = y):
    """Softmax (logistic) loss: sum of log(1 + exp(-y * (X_intercept . w))).

    ``X`` and ``y`` default to the global data at the time this cell is executed.
    """
    X_intercept = add_intercept(X)
    Xb = np.dot(X_intercept, w)
    # logaddexp(0, z) evaluates log(exp(0) + exp(z)) = log(1 + exp(z)) without
    # overflowing when z is large, unlike computing exp(z) explicitly.
    return np.sum(np.logaddexp(0, -y * Xb))

print(softmax_cost(w, X, y))

In [None]:
from scipy.optimize import minimize

result = minimize(softmax_cost, w, args = (X, y))
w_logit = result.x

prediction = linear_classifier(X, w_logit)

#plot line
m = -w_logit[1] / w_logit[2]
b = -w_logit[0] / w_logit[2]
axes[1].plot(X[:, 0], m*X[:, 0] + b, ls = '-')

Support Vector Machine

In [None]:
def regularized_cost(w, X = X, y = y, alpha = 1):
    """Hinge loss plus an L2 penalty on the non-intercept weights.

    cost = sum(max(0, 1 - y * (X_intercept . w))) + alpha * ||w[1:]||_2
    The first weight (the intercept) is excluded from the penalty.
    """
    scores = np.dot(add_intercept(X), w)
    hinge = sum(np.maximum(0, 1 - y*scores))
    penalty = alpha*np.linalg.norm(w[1:], 2)
    return hinge + penalty

from scipy.optimize import minimize

w_guess = np.array([-10, -4, -10])
result = minimize(regularized_cost, w_guess, args = (X, y, 1))
w_svm = result.x

prediction = linear_classifier(X, w_svm)
#plot line
m = -w_svm[1] / w_svm[2]
b = -w_svm[0] / w_svm[2]

Non-linearity and Kernels

In [None]:
# transform X to get better classification result at high D 
X_new = np.exp(-(X[:, 0]**2 + X[:, 1]**2))
X_new = X_new.reshape(-1, 1)
X_nonlinear = np.append(X, X_new, 1)

result = minimize(regularized_cost, w_guess, args = (X_nonlinear, y, 1))
w_svm = result.x

prediction = linear_classifier(X_nonlinear, w_svm)

In [None]:
# kernel transformation of feature X to get better classification results 
from sklearn.metrics.pairwise import rbf_kernel

X_kernel = rbf_kernel(X, X, gamma=1)
print(X_kernel.shape)

w_guess = np.zeros(X.shape[0] + 1)

result = minimize(regularized_cost, w_guess, args=(X_kernel, y, 1))
w_svm = result.x

prediction = linear_classifier(X_kernel, w_svm)

Support vector classifier

In [None]:
from sklearn.svm import SVC # "Support vector classifier"

model = SVC(kernel = 'rbf', gamma = 1, C = 1000)
model.fit(X, y)
y_predict = model.predict(X)

fig, axes = plt.subplots(1, 2, figsize = (15, 6))
axes[0].scatter(X[:, 0], X[:, 1], c = clrs[y])
axes[1].scatter(X[:, 0], X[:, 1], c = clrs[y_predict]);

In [None]:
model = SVC(kernel = 'rbf', gamma = 1, C = Ci)
model.fit(X, y)
y_predict = model.predict(X)
plot_svc_decision_function(model, ax)

k-nearest Neighbors 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 20)
knn.fit(X, y)
y_predict = knn.predict(X)


#plot the boundries 
fig, axes = plt.subplots(1, 2, figsize = (15, 6))
axes[0].scatter(X[:, 0], X[:, 1], c = clrs[y])

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[1].contourf(xx, yy, Z, alpha = 0.4)
axes[1].scatter(X[:, 0], X[:, 1], c = clrs[y_predict])
axes[0].set_title('Original Data')
axes[1].set_title('kNN Prediction (k = 20)');


Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import GaussianNB

NB = GaussianNB()
NB.fit(X, y)
y_predict = NB.predict(X)

fig, axes = plt.subplots(1, 2, figsize = (15, 6))
axes[0].scatter(X[:, 0], X[:, 1], c = clrs[y])

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

Z = NB.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[1].contourf(xx, yy, Z, alpha = 0.4)
axes[1].scatter(X[:, 0], X[:, 1], c = clrs[y_predict])
axes[0].set_title('Original Data')
axes[1].set_title('Naive Bayes Prediction');

Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix  # only imported further down in the cells shown; needed here

dtree = DecisionTreeClassifier(max_depth = 3)
dtree.fit(X_train,y_train)

# Confusion matrix on the data the tree was trained on ...
y_predict = dtree.predict(X_train)
cm_train = confusion_matrix(y_train, y_predict)

# ... and on the held-out data
y_predict = dtree.predict(X_test)
cm_test = confusion_matrix(y_test, y_predict)

fig, axes = plt.subplots(1, 2, figsize = (12, 6))
sns.heatmap(cm_train, annot = True, cbar = False, linewidth = .5, ax = axes[0], fmt = 'd')
sns.heatmap(cm_test, annot = True, cbar = False, linewidth = .5, ax = axes[1], fmt = 'd')

In [None]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()

In [None]:
X = X_mc
y = y_mc

In [None]:
tree.fit(X, y)
y_tree = tree.predict(X)

#plot boundaries 
fig, axes = plt.subplots(1, 2, figsize = (15, 6))
axes[0].scatter(X[:, 0], X[:, 1], c = clrs[y])

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

Z = tree.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[1].contourf(xx, yy, Z, alpha = 0.4)
axes[1].scatter(X[:, 0], X[:, 1], c = clrs[y_tree])


bottom, top = axes[0].get_ylim()
axes[1].set_ylim(bottom, top)

left, right = axes[0].get_xlim()
axes[1].set_xlim(left, right)

axes[0].set_title('Original Data')
axes[1].set_title('Decision Tree Prediction');


In [None]:
# Visualization of the tree
from io import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

# export_graphviz writes the tree in Graphviz "dot" format into an in-memory text buffer
dot_data = StringIO()
export_graphviz(tree, out_file = dot_data,  
                filled = True, rounded = True,
                special_characters = True)
# Parse the dot text and render it as a PNG image shown inline
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

High dimensional classification

Kernel-based models

In [None]:
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import accuracy_score, confusion_matrix

X_kernel = rbf_kernel(X_perov, X_perov, gamma = 0.02)

In [None]:
#train svm with original data without kernel 

w_guess = np.array([-10, -4, -10])
result = minimize(regularized_cost, w_guess, args = (X_perov[:, 3:5], y_perov, 1))
w_svm = result.x


prediction = linear_classifier(X_perov[:, 3:5], w_svm)
prediction = 2 * prediction - 1

# plot boundaries 

m = -w_svm[1] / w_svm[2]
b = -w_svm[0] / w_svm[2]
axes[1].plot(X_perov[:, 3], m * X_perov[:, 3] + b, ls = '-')

# accuracy evaluation
accuracy_score(y_perov, prediction)

In [None]:
# train svm with kernel
# NOTE(review): X_kernel[:, 3:5] selects columns 3 and 4 of the kernel matrix, i.e. the
# kernel values against two particular reference rows, not features 3 and 4 of the
# original data -- confirm that this slice (and the 3-element w_guess) is intended.
w_guess = np.array([-10, -4, -10])
result = minimize(regularized_cost, w_guess, args = (X_kernel[:, 3:5], y_perov, 1))
w_svm = result.x
# plot/accuracy as shown above 

scikit-learn SVC

In [None]:
from sklearn.svm import SVC

model = SVC(kernel = 'rbf', gamma = 100, C = 1000)
model.fit(X_perov[:, 3:5], y_perov)
y_predict = model.predict(X_perov[:, 3:5])

print(model.score(X_perov[:, 3:5], y_perov))

svc with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle

X_train, y_train = shuffle(X_train, y_train) #Shuffle everything just for good measure

sigmas = np.array([1e-3, 1e-2, 1e-1, 1, 10, 100])
gammas = 1. / 2 / sigmas**2

alphas = np.array([1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1])
Cs = 1 / alphas

parameter_ranges = {'C': Cs, 'gamma': gammas}

svc = SVC(kernel = 'rbf')

svc_search = GridSearchCV(svc, parameter_ranges, cv = 3)
svc_search.fit(X_train, y_train)
svc_search.best_estimator_, svc_search.best_score_


best_svc = svc_search.best_estimator_

y_predict = best_svc.predict(X_test)

best_svc.score(X_test, y_test)

Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_test, y_predict)

fig, ax = plt.subplots(figsize = (7, 7))
# Draw on the axes created above and print the counts as integers: the default
# annotation format abbreviates large counts (e.g. 1e+02).
sns.heatmap(cm, annot = True, fmt = 'd', linewidth = .45, cbar = False, ax = ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label');

High-dimensional data: "summary statistics" (mean, standard deviation, min, max, etc.) of each feature.

In [None]:
means = X_mnist.mean(axis=0)
print(means.shape)
means = means.reshape(1,-1) #convert to *row* vector
show_image(means, 0)
plt.title('Mean');

In [None]:
stdevs = X_mnist.std(axis=0).reshape(1, -1)
show_image(stdevs, 0)
plt.title('Standard Deviation');

In [None]:
# axis=0 aggregates over the rows (data points), giving one value per column (feature)
means = X_dow.mean(axis = 0)
stds = X_dow.std(axis = 0)

# Label each statistic with its feature name: the columns of df without the first
# column and without the last five, matching how X_dow was built.
data = pd.DataFrame(means, index = df.columns[1:-5], columns = ['mean'])
data['std'] = stds
data

Histogram plots (High Dimensional data)

In [None]:
N = X_mnist.shape[-1]
n = int(np.sqrt(N)) #n = 8 here
fig, axes = plt.subplots(n, n, figsize = (5 * n, 5 * n), dpi = 200)
ax_list = axes.ravel()
for i in range(N):
    ax_list[i].hist(X_mnist[:, i])
    ax_list[i].set_xlabel(i)

KRR

In [None]:
from sklearn.kernel_ridge import KernelRidge

krr = KernelRidge(kernel = 'rbf')
alphas = np.logspace(-4, -1, 4)
gammas = np.logspace(-6, -3, 4)
param_grid = {'alpha': alphas, 'gamma': gammas}

krr_search = GridSearchCV(krr, param_grid, cv = 3)
krr_search.fit(pls_train, y_train)

In [None]:
# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X, y_dow)
r2_LR = model.score(X, y_dow)

In [None]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)
linreg.score(X_test, y_test)

linreg = LinearRegression()
linreg.fit(X_squared_train, y_train)
r2_train = linreg.score(X_squared_train, y_train)
r2_test = linreg.score(X_squared_test, y_test)

SVC model and GridSearchCV method

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X_perov, y_perov, test_size = 0.4)

parameters = {"C": C_range, "gamma": gamma_range}

classifier = SVC(kernel = 'rbf')

clf = GridSearchCV(classifier, parameters)
clf.fit(X_train_regular,y_train)
print(clf.best_estimator_.score(X_test_regular, y_test))
svc_reg = clf.best_estimator_

Dimensionality reduction

The "stress" function compares the distance between points $i$ and $j$ in a low-dimensional space to the distance in the full-dimensional space:

$ S(\vec{x}_{0}, \vec{x}_1, \vec{x}_2, ... \vec{x}_n) =  \left( \frac{\sum_{i=0}^n \sum_{i<j}(d_{ij} - ||x_i - x_j||)^2}{\sum_{i=0}^n \sum_{i<j} d_{ij}^2} \right)^{1/2} $


In [None]:
from scipy.spatial.distance import pdist

def stress(X_reduced, X):
    """Stress between a low-dimensional embedding and the original data.

    Compares every pairwise distance in ``X_reduced`` with the corresponding
    pairwise distance in ``X`` and returns
    sqrt( sum((d_full - d_reduced)**2) / sum(d_full**2) ).
    Both arrays must hold the same points in the same row order.
    """
    reduced_dists = pdist(X_reduced)
    full_dists = pdist(X)
    mismatch = full_dists - reduced_dists
    return np.sqrt(np.sum(mismatch**2) / np.sum(full_dists**2))

In [None]:
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import Isomap, TSNE

n_components = 2

pca = PCA(n_components = n_components)
%time X_pca = pca.fit_transform(X)

kpca = KernelPCA(n_components = n_components, kernel = 'rbf', gamma = 0.1)
%time X_kpca = kpca.fit_transform(X)

tsne = TSNE(n_components = n_components)
%time X_tsne = tsne.fit_transform(X)

PCA manual

In [None]:
# find the rank and eigenvalues of the covariance matrix
C = np.cov(X_mnist.T)
# The covariance matrix is symmetric, so use eigh: it is the solver for symmetric
# matrices and always returns real eigenvalues, whereas the general-purpose eig can
# return complex values with tiny imaginary parts caused by round-off.
eig_vals, eig_vecs = np.linalg.eigh(C)
eig_vecs = eig_vecs.T #<- note that the eigenvectors are the *columns* by default
print('Rank of the covariance matrix: {}'.format(np.linalg.matrix_rank(C)))
print(eig_vals)

# argsort gives indices from smallest to largest eigenvalue; reverse for largest first
sorted_idxs = list(np.argsort(eig_vals)[::-1])
eig_vals = eig_vals[sorted_idxs] #re-sort values
eig_vecs = eig_vecs[sorted_idxs, :] #re-sort vectors

# project on k dimensions
k = 2
projector = eig_vecs[:k, :].T #columns are the k leading eigenvectors
X_k = np.dot(X_mnist, projector) #coordinates in the k-dimensional space
X_reconstructed = np.dot(projector, X_k.T).T #map back to the original feature space


PCA

In [None]:
from sklearn.decomposition import PCA

k = 9  # number of principal components to keep
pca_model = PCA(n_components = k)
pca_model.fit(X_mnist)
# (a stray trailing character after this call made the whole cell a SyntaxError)
X_pca = pca_model.transform(X_mnist)

PCA Kernel

In [None]:
from sklearn.decomposition import KernelPCA
# Compare linear PCA with RBF-kernel PCA on the same data, keeping k components each.
k = 2
gamma = 10
lPCA = PCA(n_components = k)
# fit_inverse_transform = True is required so that inverse_transform can be called below
kPCA = KernelPCA(n_components = k, kernel = 'rbf', gamma = gamma, fit_inverse_transform = True)

# X_m is not defined in this cell -- presumably created in an earlier cell; TODO confirm
lPCA.fit(X_m)
X_PCA = lPCA.transform(X_m)

kPCA.fit(X_m)
X_kPCA = kPCA.transform(X_m)

# Invert both projections back to the high-dimensional space
X_PCA_reconstruct = lPCA.inverse_transform(X_PCA)
X_kPCA_reconstruct = kPCA.inverse_transform(X_kPCA)

Manifold learning
MDS

In [None]:
from sklearn.manifold import MDS

k = 2  # dimensionality of the embedding
mds = MDS(n_components = k, n_init = 1, max_iter = 100) #<- note that we need to give some max_iteration and initial guess parameters since this is iterative
X_mds = mds.fit_transform(X_mnist) #<- note that there is no transform method. What does this mean?

# The goal is to minimize the stress function; evaluate it for the fitted embedding.
# `stress` is the helper defined earlier in this notebook.
stress(X_mds, X_mnist)

Manifold based tSNE

In [None]:
from sklearn.manifold import TSNE

# Fully specified t-SNE model with a random initialization and the exact (non-approximate) method.
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` to `max_iter` -- confirm against the installed version.
# NOTE(review): no random_state is passed, so the embedding can differ between runs.
tsne = TSNE(n_components = 2, perplexity = 30.0, 
            early_exaggeration = 12.0, 
            learning_rate = 200.0, 
            n_iter = 1000,
            init = 'random',
            method = 'exact')

X_tsne = tsne.fit_transform(X_mnist)

# Evaluate the embedding with the same stress metric used for the other methods
stress(X_tsne, X_mnist)

Autoencoding

Clustering: unsupervised algorithms

Clustering algorithms seek to identify data points that are similar to each other based on a set of descriptive features.

k-means (clustering)

In [None]:
from sklearn.cluster import KMeans

n_clusters = 3
random_state = 20  # defined here but NOT passed to KMeans below (the argument is commented out)
X = X_pca #scikit-learn is much more efficient, so we can run it on the whole dataset

# Without random_state the cluster assignment can differ from run to run
model = KMeans(n_clusters = n_clusters)#, random_state=random_state)
model.fit(X)
y_predict = model.predict(X)  # cluster label for each row of X
centers = model.cluster_centers_  # fitted cluster centers

Gaussian Mixture Models


In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score
# Imported here so this cell does not depend on the following cell having been run first
from sklearn.mixture import GaussianMixture

X_i = X_pca
n_clusters = 10
covariance_type = 'full' # full, tied, spherical (previously only defined in the next cell)

# random_state is reused from the k-means cell above
model = GaussianMixture(n_components = n_clusters, random_state = random_state, covariance_type = covariance_type)
model.fit(X_i)
y_predict = model.predict(X_i)  # most likely component for each point

centers = model.means_  # component means

# Cluster-quality metrics computed from the predicted labels, plus the model's BIC
silhouette = silhouette_score(X_i, y_predict)
c_h_score = calinski_harabasz_score(X_i, y_predict)
BIC=model.bic(X_i)

print(silhouette)
print(c_h_score)

In [None]:
from sklearn.mixture import GaussianMixture

n_clusters = 2
random_state = 0
covariance_type = 'full' #full, tied, spherical

fig, axes = plt.subplots(1, 3, figsize = (15, 4))

# Fit one Gaussian mixture per dataset.
# `data` and `labels` are not defined in this cell -- presumably lists of datasets and their names; TODO confirm.
# Nothing is drawn on `ax` here; the fitted results are only stored in variables.
for X_i, label, ax in zip(data, labels, axes):
    model = GaussianMixture(n_components = n_clusters, random_state = random_state, covariance_type = covariance_type)
    model.fit(X_i)
    y_predict = model.predict(X_i)
    centers = model.means_

Density-based models

Density-based clustering algorithms consider local density of points and utilize this information to group points into clusters.

In [None]:
from sklearn.cluster import MeanShift

# Mean-shift clustering of the t-SNE embedding; %time prints the fit duration
model = MeanShift(bandwidth = 21)
%time model.fit(X_tsne)
labels = model.labels_  # NOTE: overwrites any earlier variable named `labels`
centroids = model.cluster_centers_

DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

X = X_pca  # cluster in the PCA-reduced space

# eps and min_samples are the two DBSCAN settings chosen for this example
model = DBSCAN(eps = 1, min_samples = 3)
y_predict = model.fit_predict(X)  # fit and return a label per row of X in one call


Hierarchical models: These models construct linkages between different points and use distance cutoffs to assign clusters.

Dendrogram

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram

X = X_pca  # build the hierarchy on the PCA-reduced data

# Linkage matrix computed with the 'single' method
Z = linkage(X, method='single')

"cophenetic coefficient" measures the ratio of the distance in "linkage" space to the distance in the high-dimensional space.

In [None]:
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# Pairwise Euclidean distances between the rows of X, computed once and reused
Dij = pdist(X, metric = 'euclidean')
linkage_methods = ('single', 'complete', 'average', 'weighted', 'centroid', 'ward')

# Report the cophenetic coefficient obtained with each linkage method
for method in linkage_methods:
    Z = linkage(X, method = method)
    C, coph_dists = cophenet(Z, Dij)
    report = 'cophenetic coefficient of {}: {}'.format(method, C)
    print(report)

The `dendrogram` function gives a visual representation of this "linkage" structure.

In [None]:
# Left panel: scatter of the first two columns of X; right panel: dendrogram of the centroid linkage
fig, axes = plt.subplots(1, 2, figsize = (15, 6))
Z = linkage(X, method = 'centroid')
dendrogram(Z, color_threshold = 20, ax = axes[1])
axes[0].scatter(X[:, 0], X[:, 1])
axes[0].set_title('PCA Data')
axes[1].set_title('Dendrogram');

Agglomerative hierarchical clustering

In [None]:
from scipy.cluster.hierarchy import fcluster

max_d = 20  # distance cutoff for criterion = 'distance'
k = 4       # requested number of clusters for criterion = 'maxclust'
Z = linkage(X, method = 'centroid')

# Flat cluster labels from the same hierarchy using two different criteria
clusters_dist = fcluster(Z, max_d, criterion = 'distance')
clusters_k = fcluster(Z, k, criterion = 'maxclust')

# Labels are used as indices into the `clrs` palette. Wrap them with a modulo so
# that a clustering with more labels than palette entries cycles through the
# colors instead of raising an IndexError.
n_colors = len(clrs)

fig, axes = plt.subplots(1, 3, figsize = (18, 6))
dendrogram(Z, color_threshold = max_d, truncate_mode = 'lastp', p = k, ax = axes[0])
axes[1].scatter(X[:, 0], X[:, 1], c = clrs[clusters_dist % n_colors])
axes[2].scatter(X[:, 0], X[:, 1], c = clrs[clusters_k % n_colors])

axes[0].set_title('Truncated Dendrogram')
axes[1].set_title("Agglomerative Clustering w/ criterion = 'distance'")
axes[2].set_title("Agglomerative Clustering w/ criterion = 'maxclust'");

In [None]:
I_cutoff = 9  # threshold for criterion = 'inconsistent'
clusters_I = fcluster(Z, I_cutoff, criterion = 'inconsistent', depth = 10)
n_clusters = max(clusters_I)  # largest label returned by fcluster

# Labels are used as indices into the `clrs` palette. Wrap them with a modulo so
# that more labels than palette entries cycles through the colors instead of
# raising an IndexError.
n_colors = len(clrs)

fig, axes = plt.subplots(2, 2, figsize = (12, 10))
axes = axes.ravel()  # flatten the 2x2 grid so the panels can be indexed 0..3
dendrogram(Z, color_threshold = 3, truncate_mode = 'lastp', p = int(n_clusters), ax = axes[0])
axes[1].scatter(X[:, 0], X[:, 1], c = clrs[clusters_I % n_colors])
# clusters_dist and clusters_k come from the previous cell
axes[2].scatter(X[:, 0], X[:, 1], c = clrs[clusters_dist % n_colors])
axes[3].scatter(X[:, 0], X[:, 1], c = clrs[clusters_k % n_colors])

axes[0].set_title('Truncated Dendrogram')
axes[1].set_title("Agglomerative Clustering w/ criterion = 'inconsistent'");
axes[2].set_title("Agglomerative Clustering w/ criterion = 'distance'")
axes[3].set_title("Agglomerative Clustering w/ criterion = 'maxclust'")

print('Number of clusters:', n_clusters)

Generative models: Generative models describe the probability distribution of the underlying data. They can be used to explore datasets in many different ways, and are unsupervised since they do not require labels for the output data.

In [None]:
import pandas as pd
from scipy.stats import norm

# Normal distribution with the given mean and variance
mu = 0
variance = 1
sigma = np.sqrt(variance)
x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)  # grid spanning +/- 3 sigma around the mean
gauss = norm.pdf(x, mu, sigma)  # density evaluated on the grid
X_new = norm.rvs(mu, sigma, size = 100000)  # random draws from the same distribution (no seed is set)


In [None]:
feature = 6  # column index of the feature to model
x_1d = X_dow[:, feature]  # X_dow is not defined in this cell -- presumably the Dow feature matrix; TODO confirm

# Summarize the column with its mean and standard deviation ...
mu = x_1d.mean()
std = x_1d.std()

# ... and draw synthetic values from the matching normal distribution
x_synthetic = norm.rvs(mu, std, size = 1000)

In [None]:
feature_A = 6
feature_B = 4
X_2d = X_dow[:, [feature_A, feature_B]]  # keep only the two selected feature columns


from sklearn.mixture import GaussianMixture

N_clusters = 2
gmm = GaussianMixture(n_components = N_clusters, covariance_type = 'full', random_state = 0)
gmm.fit(X_2d)
y_2d = gmm.predict(X_2d)  # most likely component for each row
# BIC of the model fitted in this cell (previously referenced `gmm_n`, which is
# only created in the next cell)
bic = gmm.bic(X_2d)


In [None]:
n_components = np.arange(2, 50)[::3]  # candidate component counts: 2, 5, 8, ...

# Fit one mixture per candidate and record its BIC. `models` must start empty
# here so that its positions line up with `BICs`.
BICs = []
models = []
for n in n_components:
    gmm_n =  GaussianMixture(n, covariance_type = 'full').fit(X_2d)
    bic = gmm_n.bic(X_2d)
    BICs.append(bic)
    models.append(gmm_n)
    
# Select the model with the lowest BIC and draw a sample from it
min_idx = BICs.index(min(BICs))
gmm_best = models[min_idx]
example = gmm_best.sample()
# show_image is not defined in this cell -- presumably a plotting helper from an earlier cell; TODO confirm
show_image(example, 0)

In [None]:
# Keep only the samples whose label is 6
X_mnist_6 = X_mnist[y_mnist == 6]

# Let's just use an arbitrary model
gmm_n =  GaussianMixture(5, covariance_type = 'spherical').fit(X_mnist_6)
example = gmm_n.sample()  # draw one sample from the fitted mixture
# show_image is not defined in this cell -- presumably a plotting helper from an earlier cell; TODO confirm
show_image(example, 0)

Kernel Density Estimation

In [None]:
from sklearn.neighbors import KernelDensity

# instantiate and fit the KDE model
x_1d = x_1d.reshape(-1, 1)  # make the data a single column, as passed to fit() below
kde = KernelDensity(bandwidth = 0.15, kernel = 'gaussian')
kde.fit(x_1d)

# create a continuous x variable: 1000 evenly spaced points between the smallest and largest observation
x_continuous = np.linspace(min(x_1d), max(x_1d), 1000)

# score_samples returns the log of the probability density
logprob = kde.score_samples(x_continuous)

In [None]:
# Draw synthetic points from the fitted KDE, then fit a fresh KDE to those points.
# NOTE: `kde` is overwritten, so the model fitted on the original data is no longer available afterwards.
X_synthetic = kde.sample(10000)
X_synthetic = X_synthetic.reshape(-1, 1)
kde = KernelDensity(bandwidth = 0.15, kernel = 'gaussian')
kde.fit(X_synthetic)

Not-so-naive Bayes

In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit a kernel density model to all samples of a single class
label = 0
X = X_mnist[y_mnist == label]

model = KernelDensity(bandwidth = 10, kernel = 'gaussian')
model.fit(X);

# Log-density of the first three samples under the class model.
# This is not the last expression in the cell, so its value is not displayed.
model.score_samples(X_mnist[:3, :])

X_train, X_test, y_train, y_test = train_test_split(X_mnist, y_mnist, test_size = 0.3, random_state = 1)
# not_so_naive is not defined in this cell -- presumably a classifier helper from an earlier cell; TODO confirm
prediction = not_so_naive(X_train, X_test, y_train, model)

# The accuracy is computed but neither stored nor displayed (not the last expression)
accuracy_score(y_test, prediction)
cm = confusion_matrix(y_test.reshape(-1,), prediction)
df_cm = pd.DataFrame(cm, index = range(0, 10), columns = range(0, 10))

Simple Gaussian Naive Bayes

In [None]:
# Fit Gaussian naive Bayes on the training split and predict the test split
NB = GaussianNB()
yhat = NB.fit(X_train, y_train).predict(X_test)

# Test score; not the last expression in the cell, so it is not displayed
NB.score(X_test, y_test)

# Confusion matrix of true vs. predicted labels, shown as an annotated heatmap
cm = confusion_matrix(y_test, yhat)
df_cm = pd.DataFrame(cm, index = range(10), columns = range(10))
sns.heatmap(df_cm, annot = True);

PLS (Feature Transformation)

In [None]:
# Create a KRR model based on the first 5 PLS components
# Imports are placed in this cell so that it does not depend on later cells having been run.
from sklearn.cross_decomposition import PLSRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y_dow, test_size = 0.3)

# Fit PLS on the training split only, then project both splits
pls = PLSRegression(n_components = 5)
pls.fit(X_train, y_train)
pls_train = pls.transform(X_train)
pls_test = pls.transform(X_test)

# Grid-search the KRR regularization (alpha) and RBF width (gamma) on the PLS features
krr = KernelRidge(kernel = 'rbf')
alphas = np.logspace(-4, -1, 4)
gammas = np.logspace(-6, -3, 4)
param_grid = {'alpha': alphas, 'gamma': gammas}

krr_search = GridSearchCV(krr, param_grid, cv = 3)
krr_search.fit(pls_train, y_train)
krr_search.best_estimator_.score(pls_test, y_test)

In [None]:
from sklearn.cross_decomposition import PLSRegression

# Record the training score of a PLS model for each number of components
r2s_PLS = []
# NOTE(review): range() stops at X.shape[1] - 1, so the model with as many components as features is not tried -- confirm this is intended
m_PLS = range(1, X.shape[1]) #PLS does not allow more components than original features
for m in m_PLS:
    model = PLSRegression(n_components = m)
    model.fit(X, y_dow)
    r2 = model.score(X, y_dow)  # scored on the same data used for fitting
    r2s_PLS.append(r2)

In [None]:
# make_blobs is part of the public sklearn.datasets namespace; the private
# sklearn.datasets.samples_generator module no longer exists in current releases.
from sklearn.datasets import make_blobs
# 50 two-dimensional points in 2 blobs; random_state fixes the generated data
X_blobs, y_blobs = make_blobs(n_samples = 50, centers = 2, cluster_std = 0.5, n_features = 2, random_state = 0)


Load MNIST data

In [None]:
from sklearn.datasets import load_digits

# Load scikit-learn's bundled digits dataset and keep features/targets as NumPy arrays
digits = load_digits()
print("Digits data shape: {}".format(digits.data.shape))
print("Digits output shape: {}".format(digits.target.shape))
X_mnist = np.array(digits.data)    # feature matrix
y_mnist = np.array(digits.target)  # class labels

Linear discriminant analysis (LDA) (feature transformation)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit LDA on the full digits data and project it onto the discriminant axes
lda = LinearDiscriminantAnalysis()
lda.fit(X_mnist, y_mnist)
X_LDA = lda.transform(X_mnist)

print(X_LDA.shape)

W_lda = lda.scalings_ # extract weights or transform the X matrix directly with scikit learn
# linear_combination is not defined in this cell -- presumably a helper from an earlier cell; TODO confirm
X_lda = linear_combination(X_mnist, W_lda)
print(X_lda.shape)

# X_train / X_test / y_test come from an earlier train_test_split.
# NOTE(review): lda was fitted on all of X_mnist above; if X_test is a subset of X_mnist this score is not a held-out score -- confirm.
score = lda.score(X_test, y_test)
print(score)
X_test_LDA = lda.transform(X_test)
X_train_LDA = lda.transform(X_train)

PCA (feature transformation)

In [None]:
from sklearn.decomposition import PCA

k=9  # number of principal components to keep
pca_model = PCA(n_components = k)
pca_model.fit(X_mnist)
X_pca = pca_model.transform(X_mnist)  # data projected onto the k components

Confusion Matrix 

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predict the test split with the fitted LDA model and tabulate true vs. predicted labels
y_pred = lda.predict(X_test)
CM = confusion_matrix(y_test, y_pred)

# Label rows/columns 0-9 and show the matrix as an annotated heatmap
df_cm = pd.DataFrame(CM, index = range(0, 10), columns = range(0, 10))
sns.heatmap(df_cm, annot=True);

LASSO

In [None]:
from sklearn.linear_model import Lasso

# Fit LASSO on the training features and score it on both splits.
# X_squared_train / X_squared_test are not defined in this cell -- presumably built in an earlier cell; TODO confirm
model = Lasso(alpha = 1.0)
model.fit(X_squared_train, y_train)
r2_train = model.score(X_squared_train, y_train)
r2_test = model.score(X_squared_test, y_test)

# Count how many fitted coefficients are exactly non-zero
coeffs = model.coef_
print("Total Number of Coefficients: {}".format(len(coeffs)))
nonzero_coeffs = [c for c in coeffs if abs(c) > 0]
print("Number of Non-Zero Coefficients: {}".format(len(nonzero_coeffs)))

Autofeat and symbolic Regression

In [None]:
# ! pip install autofeat

In [None]:
from autofeat import AutoFeatRegressor
# Transformations AutoFeat is allowed to apply when generating new features
transforms = ['1/', 'exp', 'abs', 'sqrt', '^2', '^3']
afreg = AutoFeatRegressor(verbose = 1, feateng_steps = 2, featsel_runs = 1, transformations = transforms)
afreg.fit(X_train, y_train)

# These bare expressions are not the last line of the cell, so their values are not displayed
afreg.new_feat_cols_
afreg.good_cols_   # all of the features that were selected by the feature selection algorithm

afreg.score(X_test, y_test)

X_afreg_train = afreg.transform(X_train) # create a new feature matrix that contains all new features
X_afreg_test = afreg.transform(X_test)

# linreg is not defined in this cell -- presumably a regression model created in an earlier cell; TODO confirm
linreg.fit(X_afreg_train, y_train)
linreg.score(X_afreg_test, y_test)

In [None]:
# Wrap the train/test arrays in DataFrames with named columns.
# dow_feature_names and unit_dict are not defined in this cell -- presumably created earlier; TODO confirm
dow_input_names = dow_feature_names[:X_train.shape[1]]  # one name per input column
X_train_df = pd.DataFrame(X_train, columns = dow_input_names)
X_test_df = pd.DataFrame(X_test, columns = dow_input_names)

# Larger transformation set than before, plus per-feature units passed via `units`
transforms = ["1/", "exp", "log", "abs", "sqrt", "^2", "^3", "1+", "1-", "exp-"]
afreg = AutoFeatRegressor(verbose = 1, feateng_steps = 2, featsel_runs = 1, transformations = transforms, units = unit_dict)
afreg.fit(X_train_df, y_train)

afreg.score(X_test_df, y_test)

afreg.new_feat_cols_ # features generated
afreg.good_cols_ # features that have been selected

time series 
statsmodels packages

In [None]:
#! conda install statsmodels

In [None]:
import statsmodels.api as api

# Load the CO2 example dataset that ships with statsmodels, as pandas objects
sm_data = api.datasets.co2.load_pandas()
co2_df = sm_data.data  # NOTE: this is a reference to sm_data.data, not a copy
co2_df.plot();

Handling missing data in time series analysis: forward fill / back fill / linear interpolation / spline interpolation
fillna and interpolate

In [None]:
# Work on a copy so the frame loaded from statsmodels is not modified in place
interp_df = co2_df.copy()
# Four ways of filling the missing 'co2' values, stored side by side for comparison.
# .ffill() / .bfill() replace the deprecated fillna(method=...) spelling.
interp_df['forward_fill'] = interp_df['co2'].ffill()
interp_df['back_fill'] = interp_df['co2'].bfill()
interp_df['linear_interp'] = interp_df['co2'].interpolate(method='linear')
interp_df['spline_interp'] = interp_df['co2'].interpolate(method='spline', order=3)

interp_df.plot();

# Keep the raw series plus the linearly interpolated one for the rest of the notebook
co2_df = interp_df[['co2', 'linear_interp']]
co2_df = co2_df.rename(columns = {'linear_interp': 'co2_interp'})
co2_df.head(10)

Moving Average Smoothing 

In [None]:
import pandas as pd  # fixed: the module is named `pandas`, not `panda`
window = 10  # number of consecutive observations in each rolling window

# Rolling-window view of the interpolated series; reused for both statistics below
rolling_df = co2_df['co2_interp'].rolling(window)

# Moving average
moving_avg = rolling_df.mean()
moving_avg.plot();

# Moving standard deviation (drawn on the same axes as the moving average)
moving_stdev = rolling_df.std()
moving_stdev.plot();

In [None]:
# Moving average of the Dow impurity data over windows of 24 rows
rolling_dow_df = dow_df.rolling(24)
moving_avg_dow = rolling_dow_df.mean()
moving_avg_dow.plot();

Autocorrelation

In [None]:
from numpy import polyfit
from scipy.stats import pearsonr

lag = 20  # offset, in rows, between the paired observations
dataset = co2_df['co2_interp']

# Pair every observation with the one `lag` rows earlier.
# .iloc makes the lookup explicitly positional; plain dataset[i] with an integer
# relies on a deprecated fallback when the index is not integer-based.
xs = []
ys = []
for i in range(lag, len(dataset)):
    x_i = dataset.iloc[i]
    x_lag = dataset.iloc[i - lag]
    xs.append(x_lag)
    ys.append(x_i)

In [None]:
# Compute the autocorrelation using the `statsmodels` package
from statsmodels.tsa.stattools import acf

autocorr = acf(dataset, nlags = 40)  # autocorrelation values up to the requested number of lags
print(len(autocorr))
print(autocorr)

In [None]:
# Note: the `_ =` syntax is just used to avoid displaying the same plot twice
# (a small issue with how statsmodels works in Jupyter notebooks).
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation plot for lags up to 100
_ = plot_acf(dataset, lags = 100)

In [None]:
# Partial autocorrelation plot of the same series, for lags up to 100
_ = plot_pacf(dataset, lags = 100)

In [None]:
# Autocorrelation and partial autocorrelation plots for the Dow impurity data
_ = plot_acf(dow_df, lags = 100)
_ = plot_pacf(dow_df, lags = 100)

Stationary data
Statistical test: a test for determining if a dataset is stationary

In [None]:
from statsmodels.tsa.stattools import adfuller
# Element [1] of adfuller's result is used as the test's p-value
p_val = adfuller(co2_df['co2_interp'])[1]
# NOTE(review): the ADF null hypothesis is non-stationarity (a unit root), so a small p-value is
# evidence for stationarity; 1 - p_val is a rough indicator rather than a true probability -- confirm wording.
print("Probability the data is stationary: {}".format(1 - p_val))

Differencing 
Dickey-Fuller test to determine if data is stationary. 

In [None]:
# First difference of the interpolated series: each value minus the previous one
co2_df['co2_diff'] = co2_df['co2_interp'].diff()
co2_df['co2_diff'].plot();

# The first differenced value is NaN (it has no predecessor), so skip it below
diff_without_first = co2_df['co2_diff'][1:]

p_val = adfuller(diff_without_first)[1]
print("Probability the data is stationary: {}".format(1 - p_val))

_ = plot_acf(diff_without_first, lags = 100)

_ = plot_pacf(diff_without_first, lags = 52)

In [None]:
from statsmodels.tsa.stattools import adfuller

# First difference of the impurity series, stored as a new column (this modifies dow_df in place)
dow_df['diff'] = dow_df['y:Impurity'] - dow_df['y:Impurity'].shift(1)

# Skip the first row, which is NaN after differencing, before running the test
p_val = adfuller(dow_df['diff'][1:])[1]
print("Probability the data is stationary: {}".format(1 - p_val))

Model fitting to remove seasonality

In [None]:
# Fit a straight line to the series and keep what the line does not explain
y = co2_df['co2_interp']
weeks = np.arange(0, len(y))  # 0, 1, 2, ... one value per observation
m, b = np.polyfit(weeks, y, deg = 1)  # slope and intercept of the degree-1 fit
yhat_linear = m*weeks + b
resid = y - yhat_linear  # residual after removing the linear trend

In [None]:
#Here we will use LASSO to identify the most highly-correlated frequencies/offsets:
from sklearn.linear_model import Lasso

# X is not defined in this cell -- presumably a matrix of periodic (frequency/offset) features built elsewhere; TODO confirm
model = Lasso(alpha = 0.1)
model.fit(X, y)
yhat = model.predict(X)
model_resid = y - yhat  # what remains of y after subtracting the model's prediction

In [None]:
# Check whether the model residuals are stationary
p_val = adfuller(model_resid)[1]
print("Probability the data is stationary: {}".format(1 - p_val))

# Autocorrelation and partial autocorrelation of the residuals, for lags up to 104
_ = plot_acf(model_resid, lags = 104)
_ = plot_pacf(model_resid, lags = 104)

In [None]:
# Apply differencing in addition to model fitting to remove these correlations

In [None]:
# First difference of the model residuals
model_resid_diff = model_resid - model_resid.shift(1)
model_resid_diff = model_resid_diff[1:] #remove NaN from 0th position

# Re-run the stationarity test on the differenced residuals
p_val = adfuller(model_resid_diff)[1]
print("Probability the data is stationary: {}".format(1 - p_val))

ARIMA modeling: combine auto-regressive models with two additional terms: Integration and Moving Average

In [None]:
# Work on a copy of the CO2 frame and drop the columns that are not modeled
df_model = co2_df.copy()

del df_model['co2']
del df_model['co2_diff']

# Make the train/test split: the first N_train rows, then the next N_test rows.
# N_train and N_test are not defined in this cell -- presumably set in an earlier cell; TODO confirm
train = df_model[:N_train]
test = df_model[N_train:N_train + N_test]

# Choose the ARIMA order (p, d, q) -- presumably: d from how much differencing is needed for stationarity,
# p from the partial-autocorrelation peaks and q from the autocorrelation peaks; TODO confirm
diffed = co2_df['co2_interp'] - co2_df['co2_interp'].shift(1)
diffed = diffed[1:]  # drop the leading NaN produced by the shift

p_val = adfuller(diffed)[1]
print("Probability the data is stationary after 1 difference: {}".format(1 - p_val))

_ = plot_acf(diffed)
_ = plot_pacf(diffed)

In [None]:
# NOTE(review): statsmodels.tsa.arima_model is the legacy ARIMA interface and has been removed from
# recent statsmodels releases (replaced by statsmodels.tsa.arima.model, whose fit/forecast API differs).
# This cell needs an older statsmodels or a port to the new API -- confirm the installed version.
from statsmodels.tsa.arima_model import ARIMA

model = ARIMA(train, order=(4, 1, 4)) # order p,d,q
model_fit = model.fit(disp = 0)
print(model_fit.summary())

# Plot the model's predictions and print the mean absolute residual
model_fit.plot_predict(dynamic = False)
print(np.mean(np.abs(model_fit.resid)))
plt.show()

# Forecast N_test steps ahead; the result is unpacked as forecast, standard error and confidence bounds
fc, se, conf = model_fit.forecast(N_test, alpha = 0.05)  # 95% conf

# Make as pandas series, aligned with the test index
fc_series = pd.Series(fc, index = test.index)
lower_series = pd.Series(conf[:, 0], index = test.index)
upper_series = pd.Series(conf[:, 1], index = test.index)

# Plot training data, test data and forecast, with the confidence band shaded
plt.plot(train, label = 'Training Data')
plt.plot(test, label = 'Test Data')
plt.plot(fc_series, label = 'Forecast')
plt.fill_between(lower_series.index, lower_series, upper_series, alpha=.15)
plt.title('Forecast vs Actual Data')
plt.legend(loc='upper left', fontsize = 8);