# Machine Learning Best Practices
### ML Solutions workflow: 
##### 1). Data Preparation 
##### 2). Training Set Generation 
##### 3). Model Training, Evaluation, and Selection 
##### 4). Deployment and Monitoring
  

### Dealing with Missing Data

In [27]:
import numpy as np
from sklearn.impute import SimpleImputer

### Represent unknown values with np.nan in Numpy

In [28]:
data_origin = [[30, 100],
               [20, 50],
               [35, np.nan],
               [25, 80],
               [30, 70],
               [40, 60]]

### Initialize the impuation transformer with the mean value, obtaining the mean value from the original data

In [29]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(data_origin)

#### Complete the missing values

In [30]:
data_mean_imp = imp_mean.transform(data_origin)
print(data_mean_imp)


[[ 30. 100.]
 [ 20.  50.]
 [ 35.  72.]
 [ 25.  80.]
 [ 30.  70.]
 [ 40.  60.]]


### Initialize the imputation transformer in same manner with the median value

In [31]:
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imp_median.fit(data_origin)
data_median_imp = imp_median.transform(data_origin)
print(data_median_imp)

[[ 30. 100.]
 [ 20.  50.]
 [ 35.  70.]
 [ 25.  80.]
 [ 30.  70.]
 [ 40.  60.]]


#### When new samples come in, the missing values (in any attribute) can be imputed using the trained transformer 
####  (here with the mean value)

In [32]:
new = [[20, np.nan],
       [30, np.nan],
       [np.nan, 70],
       [np.nan, np.nan]]
new_mean_imp = imp_mean.transform(new)
print(new_mean_imp)

[[20. 72.]
 [30. 72.]
 [30. 70.]
 [30. 72.]]


### Exploring how imputing missing values and discarding missing data effects the prediciton results:

#### Importing the Diabetes Dataset

In [33]:
from sklearn import datasets
dataset = datasets.load_diabetes()
X_full, y =dataset.data, dataset.target

### Simulating a corrupted dataset by adding 25% missing values

In [34]:
m, n = X_full.shape
m_missing = int(m * 0.25)
print(m, m_missing)

442 110


### Randomly select the m_missing samples:

In [35]:
np.random.seed(42)
missing_samples = np.array([True]  * m_missing + [False] * (m - m_missing))
np.random.shuffle(missing_samples)

### Randomly select 1 out of n features for each missing sample

In [36]:
missing_features = np.random.randint(low=0, high=n, size=m_missing)

### Represent missing values wth nan: 

In [37]:
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = np.nan


### Discarding the samples with a missing value to handle the corrupted dataset

In [38]:
X_rm_missing = X_missing[~missing_samples, :]
y_rm_missing = y[~missing_samples]

### Estimate R^2 on the data set with the missing samples removed

In [39]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100)
score_rm_missing = cross_val_score(regressor, X_rm_missing, y_rm_missing).mean()
print(f'Score with the data set with missing samples removed: {score_rm_missing:.2f}')

Score with the data set with missing samples removed: 0.38


#### Input missing values into the dataset with the mean:

In [40]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_mean_imp = imp_mean.fit_transform(X_missing)

#### Measure the effects of using this strategy by estimating the avg. R^2:

In [41]:
regressor = RandomForestRegressor(random_state=42,
                                  max_depth=10,
                                  n_estimators=100)
score_mean_imp = cross_val_score(regressor, X_mean_imp, y).mean()
print(f'Score with the data set with missing values replaced by the mean: {score_mean_imp:.2f}')

Score with the data set with missing values replaced by the mean: 0.41


In [42]:
regressor = RandomForestRegressor(random_state=42,
                                  max_depth=10, 
                                  n_estimators=500)
score_full = cross_val_score(regressor, X_full, y).mean()
print(f'Score with the full data set: {score_full:.2f}')

Score with the full data set: 0.42


### In this instance little information is compromised in the imputed dataset, however there is no gaurentee that
### this strategy always works better Sometimes dropping samples with missing values can be more effective.

In [43]:
from sklearn.datasets import load_digits
dataset = load_digits() 
X, y = dataset.data, dataset.target
print(X.shape)

(1797, 64)


### Estimate the accuracy of the original dataset

In [44]:
from sklearn.svm import SVC 

In [45]:
from sklearn.model_selection import cross_val_score 
classifier = SVC(gamma=0.005, random_state=42)
score = cross_val_score(classifier, X, y).mean() 
print(f'Score with the original data set: {score:.2f}')

Score with the original data set: 0.90


In [46]:
# Feature selection with random forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1, random_state=42)
random_forest.fit(X, y)

# Sort features based on their importancies
feature_sorted = np.argsort(random_forest.feature_importances_)

In [47]:
# Select different number of top features
K = [10, 15, 25, 35, 45]
for k in K:
    top_K_features = feature_sorted[-k:]
    X_k_selected = X[:, top_K_features]
    # Estimate accuracy on the data set with k selected features
    classifier = SVC(gamma=0.005)
    score_k_features = cross_val_score(classifier, X_k_selected, y).mean()
    print(f'Score with the dataset of top {k} features: {score_k_features:.2f}')



Score with the dataset of top 10 features: 0.86
Score with the dataset of top 15 features: 0.92
Score with the dataset of top 25 features: 0.95
Score with the dataset of top 35 features: 0.93
Score with the dataset of top 45 features: 0.90


## Best practice- Deciding whether to reduce dimensionality, and if so , how:

In [48]:
from sklearn.decomposition import PCA

# Keep different number of top components
N = [10, 15, 25, 35, 45]
for n in N:
    pca = PCA(n_components=n)
    X_n_kept = pca.fit_transform(X)
    # Estimate accuracy on the data set with top n components
    classifier = SVC(gamma=0.005)
    score_n_components = cross_val_score(classifier, X_n_kept, y).mean()
    print(f'Score with the dataset of top {n} components: {score_n_components:.2f}')



Score with the dataset of top 10 components: 0.94
Score with the dataset of top 15 components: 0.95
Score with the dataset of top 25 components: 0.93
Score with the dataset of top 35 components: 0.91
Score with the dataset of top 45 components: 0.90


## Performing Feature Engineering without Domain Expertise

In [49]:
from sklearn.preprocessing import Binarizer
X = [[4], [1], [3], [0]]
binarizer = Binarizer(threshold=2.9)
X_new = binarizer.fit_transform(X)
print(X_new)

[[1]
 [0]
 [1]
 [0]]


### Polynomial Transformation

In [50]:
from sklearn.preprocessing import PolynomialFeatures
X = [[2, 4],
     [1, 3],
     [3, 2],
     [0, 3]]
poly = PolynomialFeatures(degree=2)
X_new = poly.fit_transform(X)
print(X_new)



[[ 1.  2.  4.  4.  8. 16.]
 [ 1.  1.  3.  1.  3.  9.]
 [ 1.  3.  2.  9.  6.  4.]
 [ 1.  0.  3.  0.  0.  9.]]


### Word2Vec Embedding 
#### Create and train the Word2Vec Model, and then access the word vectors

In [54]:
from gensim.models import Word2Vec

In [55]:
sentences = [
    ["i", "love" "machine", "learning", "by", "example"],
    ["machine", "learning", "and", "deep", "learning", "are", "fascinating"],
    ["word", "embedding", "is", "essential", "for", "many", "nlp", "tasks"],
    ["word2vec", "produces", "word", "embeddings"]
]

# Create and train Word2Vec model
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, sg=0)

# Access word vectors
vector = model.wv["machine"]
print("Vector for 'machine':", vector)

Vector for 'machine': [ 8.13227147e-03 -4.45733406e-03 -1.06835726e-03  1.00636482e-03
 -1.91113955e-04  1.14817743e-03  6.11386076e-03 -2.02715401e-05
 -3.24596534e-03 -1.51072862e-03  5.89729892e-03  1.51410222e-03
 -7.24261976e-04  9.33324732e-03 -4.92128357e-03 -8.38409644e-04
  9.17541143e-03  6.74942741e-03  1.50285603e-03 -8.88256077e-03
  1.14874600e-03 -2.28825561e-03  9.36823711e-03  1.20992784e-03
  1.49006362e-03  2.40640994e-03 -1.83600665e-03 -4.99963388e-03
  2.32429506e-04 -2.01418041e-03  6.60093315e-03  8.94012302e-03
 -6.74754381e-04  2.97701475e-03 -6.10765442e-03  1.69932481e-03
 -6.92623248e-03 -8.69402662e-03 -5.90020278e-03 -8.95647518e-03
  7.27759488e-03 -5.77203138e-03  8.27635173e-03 -7.24354526e-03
  3.42167495e-03  9.67499893e-03 -7.78544787e-03 -9.94505733e-03
 -4.32914635e-03 -2.68313056e-03 -2.71289347e-04 -8.83155130e-03
 -8.61755759e-03  2.80021061e-03 -8.20640661e-03 -9.06933658e-03
 -2.34046578e-03 -8.63180775e-03 -7.05664977e-03 -8.40115082e-03
 -3

## Embedding Layers in custom Neural Networks

In [58]:
import torch 
import torch.nn as nn 
input_data = torch.LongTensor([[1, 2, 3, 4], [5, 1, 6, 3]]) 
# Define the  embedding layer 
vocab_size = 10  # The total number of unique words 
embedding_dim = 3 # Dimensionality of the embedding s
embedding_layer = nn.Embedding(vocab_size, embedding_dim) 

# Pass inoput data through the embedding layer 
embedded_data = embedding_layer(input_data) 

# Print the embedded data 
print("Embedded Data:\n", embedded_data)


Embedded Data:
 tensor([[[ 0.1383,  0.3442, -1.5209],
         [-1.5797, -2.4281,  0.3292],
         [-1.6865,  0.4060, -0.1273],
         [ 1.2118,  0.4645,  0.1170]],

        [[ 2.2486,  0.8610, -1.7351],
         [ 0.1383,  0.3442, -1.5209],
         [ 1.8524, -0.6934,  0.2976],
         [-1.6865,  0.4060, -0.1273]]], grad_fn=<EmbeddingBackward0>)


# Best Practices in the Deployment and Monitoring Stage 
## Best Practice 19: Saving, Loading, and Reusing Models 
#### Saving and Restoring Models using Pickle

In [60]:
from sklearn import datasets
dataset = datasets.load_diabetes()
X, y = dataset.data, dataset.target

num_new = 30    # the last 30 samples as new data set
X_train = X[:-num_new, :]
y_train = y[:-num_new]
X_new = X[-num_new:, :]
y_new = y[-num_new:]

In [61]:
# Data pre-processing
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)



In [62]:
import pickle 
# Save the scalar 
pickle.dump(scaler, open("scaler.p", "wb"))

In [63]:
X_scaled_train = scaler.transform(X_train)

In [64]:
# Regression model training
from sklearn.svm import SVR
regressor = SVR(C=20)
regressor.fit(X_scaled_train, y_train)

In [65]:
# Save the regressor
pickle.dump(regressor, open("regressor.p", "wb"))



In [66]:
# Deployment
my_scaler = pickle.load(open("scaler.p", "rb" ))
my_regressor = pickle.load(open("regressor.p", "rb"))

In [67]:
X_scaled_new = my_scaler.transform(X_new)
predictions = my_regressor.predict(X_scaled_new)

In [68]:
# Monitor
from sklearn.metrics import r2_score
print(f'Health check on the model, R^2: {r2_score(y_new, predictions):.3f}')



Health check on the model, R^2: 0.613


## Saving and Restoring Models in TensorFlow

In [69]:
import tensorflow as tf
from tensorflow import keras

cancer_data = datasets.load_breast_cancer()
X = cancer_data.data
X = scaler.fit_transform(X)
y = cancer_data.target



In [70]:
learning_rate = 0.005
n_iter = 10

tf.random.set_seed(42)

model = keras.Sequential([
    keras.layers.Dense(units=1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate))

In [71]:
model.fit(X, y, epochs=n_iter)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e957a6bd60>

In [72]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 31        
                                                                 
Total params: 31
Trainable params: 31
Non-trainable params: 0
_________________________________________________________________


In [73]:
path = './model_tf'
model.save(path )

INFO:tensorflow:Assets written to: ./model_tf\assets


In [74]:
new_model = tf.keras.models.load_model(path)
new_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 31        
                                                                 
Total params: 31
Trainable params: 31
Non-trainable params: 0
_________________________________________________________________


### Saving and Restoring Models in PyTorch

In [75]:
X_torch = torch.FloatTensor(X)
y_torch = torch.FloatTensor(y.reshape(y.shape[0], 1))

In [76]:
torch.manual_seed(42)
 
model = nn.Sequential(nn.Linear(X.shape[1], 1),
                      nn.Sigmoid())
 
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [77]:
def train_step(model, X_train, y_train, loss_function, optimizer):
    pred_train = model(X_train)
    loss = loss_function(pred_train, y_train)
    model.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()


for epoch in range(n_iter):
    loss = train_step(model, X_torch, y_torch, loss_function, optimizer)
    print(f"Epoch {epoch} - loss: {loss}")

Epoch 0 - loss: 0.8387020826339722
Epoch 1 - loss: 0.7999904751777649
Epoch 2 - loss: 0.76298588514328
Epoch 3 - loss: 0.7277476787567139
Epoch 4 - loss: 0.6943162679672241
Epoch 5 - loss: 0.662708044052124
Epoch 6 - loss: 0.6329135298728943
Epoch 7 - loss: 0.6048969030380249
Epoch 8 - loss: 0.5786024332046509
Epoch 9 - loss: 0.5539639592170715


In [78]:
print(model)

Sequential(
  (0): Linear(in_features=30, out_features=1, bias=True)
  (1): Sigmoid()
)


In [79]:
path = './model_pth' 
torch.save(model, path)

In [80]:
new_model = torch.load(path) 
print(new_model)

Sequential(
  (0): Linear(in_features=30, out_features=1, bias=True)
  (1): Sigmoid()
)
