<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Split-a-dataset-using-random-sampling" data-toc-modified-id="Split-a-dataset-using-random-sampling-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Split a dataset using random sampling</a></span></li><li><span><a href="#Split-a-dataset-using-random-sampling-and-hashing-technique" data-toc-modified-id="Split-a-dataset-using-random-sampling-and-hashing-technique-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Split a dataset using random sampling and hashing technique</a></span></li><li><span><a href="#Split-using-stratified-sampling" data-toc-modified-id="Split-using-stratified-sampling-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Split using stratified sampling</a></span></li><li><span><a href="#Build-a-correlation-matrix-using-Pearson's-correlation-coefficient" data-toc-modified-id="Build-a-correlation-matrix-using-Pearson's-correlation-coefficient-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Build a correlation matrix using Pearson's correlation coefficient</a></span></li><li><span><a href="#Build-scatter-matrix-of-numerical-attributes" data-toc-modified-id="Build-scatter-matrix-of-numerical-attributes-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Build scatter matrix of numerical attributes</a></span></li><li><span><a href="#Imputer" data-toc-modified-id="Imputer-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Imputer</a></span></li><li><span><a href="#Modifying-a-text-categorical-column-to-integer-categorical-column" data-toc-modified-id="Modifying-a-text-categorical-column-to-integer-categorical-column-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Modifying a text categorical column to integer categorical column</a></span></li><li><span><a href="#Modifying-an-integer-categorical-column-to-one-hot-encoding" data-toc-modified-id="Modifying-an-integer-categorical-column-to-one-hot-encoding-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Modifying an integer categorical column to 
one hot encoding</a></span></li><li><span><a href="#Modifying-a-text-categorical-column-to-onehotencoding" data-toc-modified-id="Modifying-a-text-categorical-column-to-onehotencoding-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Modifying a text categorical column to onehotencoding</a></span></li><li><span><a href="#sklearn's-cross-validator" data-toc-modified-id="sklearn's-cross-validator-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>sklearn's cross validator</a></span></li><li><span><a href="#Display-cross-validation-scores" data-toc-modified-id="Display-cross-validation-scores-11"><span class="toc-item-num">11&nbsp;&nbsp;</span>Display cross validation scores</a></span></li></ul></div>

# Create a test set

## Split a dataset using random sampling

In [None]:
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a train set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Any object supporting ``len()`` and positional ``.iloc`` indexing.
    test_ratio : float
        Fraction of the rows to put in the test set. The test-set size is
        ``int(len(data) * test_ratio)``, i.e. it is truncated.
    seed : int, optional
        Seed for the shuffle. The default (42) is the value that used to be
        hard-coded, so existing two-argument calls get the same split.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` selected from ``data`` with ``.iloc``.
    """
    # A private RandomState produces the same permutation as seeding the
    # global generator with `seed`, without clobbering the global RNG state
    # for the rest of the notebook.
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

## Split a dataset using random sampling and hashing technique

In [None]:
import hashlib
def test_set_check(identifier, test_ratio, hash):
    """Decide from a hash of ``identifier`` whether a row goes to the test set.

    ``identifier`` is converted to ``np.int64`` and passed to ``hash`` (a
    hashlib-style constructor such as ``hashlib.md5``). The row belongs to
    the test set when the last byte of the digest is below
    ``256 * test_ratio``.
    """
    digest = hash(np.int64(identifier)).digest()
    last_byte = digest[-1]
    threshold = 256 * test_ratio
    return last_byte < threshold

def split_train_test_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split ``data`` into train/test sets based on a hash of each row's id.

    Because membership depends only on the identifier value, a given row
    lands in the same set every time this is run.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    test_ratio : float
        Approximate fraction of rows assigned to the test set.
    id_column : str
        Name of the column in ``data`` holding the identifiers.
    hash : callable, optional
        hashlib-style constructor forwarded to ``test_set_check``
        (default ``hashlib.md5``).

    Returns
    -------
    tuple
        ``(train_set, test_set)`` selected from ``data`` with ``.loc``.
    """
    ids = data[id_column]
    # astype(bool) keeps the result a boolean mask so that `~` is a logical
    # negation regardless of the dtype `apply` infers (e.g. on empty input).
    in_test_set = ids.apply(
        lambda id_: test_set_check(id_, test_ratio, hash)
    ).astype(bool)
    return data.loc[~in_test_set], data.loc[in_test_set]

## Split using stratified sampling

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# A single shuffled 80/20 split, stratified on the "income_cat" column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df_housing, df_housing["income_cat"]):
    # split() yields positional row numbers, so select with .iloc; .loc would
    # only give the same rows when the index is the default 0..n-1 range.
    strat_train_set = df_housing.iloc[train_index]
    strat_test_set = df_housing.iloc[test_index]

# You can also use StratifiedKFold; each fold's test share is 1/n_splits,
# so 2 splits give a 50/50 split rather than the 80/20 split above.

# Correlation

## Build a correlation matrix using Pearson's correlation coefficient

In [None]:
# Pairwise correlation of the columns of df; "pearson" is also the default method.
df.corr(method="pearson")

## Build scatter matrix of numerical attributes

In [None]:
from pandas.plotting import scatter_matrix
# columns_list: the column names to include in the grid of pairwise scatter plots
scatter_matrix(frame=df[columns_list], figsize=(12, 8))

# Fill Missing Values

## Imputer

In [None]:
# sklearn.preprocessing.Imputer has been removed from scikit-learn;
# sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer

# Strategy names are lowercase ("mean", "median", "most_frequent", "constant").
imputer = SimpleImputer(strategy="median")
# Learn the per-column medians from the training data only, then apply the
# same fitted values to both sets so no test-set statistics leak into training.
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# Handling categorical data

## Modifying a text categorical column to integer categorical column

In [None]:
# factorize() returns a pair: the integer code assigned to each value of
# df_column, and the distinct categories those codes refer to.
df_column_encoded, df_column_categories = df_column.factorize()

## Modifying an integer categorical column to one hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# The encoder expects a 2-D array of shape (n_samples, n_features), so the
# 1-D array of category codes becomes a single column: reshape(-1, 1).
# NOTE(review): assumes df_cat_col is a NumPy array (e.g. the codes returned
# by factorize()); a pandas Series would need .values first — confirm.
df_cat_col_1hot = encoder.fit_transform(df_cat_col.reshape(-1, 1))
# fit_transform returns a sparse matrix by default; toarray() gives a dense view.
df_cat_col_1hot.toarray()

## Modifying a text categorical column to onehotencoding

In [None]:
# CategoricalEncoder never shipped in a released scikit-learn; OneHotEncoder
# accepts text categories directly and covers the same use case.
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
# Reshape to one column: the encoder expects (n_samples, n_features).
df_cat_1hot = cat_encoder.fit_transform(df_cat_column.values.reshape(-1, 1))
# The return type is a sparse matrix; convert it when a dense array is needed.
# toarray() is used instead of a constructor flag because the name of that
# flag differs between scikit-learn versions.
df_cat_1hot = df_cat_1hot.toarray()


# Cross Validation

## sklearn's cross validator

In [None]:
# cross_val_score expects a utility function for scoring (greater is better) rather than a cost function (lower is better), e.g. "neg_mean_squared_error"
from sklearn.model_selection import cross_val_score
# Template call: model_instance, training_features, training_label and
# no_of_folds are placeholders; replace "utility_function" with a real
# scoring name before running.
scores = cross_val_score(
    estimator=model_instance,
    X=training_features,
    y=training_label,
    scoring="utility_function",
    cv=no_of_folds,
)


## Display cross validation scores

In [None]:
def display_cross_val_scores(scores):
    """Print cross-validation scores with their mean and standard deviation.

    Parameters
    ----------
    scores : numpy.ndarray
        Array of per-fold scores; any object exposing ``.mean()`` and
        ``.std()`` works.
    """
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard Deviation: ", scores.std())

# Grid Search

In [None]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values; every combination is evaluated.
grid_param_ = {'kernel': ['poly', 'rbf'], 'C': [0.1, 1, 10]}

# Wrap f1_score so GridSearchCV can use it as the scoring function.
scorer = make_scorer(f1_score)

# Create the object. `clf`, `X` and `y` are placeholders for the estimator to
# tune and its training data; they must be defined before running this cell.
grid_obj = GridSearchCV(clf, grid_param_, scoring=scorer)
# Fit the data
grid_fit = grid_obj.fit(X, y)

In [None]:
# For every parameter combination tried, print the square root of the negated
# mean validation score, the same for the mean train score, and the parameters.
# NOTE(review): "mean_train_score" is presumably only available when the search
# was created with return_train_score=True — confirm where grid_search is built.
res = grid_search.cv_results_
for mean_valid_score, mean_tr_score, params in zip(
    res["mean_test_score"], res["mean_train_score"], res["params"]
):
    valid_root = np.sqrt(-mean_valid_score)
    train_root = np.sqrt(-mean_tr_score)
    print(valid_root, train_root, params)

In [None]:
# Only the last expression of a cell is displayed, so the score is printed
# explicitly; as a bare expression it was computed and silently discarded.
print("sqrt(-best_score_):", np.sqrt(-grid_search.best_score_))
grid_search.best_estimator_

In [None]:
# Cross-validate the best estimator found by the grid search (5 folds).
svm_reg = grid_search.best_estimator_
scores = cross_val_score(
    svm_reg,
    X_train_prepared,
    y_train,
    scoring="neg_mean_squared_error",
    cv=5,
)
# The scores are negated MSE values: flip the sign and take the root to get RMSE.
svm_reg_scores = np.sqrt(-scores)
print("rmse", np.mean(svm_reg_scores))
np.std(svm_reg_scores)

# Learning Curve

In [None]:
# It is good to randomize the data before drawing Learning Curves
def randomize(X, Y):
    """Shuffle the rows of ``X`` and the entries of ``Y`` with one permutation.

    A single random permutation of ``range(Y.shape[0])`` is drawn from the
    global NumPy generator and applied to both arrays, so row ``i`` of the
    returned ``X`` still corresponds to entry ``i`` of the returned ``Y``.

    Returns
    -------
    tuple
        ``(X_shuffled, Y_shuffled)``.
    """
    order = np.random.permutation(Y.shape[0])
    return X[order, :], Y[order]

# Shuffle X and y with the same permutation so rows and labels stay aligned.
X2, y2 = randomize(X, y)

In [None]:
#IMPROVE IT https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
from sklearn.model_selection import learning_curve

def draw_learning_curves(X, y, estimator, num_trainings):
    """Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    X, y : array-like
        Features and labels passed to ``learning_curve``. Pass already
        shuffled data if the rows are ordered.
    estimator : object
        The estimator handed to ``learning_curve``.
    num_trainings : int
        Number of training-set sizes to evaluate, spaced evenly between
        10% and 100% of the available training data.

    The figure is drawn on the current matplotlib figure and shown with
    ``plt.show()``; nothing is returned.
    """
    # Use the arguments that were passed in, not notebook-level globals.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=None, n_jobs=1,
        train_sizes=np.linspace(.1, 1.0, num_trainings))

    # The score arrays have one row per training size; aggregate across axis 1.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()

    plt.title("Learning Curves")
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # Shaded bands show +/- one standard deviation around each mean curve.
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="g")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="y")

    # Plot against the actual number of training examples so the x-axis
    # matches its label (previously the point index 0..n-1 was used).
    plt.plot(train_sizes, train_scores_mean, 'o-', color="g",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="y",
             label="Cross-validation score")

    plt.legend(loc="best")

    plt.show()