# **I'm writing this notebook to highlight some getting-started tips for ML, taken from AAAMLP written by Abhishek Thakur. You'll find the link to the book at the end (I highly recommend reading it).**

# **This notebook covers some important topics such as cross validation, evaluation metrics, feature engineering and the handling of categorical values...**

In [None]:
# importing almost all the libraries that we need for preprocessing and for modelling..
import numpy as np
import pandas as pd

# for visualization
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# for modelling 
from sklearn import datasets
from sklearn import tree # all tree based model
from sklearn import metrics # all evaluation metrics
from sklearn import model_selection # for validation
from sklearn import preprocessing # for encoding and preprocessing
from sklearn import impute # to fill missing values

# for feature engineering
from tsfresh.feature_extraction import feature_calculators as fc

# just to block warnings...
import warnings
warnings.simplefilter(action = 'ignore')

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# **CROSS VALIDATION**

In [None]:
# cross validation : cross-validation is a step in the process of building a machine learning model which
#                    helps us ensure that our models fit the data accurately and also ensures that we do not overfit.

# there are many types of cross-validation available :

# 1. hold-out based validation
# 2. k-fold cross validation
# 3. stratified k-fold cross validation
# 4. leave-one-out cross validation
# 5. group k-fold cross validation

# Let's implement some of the cross validation

In [None]:
# import CSV file
# Load the red-wine quality data set and preview its first rows.
wine_csv_path = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(wine_csv_path)
df.head()

In [None]:
# list the distinct values that occur in the quality column
df.quality.unique()

In [None]:
# Shift the raw quality grades (3..8) down to zero-based class labels (0..5).
quality_map = {grade: grade - 3 for grade in range(3, 9)}

df.loc[:, 'quality'] = df['quality'].map(quality_map)

In [None]:
# hold-out based validation

# Shuffle the rows so the split does not depend on the order of the file.
df = df.sample(frac=1).reset_index(drop=True)

# The first 1000 shuffled rows train the model; every remaining row is held out.
# Slicing both parts at the same boundary guarantees that the two sets never
# overlap and that no row is dropped, whatever the length of the data set.
n_train = 1000
df_train = df.iloc[:n_train]
df_test = df.iloc[n_train:]

In [None]:
# Feature columns used to predict the wine quality.
cols = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Fit a depth-limited decision tree on the training split.
dec_tree = tree.DecisionTreeClassifier(max_depth=7)
dec_tree.fit(df_train[cols], df_train['quality'])

In [None]:
# Accuracy on the rows the tree was fitted on.
train_pred = dec_tree.predict(df_train[cols])
train_acc = metrics.accuracy_score(df_train['quality'], train_pred)

# Accuracy on the held-out rows.
test_pred = dec_tree.predict(df_test[cols])
test_acc = metrics.accuracy_score(df_test['quality'], test_pred)

print('train_accuracy : ', train_acc, ' test_accuracy : ', test_acc)

In [None]:
# Both lists start with a placeholder value, so that after the loop the entry
# at list position i is the accuracy (in %) obtained with max_depth == i.
train_accuracy = [50]
test_accuracy = [50]

# Try every depth and pick the most suitable one.
# If you don't know anything about decision trees, have a look at this :
# https://www.kaggle.com/karad1818/all-about-decision-tree-from-scratch

for depth in range(1, 25):
    dec_tree = tree.DecisionTreeClassifier(max_depth=depth)
    dec_tree.fit(df_train[cols], df_train['quality'])

    train_pred = dec_tree.predict(df_train[cols])
    train_acc = metrics.accuracy_score(df_train['quality'], train_pred)
    train_accuracy.append(train_acc * 100)

    test_pred = dec_tree.predict(df_test[cols])
    test_acc = metrics.accuracy_score(df_test['quality'], test_pred)
    test_accuracy.append(test_acc * 100)

In [None]:
# Draw the train and test accuracy recorded for every tree depth.
plt.figure(figsize=(10, 5))
for scores, curve_label in ((train_accuracy, 'Train'), (test_accuracy, 'Test')):
    plt.plot(scores, label=curve_label)
plt.legend(loc="upper left", prop={'size': 15})
plt.xlabel("max_depth", size=20)
plt.ylabel("accuracy in %", size=20)
plt.show()

In [None]:
if __name__ == '__main__':
    # Start again from the raw file.
    df = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

    # New column; -1 marks a row that has not been given a fold yet.
    df['kfold'] = -1

    # Shuffle and rebuild a 0..n-1 index, so that the positions returned by
    # the splitter can be used directly as row labels below.
    df = df.sample(frac=1).reset_index(drop=True)

    splitter = model_selection.KFold(n_splits=5)
    for fold_id, (_train_idx, val_idx) in enumerate(splitter.split(X=df)):
        # rows in the validation part of this split belong to fold `fold_id`
        df.loc[val_idx, 'kfold'] = fold_id

    # To persist or inspect the result:
    # df.to_csv('train_fold.csv' , index=False)
    # df.tail()

In [None]:
# with simple k-fold, if we have 90% positive examples and 10% negative examples, it is possible that one fold contains all the negatives and the rest contain only positives
# to avoid that we use stratified k-fold, which keeps the class ratio the same in every fold

if __name__ == '__main__':
    # Start again from the raw file.
    df = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

    # New column; -1 marks a row that has not been given a fold yet.
    df['kfold'] = -1

    # Shuffle and rebuild a 0..n-1 index, so that the positions returned by
    # the splitter can be used directly as row labels below.
    df = df.sample(frac=1).reset_index(drop=True)

    # The splitter stratifies on the quality labels.
    y = df['quality'].values
    splitter = model_selection.StratifiedKFold(n_splits=5)
    for fold_id, (_train_idx, val_idx) in enumerate(splitter.split(X=df, y=y)):
        df.loc[val_idx, 'kfold'] = fold_id

    # To persist or inspect the result:
    # df.to_csv('train_fold.csv' , index=False)
    # df.tail()

In [None]:
# Rule : if it's standard classification problem then use stratified K-fold
# But if we have a larger amount of data (1M) then we can go for hold-out based validation

In [None]:
# For regression we can use all the strategies discussed earlier except stratified k-fold.
# To use stratified k-fold we first have to divide the target into bins, and then stratify on those bins.
# How many bins should we create ? --> with a lot of data (10K, 100K samples) we can simply use 10, 20, ... bins, but with a smaller amount of data we should follow
# Sturges' rule : number of bins = 1 + log2(N)
def create_fold(data):
    """Assign stratified 5-fold labels to a regression data set.

    The continuous ``target`` column is cut into equal-width bins (their
    number comes from Sturges' rule, ``1 + log2(N)``) and the bin numbers are
    used as the classes on which ``StratifiedKFold`` stratifies.

    Args:
        data: DataFrame with a numeric ``target`` column. It is not modified.

    Returns:
        A shuffled copy of ``data`` with a fresh 0..N-1 index and an extra
        ``kfold`` column holding the fold number (0-4) of every row.
    """
    # sample() returns a new frame, so everything below works on a copy and
    # the caller's DataFrame keeps its columns and row order.
    data = data.sample(frac=1).reset_index(drop=True)
    data['kfold'] = -1

    num_bin = int(np.floor(1 + np.log2(len(data))))

    # The bins are only needed for stratification, so they are kept in a
    # separate Series instead of a temporary column.
    bins = pd.cut(data['target'], bins=num_bin, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=5)
    for fold, (_train_idx, val_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[val_idx, 'kfold'] = fold

    return data

if __name__ == '__main__':
    # Synthetic regression problem: 10000 samples, 100 features, one target.
    features, target = datasets.make_regression(n_samples=10000, n_features=100, n_targets=1)

    column_names = [f"col_{i}" for i in range(features.shape[1])]
    df = pd.DataFrame(features, columns=column_names)
    df.loc[:, 'target'] = target

    df = create_fold(df)

In [None]:
# Let's implement cross validation in MNIST : https://www.kaggle.com/karad1818/mnist

# **ALL METRICS FOR EVALUATION**

In [None]:
# Evaluation Metrics : We can check how good our model is, using evaluation metrics..

# classification Metrics :
# 1. Accuracy
# 2. Precision
# 3. Recall
# 4. F1 Score
# 5. Area under the ROC (Receiver Operating Characteristic) curve or simply AUC
# 6. log loss
# 7. Precision at k
# 8. Average Precision at k
# 9. Mean Average Precision at k

# Regression Metrics:
# 1. Mean absolute error
# 2. Mean square error
# 3. Root mean square error
# 4. Root mean squared logarithmic error
# 5. Mean percentage error
# 6. Mean absolute percentage error
# 7. R^2


In [None]:
# In binary classification, when we have equal number of positive and equal number of negative then we'll use Accuracy , Precision , Recall and f1 score

# accuracy is just simple if 88 images got correct prediction then accuracy will be 88%

def accuracy(y_true, y_pred):
    """Return the fraction of samples whose prediction equals the true label.

    Labels and predictions are compared pairwise; the number of matches is
    divided by the number of true labels.
    """
    hits = sum(1 for actual, predicted in zip(y_true, y_pred) if actual == predicted)
    return hits / len(y_true)

# Toy true labels (l1) and predictions (l2).
l1 = [0, 1, 1, 1, 0, 0, 0, 1]
l2 = [0, 1, 0, 1, 0, 1, 0, 0]

# Print the hand-written score first, then scikit-learn's, for comparison.
for score_fn in (accuracy, metrics.accuracy_score):
    print(score_fn(l1, l2))

In [None]:
# if your dataset is skewed (the number of positives is much higher than the number of negatives, or vice versa) then it is not recommended to use accuracy
# In this case it's better to use precision
# Before that let's learn some terminology :

# True positive (TP) : if model predict positive and correct value is also positive , it is considered as True Positive
# True negative (TN) : if model predict negative and correct value is also negative , it is considered as True Negative
# False positive (FP) : if model predict positive and correct value is negative , it is considered as False Positive
# False negative (FN) : if model predict negative and correct value is positive , it is considered as False Negative

In [None]:
def true_positive(y_true, y_pred):
    """Count the samples whose true label is 1 and whose prediction is 1."""
    return sum(
        1
        for actual, predicted in zip(y_true, y_pred)
        if actual == 1 and predicted == 1
    )

def true_negative(y_true, y_pred):
    """Count the samples whose true label is 0 and whose prediction is 0."""
    return sum(
        1
        for actual, predicted in zip(y_true, y_pred)
        if actual == 0 and predicted == 0
    )

def false_positive(y_true, y_pred):
    """Count the samples whose true label is 0 but whose prediction is 1."""
    return sum(
        1
        for actual, predicted in zip(y_true, y_pred)
        if actual == 0 and predicted == 1
    )
        
def false_negative(y_true, y_pred):
    """Count the samples whose true label is 1 but whose prediction is 0."""
    return sum(
        1
        for actual, predicted in zip(y_true, y_pred)
        if actual == 1 and predicted == 0
    )

# accuarcy score = (TP + TN) / (TP + TN + FP + FN)

def accuracy_v2(y_true, y_pred):
    """Return binary accuracy computed from the four confusion counts.

    accuracy = (TP + TN) / (TP + TN + FP + FN)
    """
    correct = true_positive(y_true, y_pred) + true_negative(y_true, y_pred)
    wrong = false_positive(y_true, y_pred) + false_negative(y_true, y_pred)
    return correct / (correct + wrong)


l1 = [0, 1, 1, 1, 0, 0, 0, 1]
l2 = [0, 1, 0, 1, 0, 1, 0, 0]

# Print both hand-written versions and then scikit-learn's, for comparison.
for score_fn in (accuracy, accuracy_v2, metrics.accuracy_score):
    print(score_fn(l1, l2))

In [None]:
# Precision : TP / (TP + FP)
# let's say in skewed data 80 out of 90 negative classified correctly and 8 out of 10 positive classified correctly
# thus our accuracy will be 88% but precision will be 8 / (8 + 10) = 44%

def precision(y_true, y_pred):
    """Return binary precision, TP / (TP + FP).

    Args:
        y_true: iterable of true labels (0 or 1).
        y_pred: iterable of predicted labels (0 or 1).

    Returns:
        The fraction of positive predictions that are correct, or 0.0 when
        nothing was predicted positive.
    """
    TP = true_positive(y_true, y_pred)
    FP = false_positive(y_true, y_pred)
    predicted_positive = TP + FP
    if predicted_positive == 0:
        # Precision is undefined without positive predictions; report 0.0
        # (the value scikit-learn returns by default) instead of raising
        # ZeroDivisionError.
        return 0.0
    return TP / predicted_positive


print(precision(l1, l2))
print(metrics.precision_score(l1, l2))

In [None]:
# Recall = TP / (TP + FN)
# in above example recall = 8/ (8 + 2)

def recall(y_true, y_pred):
    """Return binary recall, TP / (TP + FN).

    Args:
        y_true: iterable of true labels (0 or 1).
        y_pred: iterable of predicted labels (0 or 1).

    Returns:
        The fraction of positive samples that were predicted positive, or
        0.0 when there are no positive samples.
    """
    TP = true_positive(y_true, y_pred)
    FN = false_negative(y_true, y_pred)
    actual_positive = TP + FN
    if actual_positive == 0:
        # Recall is undefined without positive samples; report 0.0 (the
        # value scikit-learn returns by default) instead of raising
        # ZeroDivisionError.
        return 0.0
    return TP / actual_positive


print(recall(l1, l2))
print(metrics.recall_score(l1, l2))

# For good precision and recall we want both FP and FN to be low, because in some problems classifying a positive sample as negative (or vice versa) is penalized heavily


In [None]:
# precision - recall curve :

# Most of the models predict probability of getting 1 or 0 in this we always choose threshold like 0.5 in most cases but it's not always useful to choose
# 0.5 as a threshold so depending on this threshold, your value of precision and recall can change drastically.

# lets's look at one example here y_pred is a probability of getting 1
# True labels and, for every sample, the predicted probability of being 1.
y_true = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
y_pred = [
    0.02638412, 0.11114267, 0.31620708, 0.0490937, 0.0191491,
    0.17554844, 0.15952202, 0.03819563, 0.11639273, 0.079377,
    0.08584789, 0.39095342, 0.27259048, 0.03447096, 0.04644807,
    0.03543574, 0.18521942, 0.05934905, 0.61977213, 0.33056815,
]

# Cut-offs at which the probabilities are turned into hard 0/1 predictions.
thresholds = [
    0.0490937, 0.05934905, 0.079377, 0.08584789, 0.11114267,
    0.11639273, 0.15952202, 0.17554844, 0.18521942, 0.27259048,
    0.31620708, 0.33056815, 0.39095342, 0.61977213,
]

# One (recall, precision) point per threshold.
recall_l = []
precision_l = []
for cutoff in thresholds:
    hard_labels = [int(prob >= cutoff) for prob in y_pred]
    recall_l.append(metrics.recall_score(y_true, hard_labels))
    precision_l.append(metrics.precision_score(y_true, hard_labels))

plt.figure(figsize=(7, 7))
plt.plot(recall_l, precision_l)
plt.xlabel('Recall', fontsize=15)
plt.ylabel('Precision', fontsize=15)

In [None]:
# You will notice that it’s challenging to choose a value of threshold that gives both
# good precision and recall values. If the threshold is too high, you have a smaller
# number of true positives and a high number of false negatives. This decreases your
# recall; however, your precision score will be high. If you reduce the threshold too
# low, false positives will increase a lot, and precision will be less.

# F1 score : it combines precision and recall together , it's just harmonic mean of both

# F1 : 2PR / (P + R)
# so it will converted to F1 = 2*TP / (2TP + FP + FN)

def f1(y_true, y_pred):
    """Return the binary F1 score, the harmonic mean of precision and recall.

    Args:
        y_true: iterable of true labels (0 or 1).
        y_pred: iterable of predicted labels (0 or 1).

    Returns:
        2PR / (P + R), or 0.0 when precision and recall are both zero.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    if p + r == 0:
        # No true positive at all: the harmonic mean would divide by zero,
        # so report the worst score instead.
        return 0.0
    return 2 * p * r / (p + r)


print(f1(l1, l2))
print(metrics.f1_score(l1, l2))

# Instead of looking at precision and recall individually, you can also just look at F1
# score. Same as for precision, recall and accuracy, F1 score also ranges from 0 to 1,
# and a perfect prediction model has an F1 of 1. When dealing with datasets that have
# skewed targets, we should look at F1 (or precision and recall) instead of accuracy

In [None]:
# there are other metrics as well :
# TPR (true positive rate) which is same as recall , it is also known as sensitivity

def tpr(y_true , y_pred):
    """Return the true positive rate, obtained by delegating to ``recall``."""
    return recall(y_true , y_pred)

# FPR (false positive rate)  = FP / (FP + TN)

def fpr(y_true, y_pred):
    """Return the false positive rate, FP / (FP + TN).

    Args:
        y_true: iterable of true labels (0 or 1).
        y_pred: iterable of predicted labels (0 or 1).

    Returns:
        The fraction of negative samples that were predicted positive, or
        0.0 when there are no negative samples.
    """
    FP = false_positive(y_true, y_pred)
    TN = true_negative(y_true, y_pred)
    actual_negative = FP + TN
    if actual_negative == 0:
        # Without negative samples the rate is undefined; report 0.0 instead
        # of raising ZeroDivisionError.
        return 0.0
    return FP / actual_negative

# 1-FPR is also known as specificity or True negative rate or TNR

# true labels
y_true = [0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1]

# predicted probability of each sample being 1
y_pred = [0.1, 0.3, 0.2, 0.6, 0.8, 0.05, 0.9, 0.5, 0.3, 0.66, 0.3, 0.2, 0.85, 0.15, 0.99]

# cut-offs at which the probabilities are turned into hard 0/1 predictions
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.99, 1.0]

# one (FPR, TPR) point per threshold
fpr_l = []
tpr_l = []
for cutoff in thresholds:
    hard_labels = [int(prob >= cutoff) for prob in y_pred]
    fpr_l.append(fpr(y_true, hard_labels))
    tpr_l.append(tpr(y_true, hard_labels))

# shade the area under the curve, then draw the curve itself
plt.figure(figsize=(10, 10))
plt.fill_between(fpr_l, tpr_l, alpha=0.4)
plt.xlim(0, 1.0)
plt.ylim(0, 1.0)
plt.xlabel('FPR', fontsize=15)
plt.ylabel('TPR', fontsize=15)
plt.plot(fpr_l, tpr_l)
plt.show()

In [None]:
# The TPR vs FPR curve is known as the receiver operating characteristic (ROC) curve. The area under it is another metric,
# AUC (area under the curve / area under the ROC curve), which is used very often when a dataset has skewed binary targets.
y_true = [0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1]
y_pred = [0.1, 0.3, 0.2, 0.6, 0.8, 0.05, 0.9, 0.5, 0.3, 0.66, 0.3, 0.2, 0.85, 0.15, 0.99]

auc_value = metrics.roc_auc_score(y_true, y_pred)
print(auc_value)

In [None]:
# AUC = 1 implies you have a perfect model. Most of the time, it means that
# you made some mistake with validation and should revisit data processing
# and validation pipeline of yours. If you didn’t make any mistakes, then
# congratulations, you have the best model one can have for the dataset you
# built it on.

# AUC = 0 implies that your model is very bad (or very good!). Try inverting
# the probabilities for the predictions, for example, if your probability for the
# positive class is p, try substituting it with 1-p. This kind of AUC may also
# mean that there is some problem with your validation or data processing.

# AUC = 0.5 implies that your predictions are random. So, for any binary
# classification problem, if I predict all targets as 0.5, I will get an AUC of
# 0.5.

# what does AUC say about your model ? 
# let's say the AUC is 0.85: that means that if you randomly select one positive sample and one negative sample, the positive sample will rank higher than the negative one with a probability of 0.85

# you can use the ROC curve to choose this threshold!
# true labels
y_true = [0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1]

# predicted probability of each sample being 1
y_pred = [0.1, 0.3, 0.2, 0.6, 0.8, 0.05, 0.9, 0.5, 0.3, 0.66, 0.3, 0.2, 0.85, 0.15, 0.99]

# cut-offs at which the probabilities are turned into hard 0/1 predictions
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.99, 1.0]

# number of true positives and false positives obtained at every threshold
tp_l = []
fp_l = []
for cutoff in thresholds:
    hard_labels = [int(prob >= cutoff) for prob in y_pred]
    tp_l.append(true_positive(y_true, hard_labels))
    fp_l.append(false_positive(y_true, hard_labels))

for row in (tp_l, fp_l, thresholds):
    print(row)

# it's always better to choose top-left threshold from ROC curve


![](https://i.ibb.co/8XTp67v/aaaaa.png)


In [None]:
# log loss metric :
# log loss = -1 * (target * log(prediction) + (1-target)*log(1-prediction))  in logistic known as negative log likelihood

#  One thing to remember is that log loss penalizes quite high for an incorrect or a far-off prediction

def log_loss(y_true, y_pred):
    """Return the mean binary log loss (negative log likelihood).

    Every probability is clipped to [1e-15, 1 - 1e-15] before the logarithm
    is taken, so predictions of exactly 0 or 1 give a finite loss.
    """
    eps = 1e-15
    clipped = (np.clip(prob, eps, 1 - eps) for prob in y_pred)
    per_sample = [
        -1.0 * (target * np.log(prob) + (1 - target) * np.log(1 - prob))
        for target, prob in zip(y_true, clipped)
    ]
    return np.mean(per_sample)

# True labels and predicted probabilities of each sample being 1.
y_true = [0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1]
y_pred = [0.1, 0.3, 0.2, 0.6, 0.8, 0.05, 0.9, 0.5, 0.3, 0.66, 0.3, 0.2, 0.85, 0.15, 0.99]

# Print the hand-written loss first, then scikit-learn's, for comparison.
for loss_fn in (log_loss, metrics.log_loss):
    print(loss_fn(y_true, y_pred))

In [None]:
# Most of the metrics that we calculated can be generalized for multiclass classification as well.
# Let's do this for precision , 
# There are three different ways of doing it :
# 1. Macro averaged precision : calculate for each class and then take average of it
# 2. Micro averaged precision : calculate class wise TP and FP and then use that to calculate overall precision
# 3. Wighted precision : same as Macro but here we'll take weighted average depending upon number of items in each class
from collections import Counter
def macro_precision(y_true, y_pred):
    """Return the macro-averaged precision of a multiclass prediction.

    Precision is computed one-vs-rest for every class present in ``y_true``
    and the per-class values are averaged with equal weight.

    Args:
        y_true: sequence of true class labels.
        y_pred: sequence of predicted class labels.

    Returns:
        The unweighted mean of the per-class precisions.
    """
    # Iterate over the labels that actually occur, so the classes do not
    # have to be numbered 0..n-1.
    classes = np.unique(y_true)
    total = 0
    for class_ in classes:
        # one-vs-rest view: the current class is positive, all others negative
        temp_true = [1 if label == class_ else 0 for label in y_true]
        temp_pred = [1 if label == class_ else 0 for label in y_pred]
        total += precision(temp_true, temp_pred)
    return total / len(classes)

def micro_precision(y_true, y_pred):
    """Return the micro-averaged precision of a multiclass prediction.

    True positives and false positives are counted one-vs-rest for every
    class present in ``y_true``, summed over the classes, and a single
    precision is computed from the totals.

    Args:
        y_true: sequence of true class labels.
        y_pred: sequence of predicted class labels.

    Returns:
        TP / (TP + FP) over all classes, or 0.0 when none of the classes in
        ``y_true`` was ever predicted.
    """
    TP = 0
    FP = 0
    # Iterate over the labels that actually occur, so the classes do not
    # have to be numbered 0..n-1.
    for class_ in np.unique(y_true):
        # one-vs-rest view: the current class is positive, all others negative
        temp_true = [1 if label == class_ else 0 for label in y_true]
        temp_pred = [1 if label == class_ else 0 for label in y_pred]
        TP += true_positive(temp_true, temp_pred)
        FP += false_positive(temp_true, temp_pred)
    if TP + FP == 0:
        # avoid ZeroDivisionError when there is nothing to average
        return 0.0
    return TP / (TP + FP)
        
def weighted_precision(y_true, y_pred):
    """Return the support-weighted precision of a multiclass prediction.

    Precision is computed one-vs-rest for every class present in ``y_true``
    and the per-class values are averaged, each weighted by the number of
    true samples of its class.

    Args:
        y_true: sequence of true class labels.
        y_pred: sequence of predicted class labels.

    Returns:
        The weighted mean of the per-class precisions.
    """
    class_count = Counter(y_true)
    total = 0
    # Iterate over the labels that actually occur, so the classes do not
    # have to be numbered 0..n-1.
    for class_ in np.unique(y_true):
        # one-vs-rest view: the current class is positive, all others negative
        temp_true = [1 if label == class_ else 0 for label in y_true]
        temp_pred = [1 if label == class_ else 0 for label in y_pred]
        total += class_count[class_] * precision(temp_true, temp_pred)
    return total / len(y_true)
    
    
# Three-class toy labels and predictions.
y_true = [0, 1, 2, 0, 1, 2, 0, 2, 2]
y_pred = [0, 2, 1, 0, 2, 1, 0, 0, 2]

# For every averaging mode, print the hand-written value and then scikit-learn's.
for average, precision_fn in (
    ('macro', macro_precision),
    ('micro', micro_precision),
    ('weighted', weighted_precision),
):
    print(precision_fn(y_true, y_pred))
    print(metrics.precision_score(y_true, y_pred, average=average))

# similarly we can also implement recall and f1-score as well

# let's just implement weighted f1

def weighted_f1(y_true, y_pred):
    """Return the support-weighted F1 score of a multiclass prediction.

    F1 is computed one-vs-rest for every class present in ``y_true`` and the
    per-class values are averaged, each weighted by the number of true
    samples of its class.

    Args:
        y_true: sequence of true class labels.
        y_pred: sequence of predicted class labels.

    Returns:
        The weighted mean of the per-class F1 scores.
    """
    class_count = Counter(y_true)
    f1_score = 0
    # Iterate over the labels that actually occur, so the classes do not
    # have to be numbered 0..n-1.
    for class_ in np.unique(y_true):
        # one-vs-rest view: the current class is positive, all others negative
        temp_true = [1 if label == class_ else 0 for label in y_true]
        temp_pred = [1 if label == class_ else 0 for label in y_pred]

        p = precision(temp_true, temp_pred)
        r = recall(temp_true, temp_pred)

        # the harmonic mean is undefined when both are zero; count it as 0
        temp_f1 = 0
        if p + r != 0:
            temp_f1 = 2 * p * r / (p + r)

        f1_score += class_count[class_] * temp_f1
    return f1_score / len(y_true)


print(weighted_f1(y_true, y_pred))
print(metrics.f1_score(y_true, y_pred, average='weighted'))

# same we can do with AUC and log-loss this conversion is known as one-vs-all 


In [None]:
# In binary or multiclass classification there is another popular thing known as Confusion Matrix
# FP is also known as type - I error and FN as type - II error

# we can also extend the binary confusion matrix to the multiclass case


![](http://i.ibb.co/ZX1BnTn/confused.png)

In [None]:
# A perfect confusion matrix should only be filled diagonally from left to right. 
# Note that confusion matrix that we have in scikit-learn is a transpose of what we have drawn here.

# Three-class toy labels and predictions.
y_true = [0, 1, 2, 0, 1, 2, 0, 2, 2]
y_pred = [0, 2, 1, 0, 2, 1, 0, 0, 2]
# Confusion matrix of the toy data, printed as raw numbers first.
cm = metrics.confusion_matrix(y_true , y_pred)
print(cm)
# Draw the same matrix as an annotated heatmap with a custom colour map
# and no colour bar.
plt.figure(figsize = (5,5))
cmap = sns.cubehelix_palette(n_colors = 50 , hue = 0.05 , rot = 0 , light = 0.9 , dark = 0 , as_cmap = True)
sns.set(font_scale = 2.5)
sns.heatmap(cm , annot = True , cmap = cmap , cbar = False)
# Axis titles used for the heatmap.
plt.xlabel('Prediction' ,fontsize=20)
plt.ylabel('Actual' , fontsize = 20)

In [None]:
# Multilabel classification : it's like one image can have multiple label associated with it

# Some of the metrics for multilabel classifications are :
# 1. Precision at k (P@k)
# 2. Average precision at k (AP@k)
# 3. Mean average precision at k (MAP@k)
# 4. log loss

# 1. Precision at k (P@k) :

# If you have a list of original classes for a given
# sample and list of predicted classes for the same, precision is defined as the number
# of hits in the predicted list considering only top-k predictions, divided by k.

def pk(y_true, y_pred, k):
    """Return precision at k (P@k) for a single sample.

    Args:
        y_true: the true classes of the sample.
        y_pred: the predicted classes of the sample; only the first ``k``
            entries are considered.
        k: number of leading predictions to evaluate.

    Returns:
        The number of distinct classes among the first ``k`` predictions
        that are also true classes, divided by the number of predictions
        considered. 0 when ``k`` is 0 or when there are no predictions.
    """
    if k == 0:
        return 0

    top_k = y_pred[:k]
    if len(top_k) == 0:
        # no prediction was made: nothing can be correct, and dividing by
        # the empty length would raise ZeroDivisionError
        return 0

    common_values = set(top_k).intersection(set(y_true))
    return len(common_values) / len(top_k)

# Now , Average precision at k can be calculated by takeing average of P@k

def apk(y_true, y_pred, k):
    """Return the average precision at k (AP@k) for a single sample.

    AP@k is the mean of P@1, P@2, ..., P@k; 0 is returned when that list is
    empty.
    """
    precisions = [pk(y_true, y_pred, cutoff) for cutoff in range(1, k + 1)]
    if not precisions:
        return 0
    return sum(precisions) / len(precisions)

# Multilabel toy data: entry i of y_true lists the true classes of sample i and
# entry i of y_pred lists the classes predicted for that sample.
# NOTE(review): pk() treats y_pred[:k] as the "top k" predictions, so each
# prediction list is presumably ordered best-first — confirm.
y_true = [ [1, 2, 3], [0, 2], [1], [2, 3], [1, 0], [] ]
y_pred = [ [0, 1, 2], [1], [0, 2, 3], [2, 3, 4, 0], [0, 1, 2], [0] ]

# for i in range(len(y_true)):
#     for j in range(1,4):
#         print(f"y_true[{i}] = {y_true[i]}\ny_pred[{i}] = {y_pred[i]}\nAP@{j} = {apk(y_true[i] , y_pred[i] , j)}")

# In ML , we're interested in all samples that's why we have MAP@k

def mapk(y_true, y_pred, k):
    """Return the mean average precision at k (MAP@k) over all samples.

    Args:
        y_true: sequence with, per sample, the list of true classes.
        y_pred: sequence with, per sample, the list of predicted classes.
        k: number of leading predictions evaluated for every sample.

    Returns:
        The mean of the per-sample AP@k values, or 0 when there are no
        samples.
    """
    apk_value = [apk(truth, predicted, k) for truth, predicted in zip(y_true, y_pred)]
    if not apk_value:
        # nothing to average; avoid ZeroDivisionError
        return 0
    return sum(apk_value) / len(apk_value)


for k in range(1, 5):
    print(f"map@{k} = {mapk(y_true , y_pred , k)}")
    
# Please note that sometimes you might see different implementations of P@k and AP@k on the internet. 

In [None]:
# Let's move forward with regression metrics
# 1. Mean absolute error(MAE) :

def mean_absolute_error(y_true, y_pred):
    """Return the mean absolute error (MAE).

    Args:
        y_true: sequence of true target values.
        y_pred: sequence of predicted values.

    Returns:
        The mean of ``|y_true - y_pred|`` over all samples.
    """
    error = 0
    for y_t, y_p in zip(y_true, y_pred):
        # The difference has to be taken before the absolute value: a second
        # positional argument to np.abs is its output array, not an operand.
        error += np.abs(y_t - y_p)
    return error / len(y_true)

# 2. Mean squared error(MSE) :

def mean_squared_error(y_true, y_pred):
    """Return the mean squared error (MSE).

    The squared differences between true and predicted values are summed and
    divided by the number of true values.
    """
    squared_sum = sum((actual - predicted) ** 2 for actual, predicted in zip(y_true, y_pred))
    return squared_sum / len(y_true)

# 3. Root mean squared error(RMSE) : most popular one  = sqrt(MSE)

# 4. Mean squared logarithmic error (MSLE) :

def mean_squared_logarithmic_error(y_true, y_pred):
    """Return the mean squared logarithmic error (MSLE).

    Args:
        y_true: sequence of true target values (each greater than -1).
        y_pred: sequence of predicted values (each greater than -1).

    Returns:
        The mean of ``(log(1 + y_true) - log(1 + y_pred)) ** 2``.
    """
    error = 0
    for y_t, y_p in zip(y_true, y_pred):
        # the whole difference of the logs is squared, not just the second log
        error += (np.log(1 + y_t) - np.log(1 + y_p)) ** 2
    return error / len(y_true)

# 5. Root mean squared logarithmic error(RMSLE) : sqrt(MSLE)

# 6. Mean percentage error :

def mean_percentage_error(y_true, y_pred):
    """Return the mean percentage error (signed).

    Every error ``y_true - y_pred`` is divided by its true value; the ratios
    are summed and divided by the number of true values.
    """
    ratio_sum = sum((actual - predicted) / actual for actual, predicted in zip(y_true, y_pred))
    return ratio_sum / len(y_true)

# 7. Mean absolute percentage error :

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE).

    Args:
        y_true: sequence of true target values (non-zero).
        y_pred: sequence of predicted values.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` over all samples.
    """
    error = 0
    for y_t, y_p in zip(y_true, y_pred):
        # Divide by the magnitude of the true value; dividing by the signed
        # value would make the "absolute" error negative for negative targets.
        error += np.abs(y_t - y_p) / np.abs(y_t)
    return error / len(y_true)

# 8. R^2 (R-squared / coefficient of determination): 

# In simple words, R-squared says how good your model fits the data. R-squared
# closer to 1.0 says that the model fits the data quite well, whereas closer 0 means
# that model isn’t that good. R-squared can also be negative when the model just
# makes absurd predictions.


![](http://i.ibb.co/xKZNx9D/rr.png)

In [None]:
def r2(y_true, y_pred):
    """Return R-squared, the coefficient of determination.

    Computed as one minus the ratio between the summed squared prediction
    errors and the summed squared deviations of the true values from their
    mean.
    """
    mean_true = np.mean(y_true)
    pairs = list(zip(y_true, y_pred))
    residual_ss = sum((actual - predicted) ** 2 for actual, predicted in pairs)
    total_ss = sum((actual - mean_true) ** 2 for actual, _ in pairs)
    return 1 - residual_ss / total_ss

# So we have implemented all the metrics in a straightforward manner: they are not very efficient, but they are easy to understand


In [None]:
# There are some advanced metrics available :

# 1. Quadratic weighted kappa (QWK / Cohen's kappa) :

# QWK measures the “agreement” between two “ratings”. The ratings can be any real numbers in 0 to N. And
# predictions are also in the same range. An agreement can be defined as how close
# these ratings are to each other. So, it’s suitable for a classification problem with N
# different categories/classes. If the agreement is high, the score is closer towards 1.0.
# In the case of low agreement, the score is close to 0

# Two sets of ratings on the same three-level scale.
y_true = [1, 2, 3, 1, 2, 3, 1, 2, 3]
y_pred = [2, 1, 3, 1, 2, 3, 3, 1, 2]

qwk_value = metrics.cohen_kappa_score(y_true, y_pred, weights='quadratic')
print(qwk_value)

# A QWK greater than 0.85 is considered to be very good!

# 2. Mathew's correlation coefficient (MCC) :

# MCC ranges from -1 to 1. 1 is a perfect prediction, -1 is a completely inverse prediction (every sample is classified wrongly), and 0 is a random prediction.

![](http://i.ibb.co/f1PQpYp/mathew.png)

In [None]:
def mcc(y_true, y_pred):
    """Return Matthews correlation coefficient (MCC) for binary labels.

    Args:
        y_true: iterable of true labels (0 or 1).
        y_pred: iterable of predicted labels (0 or 1).

    Returns:
        (TP*TN - FP*FN) / sqrt((TP+FP)(FN+TN)(FP+TN)(TP+FN)), or 0.0 when
        the denominator is zero.
    """
    TP = true_positive(y_true, y_pred)
    FP = false_positive(y_true, y_pred)
    TN = true_negative(y_true, y_pred)
    FN = false_negative(y_true, y_pred)

    numerator = TP * TN - FP * FN
    denominator = ((TP + FP) * (FN + TN) * (FP + TN) * (TP + FN)) ** 0.5

    if denominator == 0:
        # One class is missing from the labels or from the predictions, so
        # the coefficient is undefined; report 0.0 instead of raising
        # ZeroDivisionError.
        return 0.0

    return numerator / denominator

#  **APPROACHING CATEGORICAL VARIABLE**

In [None]:
# Categorical variables come in two major types :
# Nominal : no order is associated with the categories, e.g. gender
# Ordinal : the categories have an order, e.g. levels such as low, medium, high
# Two further kinds are worth distinguishing :
# Binary : only two categories
# Cyclic : categories that wrap around, e.g. days of the week (sunday, monday, ..., sunday, ...)

In [None]:
# if __name__ == '__main__':
# Load the "cat in the dat II" training data and preview its first rows.
cat_csv_path = '/kaggle/input/cat-in-the-dat-ii/train.csv'
df = pd.read_csv(cat_csv_path)
df.head()

In [None]:
# Bar chart of how often each target value occurs.
sns.countplot(x=df['target'])

# The plot shows that the data is heavily skewed, so we'll use AUC as the metric.

# Distinct values of the ord_2 column.
df['ord_2'].unique()

In [None]:
# now we know that ord_2 column has six different values and we know computer can not understand text so we need to convert it into numeric
# so one idea is to map it with 0,1,2...
def label(df):
    """Label-encode the ``ord_2`` column of ``df`` in place.

    Each known category is replaced by a fixed integer code; the column of
    the passed frame is overwritten and nothing is returned.
    """
    categories = ('Hot', 'Warm', 'Freezing', 'Lava Hot', 'Cold', 'Boiling Hot')
    codes = {name: code for code, name in enumerate(categories)}

    df.loc[:, 'ord_2'] = df['ord_2'].map(codes)

# this type of encoding known as label encoding , same thing we can do with sklearn

In [None]:

def mapping(df):
    """Label-encode the ``ord_2`` column of ``df`` in place with scikit-learn.

    Missing values are first replaced by the string "NONE", so that they are
    encoded as a category of their own. Nothing is returned.
    """
    # fill NaN with NONE
    df.loc[:, 'ord_2'] = df['ord_2'].fillna("NONE")

    encoder = preprocessing.LabelEncoder()

    # P.S: do not use fit_transform like this in practice; fit the encoder
    # first and then transform.
    encoded = encoder.fit_transform(df['ord_2'].values)
    df.loc[:, 'ord_2'] = encoded


In [None]:
# we can use this type of encoding in many tree based algo. e.g. decision tree , random forest , extra trees , any boosted tree model
# but it's not used in linear model , SVM , neural nets as they expect data to be normalized
# for this type of model we can binarize the data
# e.g. freezing : 0 0 1
#      warm     : 0 1 0
#      cold     : 0 1 1 ...

# It becomes easy to store lots of binarized variables like this if we store them in a
# sparse format. A sparse format is nothing but a representation or way of storing
# data in memory in which you do not store all the values but only the values that
# matter. In the case of binary variables described above, all that matters is where we have ones (1s). 

# if we use simple binary things then we're using more space, e.g.

# A small dense binary matrix.
data = np.array([[0, 0, 1], [1, 0, 0], [1, 0, 1]])

# dense size: 3 * 3 elements * 8 bytes
print(data.nbytes)

# Another way is to store only the positions of the 1s,
# e.g. (0,2) , (1,0) , (2,0) , (2,2), which needs just 4 * 8 bytes for the values.
# scipy offers this as a sparse matrix:
from scipy import sparse

sparse_data = sparse.csr_matrix(data)
value_bytes = sparse_data.data.nbytes
print(value_bytes)

# This prints 32, far less than the dense array.
# The total size of the sparse matrix also includes its index arrays:
index_bytes = sparse_data.indptr.nbytes + sparse_data.indices.nbytes
print(value_bytes + index_bytes)
# That is 64, still less than the dense array. The difference becomes vast as the data grows,
# which is why we prefer sparse arrays over dense arrays.
# which is 64 but it's still less then dense array , this difference become vast when data becomes larger that's why we'll prefer sparse array over dense array

In [None]:
# There is another transformation which takes even less memory than the binarized sparse matrix , and that is one-hot encoding :
# every row holds exactly one 1 , so only one value per row has to be stored
# just look at the image below

data = np.array([
    [0,0,0,0,1,0],
    [0,1,0,0,0,0],
    [1,0,0,0,0,0]
])

sparse_d = sparse.csr_matrix(data)
print(sparse_d.data.nbytes + sparse_d.indptr.nbytes + sparse_d.indices.nbytes) # this is just 52 (with 8-byte values and 4-byte indices)

# let's implement it on big data
data = np.random.randint(1000,size=1000000)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name , so use whichever keyword this installation accepts
import inspect
sparse_kw = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(preprocessing.OneHotEncoder).parameters
    else 'sparse'
)

# dense output : one full row of cells per sample (this allocates several GB for 1,000,000 rows)
one_hot = preprocessing.OneHotEncoder(**{sparse_kw : False})
oh_data = one_hot.fit_transform(data.reshape(-1,1))
print(f"size of dense array : {oh_data.nbytes}")

# sparse output : only the position of the single 1 in every row is stored
one_hot = preprocessing.OneHotEncoder(**{sparse_kw : True})
oh_data = one_hot.fit_transform(data.reshape(-1,1))
print(f"size of full sparse array : {oh_data.data.nbytes + oh_data.indptr.nbytes + oh_data.indices.nbytes}")

![](http://i.ibb.co/2YdTSDL/one-hot.png)

In [None]:
# the 3 methods above are the most important ways to handle categorical data
# some other methods exist as well , like converting the data into numerical values , but that doesn't always make sense
# df[df['ord_2'] == 'Lava Hot'].shape

# what we can do is replace a category with its count
# number of rows (non-null ids) per ord_2 category
df.groupby(['ord_2'])['id'].count()

# transform('count') gives one value per row (the count of that row's group) , so the result can be stored as a column
df.groupby(['ord_2'])['id'].transform('count') # we can also group by 2 columns and count the combinations

# another trick is to combine 2 columns and create a new feature

df['new_featue'] = (df['ord_1'].astype(str) + '_' + df['ord_2'].astype(str))

df.new_featue
# Note that NaN is also converted to a string , so we can count it as just another category


In [None]:
# so in summary , whenever we get categorical data 
# 1. fill NaN values (it's important)
# 2. convert them into labels by LabelEncoder or may be using mapping
# 3. use one-hot if needed
# 4. now go for modelling..

In [None]:
# Handling NaN values
# 1. simply drop the rows that contain NaN (it's simple but not ideal)
# 2. Another way is to treat NaN as a new category (this is the most preferred way) :
# df.ord_2.unique()
# count the rows per ord_2 category , with the missing values counted under "NONE"
df.ord_2.fillna("NONE").value_counts()

In [None]:
# Rare category :
# category which appear as very less percentage of total number of sample
# Now let's assume that we deployed our model and we get category that's not present in our training set ,
# so in this case our model will throw an error

# so to handle this , 
# Let's say we have features f1,f2,f3 and f4 and we know that f3 can contain rare categories ; then we can train a simple model on all features except f3 , with f3 as its target
# Thus, you will be creating a model that predicts “f3” when it’s not known or not available in training

# If you have a fixed test set, you can add your test data to training to know about the
# categories in a given feature. This is very similar to semi-supervised learning in
# which you use data which is not available for training to improve your model. This
# will also take care of rare values that appear very less number of times in training
# data but are in abundance in test data. Your model will be more robust. 

# another way is that if we have NONE in training set and when we test the model and we get some unknown then we'll convert it into NONE..

# df.ord_4.value_counts()
# missing values become their own "NONE" category
df['ord_4'] = df['ord_4'].fillna('NONE')
# categories that occur fewer than 2000 times (J and L here) are too rare to keep on their own ,
# so they are merged into a single RARE category

# look up , for every row , how often that row's category occurs and relabel the infrequent rows
df.loc[df['ord_4'].map(df['ord_4'].value_counts()) < 2000, 'ord_4'] = 'RARE'
df['ord_4'].value_counts()

# We say that wherever the value count for a certain category is less than 2000,
# replace it with rare. So, now, when it comes to test data, all the new, unseen
# categories will be mapped to “RARE”, and all missing values will be mapped to “NONE”.

In [None]:
# so now we're done with categorical data , let's train model now..
# so here is one notebook for categorical data : https://www.kaggle.com/karad1818/categorical-data-encoding-aaamlp

# **FEATURE ENGINEERING**

In [None]:
# We must keep in mind that feature engineering is something that is done in the best possible manner only when you
# have some knowledge about the domain of the problem and depends a lot on the data in concern.

# Feature engineering is not just about creating new features from data but also includes different types of normalization and transformations.

# if we have data with a date time column , for e.g.
# (the 10 hour step is passed as a Timedelta because the 'H' frequency alias is deprecated in recent pandas)
s = pd.date_range('2020-01-06' , '2020-01-10' , freq = pd.Timedelta(hours = 10)).to_series()
# print(s)
# create feature :
feature = {
    'dayofweek' : s.dt.dayofweek.values,
    'dayofyear': s.dt.dayofyear.values,
    'month' : s.dt.month.values,
    'hour': s.dt.hour.values,
    'is_leap_year': s.dt.is_leap_year.values,
    'quarter': s.dt.quarter.values,
    # Series.dt.weekofyear was removed in pandas 2.0 ; isocalendar().week gives the same ISO week number ,
    # cast back to a plain integer array so the column looks like the other ones
    'weekofyear': s.dt.isocalendar().week.astype('int64').values
}

# build the frame straight from the dict : one column per feature , in the order listed above
df = pd.DataFrame(feature)
df

In [None]:
# How to use aggregation :

# let's say we want to calculate in which months a customer is most active , or the mean of some value for a particular customer
idx = [1,1,1,2,2,3,3,3,3,4]
month = [5,6,6,5,6,7,6,7,8,9]
num1 = [2,2,3,2,4,5,6,7,8,5]

data = pd.DataFrame({
    'id' : idx,
    'month' : month,
    'num1' : num1
})

# statistics to compute per customer id , listed column by column
# 'size' is the number of rows of each id ; 'nunique' of the grouping key itself
# would always be 1 , so it is not requested
aggs = {
    'month' : ['nunique' , 'mean'],
    'num1' : ['sum' , 'max' , 'mean'],
    'id' : ['size'],
}

agg_data = data.groupby('id').agg(aggs)
# turn the id index back into a regular column
agg_data = agg_data.reset_index()

agg_data # we can also merge this data frame with the original one and use it for modelling....

In [None]:
# a handful of statistical features computed from one small sample :
data = np.array([1,2,3,3,3,4,4,5,5,6,6,6,7,7,7,7])

# location and spread
feature_dict = {
    'mean' : data.mean(),
    'max' : data.max(),
    'min' : data.min(),
    'std' : data.std(),
    'var' : data.var(),
    'peak-to-peak' : np.ptp(data),
}
# percentiles take a rank between 0 and 100
feature_dict.update({f'percentile_{rank}' : np.percentile(data , rank) for rank in (10 , 60)})
# quantiles express the same idea as a fraction between 0 and 1
feature_dict.update({f'quantile_{rank}' : np.quantile(data , rank / 100) for rank in (5 , 95 , 99)})
print(feature_dict)

In [None]:
# a time series can be converted into a lot more features ;
# the tsfresh feature calculators (available here as fc) are instrumental for that

# call each calculator on the sample and store the result under the calculator's own name
feature_dict.update({
    name : getattr(fc , name)(data)
    for name in (
        'abs_energy',
        'count_above_mean',
        'count_below_mean',
        'mean_abs_change',
        'mean_change',
    )
})
print(feature_dict)

# that's not it.. there is more...

In [None]:
# polynomial basis transformation : generally used in polynomial regression
# start from 100 random rows with two features
data = pd.DataFrame(np.random.rand(100,2))
data.columns = [f"f_{x}" for x in range(data.shape[1])]

p_data = preprocessing.PolynomialFeatures(
    degree = 2,
    interaction_only = False, # True would keep only products of distinct features ( x*y , x*y*z ) and drop powers such as x2 or x2*y
    include_bias = False # True would add a column made of 1s only
)
# learn the feature combinations and expand the data in a single step
poly_feature = p_data.fit_transform(data)

# wrap the expanded array in a frame again , numbering the new columns
num_feature = poly_feature.shape[1]
data_poly = pd.DataFrame(poly_feature)
data_poly.columns = [f"f_{x}" for x in range(num_feature)]
data_poly

In [None]:
# Another very famous feature conversion is : binning
# in this approach we divide the values into a number of parts (bins) ;
# here pd.cut splits f_0 into 10 bins and labels = False stores the integer bin number instead of the interval
data['bin_10'] = pd.cut(data['f_0'] , bins=10 , labels = False)
data

# Binning also enables you to treat numerical features as categorical.

In [None]:
# we can also apply any kind of functional transformation on data
data['f_0'] = data['f_0'] * 1000

print(f"var : {np.var(data.f_0)}") # so it has a very high variance

# to decrease the variance we apply a log transformation ;
# np.log1p computes log(1 + x) for the whole column at once instead of calling a Python lambda per row
data['f_0'] = np.log1p(data['f_0'])

print(f"var : {np.var(data.f_0)}")

In [None]:
# the simplest way to fill missing values in a numerical feature is to use the mean

# A fancier way of filling in the missing values is a k-nearest neighbour method : take a sample with missing values and find its nearest
# neighbours using some kind of distance metric , for example Euclidean distance ; then fill the missing value with the mean over those neighbours.

X = np.random.randint(1,15,(10,6)).astype(float)

# randomly pick 10 distinct cells and blank them out
X.flat[np.random.choice(X.size , 10 , replace = False)] = np.nan

# fit the imputer and fill the gaps in one step , using the 2 nearest rows
knn_imputer = impute.KNNImputer(n_neighbors = 2)
X = knn_imputer.fit_transform(X)
X

In [None]:
# Another way of imputing missing values in a column would be to train a regression
# model that tries to predict missing values in a column based on other columns.

# Always remember that imputing values for tree-based models is unnecessary as they
# can handle it themselves.

# And always remember to scale or normalize your
# features if you are using linear models like logistic regression or a model like SVM.
# Tree-based models will always work fine without any normalization of features.

# **This whole notebook is based on Abhishek Thakur's AAAMLP (Approaching (Almost) Any Machine Learning Problem) ; here is a [link](https://github.com/abhishekkrthakur/approachingalmost/blob/master/AAAMLP.pdf) , I think you should look at it.**
# **This Notebook contains Half of the AAAMLP (up to 154 pages).** 
# **Thanks for reading and forking..**