# jupyterthemes

In [1]:
from jupyterthemes import jtplot
# jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)

# Classification Model Evaluation

In [4]:
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN) = True Positive Rate(TPR)
# False Positive Rate(FPR) = FP / (FP + TN)
# f1-score = 2 * (Precision * recall) / (precision + recall)

from sklearn import metrics

In [None]:
# accuracy score
metrics.accuracy_score(Y_test, pred)

In [None]:
# confusion matrix

### convert confusion matrix to heatmap
def confusion_heatmap(Y_test, pred, name_mapping=None):
    """Draw the confusion matrix of ``pred`` vs ``Y_test`` as an annotated heatmap.

    Parameters
    ----------
    Y_test : object exposing ``.unique()`` (e.g. a pandas Series)
        True labels. The classes shown, and their order, come from
        ``Y_test.unique()``.
    pred : array-like
        Predicted labels.
    name_mapping : dict, optional
        Maps each label to the text used on the axis ticks. When omitted (or
        empty) the raw labels are used.
    """
    class_labels = Y_test.unique()
    matrix = metrics.confusion_matrix(Y_test, pred, labels=class_labels)

    # Friendly names when a mapping is supplied, raw labels otherwise.
    if name_mapping:
        tick_names = [name_mapping[label] for label in class_labels]
    else:
        tick_names = class_labels

    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=tick_names,
        yticklabels=tick_names,
    )
    plt.ylabel("True label", fontsize=12)
    plt.xlabel("Predicted label", fontsize=12)
    plt.show()
# name_mapping = {0:"No", 1:"Yes"}
# confusion_heatmap(Y_test, pred, name_mapping)

In [6]:
# classification report
def clf_report(Y_test, pred):
    """Return sklearn's classification report as a styled DataFrame.

    Rows are the classes plus the summary rows produced by
    ``metrics.classification_report(..., output_dict=True)``; columns are the
    report's metrics. Whole numbers are rendered as integers and everything
    else with two decimals.

    Parameters
    ----------
    Y_test : array-like
        True labels.
    pred : array-like
        Predicted labels.

    Returns
    -------
    pandas Styler wrapping the transposed report.
    """
    report = pd.DataFrame(
        metrics.classification_report(Y_test, pred, output_dict=True)
    ).T

    # In the dict output "accuracy" is a single float, so every cell of that
    # row (including "support") holds the accuracy value. Replace the support
    # cell with the total sample count, which is the support of "macro avg".
    # A single .loc call is used because chained indexing
    # (df["support"]["accuracy"] = ...) writes to a temporary under pandas
    # Copy-on-Write and leaves the frame unchanged.
    if "accuracy" in report.index and "macro avg" in report.index:
        report.loc["accuracy", "support"] = report.loc["macro avg", "support"]

    def _fmt(value):
        return int(value) if value % 1 == 0 else "{:.2f}".format(value)

    return report.style.format(_fmt)
# clf_report(Y_test, pred)

In [8]:
# AUC/ROC score of `pred` against `Y_test`.
# NOTE(review): `pred` is passed as the score argument; if it holds hard class
# labels rather than probabilities / decision scores, the ROC curve has a
# single operating point — confirm which one is intended.
metrics.roc_auc_score(Y_test, pred)

# AUC/ROC Curve
def auc_roc_curve(Y_test, pred):
    """Plot the ROC curve of ``pred`` against ``Y_test`` with its AUC in the legend.

    Parameters
    ----------
    Y_test : array-like
        True binary labels.
    pred : array-like
        Scores (or predicted labels) forwarded to ``metrics.roc_curve`` and
        ``metrics.roc_auc_score``.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = metrics.roc_curve(Y_test, pred)
    label = "ROC score (area = {:.3f})".format(metrics.roc_auc_score(Y_test, pred))

    plt.title('AUC/ROC Curve')
    plt.plot(fpr, tpr, 'b', label=label)
    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# auc_roc_curve(Y_test, pred)

In [10]:
# Log-loss of `pred` against `Y_test`.
# NOTE(review): sklearn documents the second argument of log_loss as predicted
# probabilities (e.g. the output of predict_proba); `pred` is used as class
# labels elsewhere in this file — confirm that probabilities are passed here.
metrics.log_loss(Y_test, pred)

In [11]:
### classification result dataframe
def clf_dataframe(knn, X_test, Y_test, pred):
    """Collect actual labels, predictions and class probabilities in one frame.

    Parameters
    ----------
    knn : fitted classifier exposing ``predict_proba``.
    X_test : DataFrame of test features; its index labels the result rows.
    Y_test : Series of true labels.
    pred : sequence of predicted labels, one per row of ``X_test``.

    Returns
    -------
    DataFrame with columns ``actual``, ``prediction``, ``0`` and ``1``
    (the two probability columns returned by ``predict_proba``).
    """
    row_labels = X_test.index
    probabilities = pd.DataFrame(knn.predict_proba(X_test), index=row_labels)
    predicted = pd.Series(pred, index=row_labels)

    combined = pd.concat([Y_test, predicted, probabilities], axis=1)
    return combined.set_axis(["actual", "prediction", "0", "1"], axis=1)
# dataframe = clf_dataframe(knn, X_test, Y_test, pred)
#>>> dataframe[(dataframe["actual"] == 1) & (dataframe["prediction"] == 0)]


### classification result dataframe with "new_prediction" column
def clf_dataframe(knn, X_test, Y_test, pred, threshold=0.6):
    """Collect actual labels, predictions, probabilities and a re-thresholded label.

    Same layout as the plain result frame (``actual``, ``prediction``, ``0``,
    ``1``) plus a ``new_prediction`` column obtained by applying a custom
    cut-off to the class-0 probability.

    Parameters
    ----------
    knn : fitted classifier exposing ``predict_proba``.
    X_test : DataFrame of test features; its index labels the result rows.
    Y_test : Series of true labels.
    pred : sequence of predicted labels, one per row of ``X_test``.
    threshold : float, default 0.6
        A row is labelled 0 when its class-0 probability is ``>= threshold``
        and 1 otherwise.

    Returns
    -------
    DataFrame with columns ``actual``, ``prediction``, ``0``, ``1`` and
    ``new_prediction``.
    """
    pred_prob = pd.DataFrame(knn.predict_proba(X_test), index=X_test.index)
    pred = pd.Series(pred, index=X_test.index)

    predictions = pd.concat([Y_test, pred, pred_prob], axis=1)
    predictions.columns = ["actual", "prediction", "0", "1"]
    # Negating ">=" (rather than using "<") keeps rows with a missing
    # probability labelled 1, exactly like the element-wise rule did.
    predictions["new_prediction"] = (~(predictions["0"] >= threshold)).astype(int)
    return predictions
# dataframe = clf_dataframe(knn, X_test, Y_test, pred)
#>>> dataframe[(dataframe["actual"] == 1) & (dataframe["new_prediction"] == 0)]

# Knn

## Important

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

### Computes the mean cross-validated score of each distance metric for each value of k
def dist_metrics(metrics=None, ks=None):
    """Mean 4-fold cross-validation score of KNN per distance metric and k.

    The data is not passed in: ``X`` and ``Y`` are read from the enclosing
    (notebook/global) scope.

    Parameters
    ----------
    metrics : list of str, optional
        Distance metric names handed to ``KNeighborsClassifier``. Falls back
        to a built-in list when omitted or empty.
    ks : list of int, optional
        Neighbour counts to evaluate. Falls back to the odd numbers 5..79
        when omitted or empty.

    Returns
    -------
    DataFrame with a ``k`` column and one column of mean scores per metric.
    """
    if not metrics:
        metrics = ['canberra', 'braycurtis', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'minkowski', 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', 'sqeuclidean']
    if not ks:
        ks = list(range(5, 80, 2))

    table = {"k": ks}
    for metric_name in tqdm(metrics):
        table[metric_name] = [
            cross_val_score(KNeighborsClassifier(k, metric=metric_name), X, Y, cv=4).mean()
            for k in ks
        ]
    return pd.DataFrame(table)
# final = dist_metrics()  # X and Y are read from the notebook's global scope; the arguments are (metrics, ks)

#>>> final.mean(axis=0)


### Shows a line graph comparing the scores of the different metrics
# across the k values
def show_metrics(final):
    """Plot one score line per distance metric against the evaluated k values.

    Parameters
    ----------
    final : DataFrame
        Must contain a ``k`` column; every other column is drawn as one line
        labelled with the column name.
    """
    # Use the frame's own k column as the x axis so the plot works for any
    # number of k values, not only the default grid of 38.
    k_values = final["k"]
    for metric_name in final.columns.drop("k"):
        plt.plot(k_values, final[metric_name], label=metric_name)

    plt.legend(bbox_to_anchor=(1, 1), fancybox=True, shadow=True)
    plt.show()
# show_metrics(final)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

### Finds the best value of k (from the given list of k values) for each random state
# (from the given list of random states)
def random_state_wise_k(X, Y, random_state=None, ks=None, metric="canberra"):
    """Find the best-scoring k for every train/test split seed.

    For each seed the data is split with ``test_size=1/3``, a KNN classifier
    is fitted for every k, and the k with the highest test score is kept
    (the first one on ties).

    Parameters
    ----------
    X, Y : features and labels accepted by ``train_test_split``.
    random_state : list of int, optional
        Seeds to try. Defaults to 0..99 when omitted or empty.
    ks : list of int, optional
        Neighbour counts to try. Defaults to the odd numbers 5..79 when
        omitted or empty.
    metric : str, default "canberra"
        Distance metric handed to ``KNeighborsClassifier``.

    Returns
    -------
    DataFrame with columns ``random_state``, ``k`` and ``score``, sorted by
    ``score`` in descending order.
    """
    if not random_state:
        random_state = list(range(100))
    # Only fall back to the default grid when the caller gave none; the
    # argument used to be overwritten unconditionally.
    if not ks:
        ks = list(range(5, 80, 2))

    results = {"random_state": random_state, "k": [], "score": []}

    for seed in tqdm(random_state):
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=1/3, random_state=seed
        )
        scores = []
        for k in ks:
            knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
            knn.fit(X_train, Y_train)
            scores.append(knn.score(X_test, Y_test))
        best_score = max(scores)
        results["k"].append(ks[scores.index(best_score)])
        results["score"].append(best_score)

    return pd.DataFrame(results).sort_values(by="score", ascending=False)
# final = random_state_wise_k(X, Y)

#>>> final.k.value_counts()

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

### Calculates the test score of every given k value for each of 100 random states
def k_wise_scores(X, Y, ks=None, metric="canberra"):
    """Score KNN for every k on 100 different train/test splits.

    Parameters
    ----------
    X, Y : features and labels accepted by ``train_test_split``.
    ks : list of int, optional
        Neighbour counts to evaluate. Defaults to the odd numbers 5..79 when
        omitted or empty.
    metric : str, default "canberra"
        Distance metric handed to ``KNeighborsClassifier``.

    Returns
    -------
    DataFrame with one column per k and one row per split seed (0..99),
    holding the test-set score.
    """
    if not ks:
        ks = list(range(5, 80, 2))

    scores_by_k = {k: [] for k in ks}

    for seed in tqdm(range(100)):
        split = train_test_split(X, Y, test_size=1/3, random_state=seed)
        X_train, X_test, Y_train, Y_test = split
        for k in ks:
            model = KNeighborsClassifier(n_neighbors=k, metric=metric)
            model.fit(X_train, Y_train)
            scores_by_k[k].append(model.score(X_test, Y_test))

    return pd.DataFrame(scores_by_k)
# final = k_wise_scores(X, Y)


### Extracts, for each k column, the best score and the row (random state) where it occurs
def best_k_randomstate_match(final):
    """Summarise the best score of every k column.

    Parameters
    ----------
    final : DataFrame
        One column per k, one row per train/test split.

    Returns
    -------
    DataFrame with columns ``score`` (column maximum), ``k`` (column label)
    and ``random_state`` (row label of the maximum), sorted by ``score`` in
    descending order. The row label equals the split seed only when the rows
    are indexed by seed.
    """
    best_match = {"score": [], "k": [], "random_state": []}

    for k in final:
        column = final[k]
        best_match["score"].append(column.max())
        best_match["k"].append(k)
        best_match["random_state"].append(column.idxmax())

    # Build the result from the dict filled above (the previous code referenced
    # an undefined name here).
    return pd.DataFrame(best_match).sort_values(by="score", ascending=False)
# best_match = best_k_randomstate_match(final)

#>>> best_match.random_state.value_counts()

## Outlier Detection

In [None]:
from sklearn.neighbors import LocalOutlierFactor

### Flags every point as an inlier or an outlier
# 1 means inlier and -1 means outlier
def detect_outliers(X, k=20):
    """Label every row of ``X`` with LocalOutlierFactor.

    Parameters
    ----------
    X : DataFrame
        Feature matrix; its index is reused for the result.
    k : int, default 20
        Number of neighbours used by ``LocalOutlierFactor``.

    Returns
    -------
    Series aligned with ``X.index`` holding the ``fit_predict`` labels.
    """
    # Pass k through (it used to be ignored in favour of a literal 20).
    # fit_predict fits the model itself, so no separate fit call is needed.
    lof = LocalOutlierFactor(n_neighbors=k)
    return pd.Series(lof.fit_predict(X), index=X.index)
# inout_liers = detect_outliers(X)

#>>> inout_liers.value_counts()

#>>> inliers = inout_liers[inout_liers == 1]
#>>> outliers = inout_liers[inout_liers == -1]
#>>> data = data.loc[inliers.index]

## user defined metrics

In [None]:
def calculate(X, Y):
    """Return the sum of the sixth powers of the element-wise differences ``X - Y``."""
    delta = np.subtract(X, Y)
    return np.sum(delta ** 6)

#>>> knn = KNeighborsClassifier(n_neighbors=13, metric=calculate)

# Oversampling and Undersampling

In [12]:
from sklearn.utils import resample

def data_sampling(X_train, Y_train, sampling_type=None):
    """Balance a binary (0/1) training set by over- or undersampling.

    The code treats class 1 as the minority class:

    * ``"over"``  — class 1 rows are drawn with replacement until they match
      the number of class 0 rows.
    * ``"under"`` — class 0 rows are drawn down to the number of class 1 rows.

    Parameters
    ----------
    X_train : DataFrame of training features.
    Y_train : Series of 0/1 labels aligned with ``X_train``.
    sampling_type : {"over", "under"}
        Which strategy to apply.

    Returns
    -------
    (X_train, Y_train) : the resampled features and labels.

    Raises
    ------
    ValueError
        If ``sampling_type`` is neither ``"over"`` nor ``"under"``.
    """
    # Fail early with a clear message; the fall-through branch used to print
    # a hint and then crash on an unbound variable.
    if sampling_type not in ("over", "under"):
        raise ValueError(
            "sampling_type must be 'over' or 'under', got {!r}".format(sampling_type)
        )

    XY_train = pd.concat([X_train, Y_train], axis=1)
    # The label is the last column after the concat, whatever its name is.
    labels = XY_train.iloc[:, -1]
    class_0 = XY_train[labels == 0]
    class_1 = XY_train[labels == 1]

    if sampling_type == "over":
        class_1 = resample(
            class_1, n_samples=class_0.shape[0], replace=True, random_state=0
        )
    else:
        n_target = class_1.shape[0]
        # Undersample without replacement so no row is duplicated; drawing
        # with replacement is only needed if class 0 is actually the smaller
        # class and more rows are requested than exist.
        class_0 = resample(
            class_0,
            n_samples=n_target,
            replace=n_target > class_0.shape[0],
            random_state=0,
        )

    combined = pd.concat([class_0, class_1])
    return combined.iloc[:, :-1], combined.iloc[:, -1]
# X_train, Y_train = data_sampling(X_train, Y_train, sampling_type="over")

# Matplotlib

## Styles available

In [1]:
import matplotlib.pyplot as plt
plt.style.available
['Solarize_Light2',
 '_classic_test_patch',
 'bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark',
 'seaborn-dark-palette',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'tableau-colorblind10'];

## Half Heatmap

In [None]:
# Correlation heatmap that hides the upper triangle and the diagonal, so each
# pair of columns is shown once.
corr = data.corr()
# Non-zero mask cells are hidden by seaborn: ones on and above the diagonal.
mask = np.triu(np.ones_like(corr))

fig = plt.gcf()
fig.set_size_inches(8, 5)
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    cmap="coolwarm",
    center=0,
    vmin=-1,
    vmax=1,
)

## Bar chart

In [1]:
def bar_chart(data):
    """Show a bar chart of the value counts of ``data`` with the count above each bar.

    Parameters
    ----------
    data : Series
        Values to count. The y axis runs from 0 to the number of rows.
    """
    counts = data.value_counts()
    # One random RGB colour per distinct value.
    bar_colors = np.random.rand(len(counts), 3)

    plt.gcf().set_size_inches(3, 4)

    axes = counts.plot(kind="bar", fontsize=13, color=bar_colors)
    plt.ylim(0, data.shape[0])
    plt.xticks(rotation=0, fontsize=15)

    # Write each bar's height at its top-left corner.
    for bar in axes.patches:
        height = bar.get_height()
        axes.annotate(str(height), (bar.get_x(), height), size=15)
    plt.show()

# Multiline Output

In [1]:
def MultiResults(flag=True):
    """Switch IPython between showing every expression result and only the last.

    Sets ``InteractiveShell.ast_node_interactivity`` to ``"all"`` when
    ``flag == True`` and to ``"last_expr"`` otherwise.
    """
    from IPython.core.interactiveshell import InteractiveShell

    # Equality (not truthiness) is kept on purpose: only values equal to True
    # enable the "all" mode.
    if flag == True:  # noqa: E712
        mode = "all"
    else:
        mode = "last_expr"
    InteractiveShell.ast_node_interactivity = mode
MultiResults(True)

# Confirm Sound 

In [7]:
def complete_sound(freq=800, duration=500, repeats=5):
    """Play a series of beeps to signal that a long-running cell has finished.

    Uses the standard-library ``winsound`` module, which is only available
    on Windows.

    Parameters
    ----------
    freq : int, default 800
        Tone frequency in Hz.
    duration : int, default 500
        Length of each beep in milliseconds.
    repeats : int, default 5
        Number of beeps to play.
    """
    import winsound

    for _ in range(repeats):
        winsound.Beep(freq, duration)

# Feature Correlation

In [4]:
def feature_correlation(corr):
    """List every pair of distinct features once with its correlation.

    Parameters
    ----------
    corr : DataFrame
        Square correlation matrix whose index and columns carry the same
        feature names (e.g. the result of ``DataFrame.corr()``).

    Returns
    -------
    DataFrame with columns ``column``, ``row`` and ``value``: one row per pair
    taken from above the diagonal, sorted by ``value`` in descending order.
    """
    names = list(corr.columns)

    # Collect all pairs first and build the frame once; growing a DataFrame
    # row by row with .loc is quadratic and produces object-typed columns.
    records = []
    for position, first in enumerate(names):
        for second in names[position + 1:]:
            records.append((first, second, corr.loc[first, second]))

    pairs = pd.DataFrame(records, columns=["column", "row", "value"])
    return pairs.sort_values(by="value", ascending=False)

# Missing Data 

## Show missing number

In [1]:
# Returns a DataFrame indexed by column name with the number of missing values
# (the column header includes the total row count) and the percentage missing.
# The returned DataFrame is sorted in descending order of percentage missing.
def missing_data(data):
    """Summarise missing values per column.

    Parameters
    ----------
    data : DataFrame to inspect.

    Returns
    -------
    DataFrame indexed by column name with the count of nulls (header
    ``"Missing (<number of rows>)"``) and ``Missing_prob``, the percentage of
    nulls rounded to two decimals, sorted by ``Missing_prob`` descending.
    """
    n_rows = len(data)
    null_counts = data.isnull().sum()

    summary = null_counts.to_frame(name="Missing (" + str(data.shape[0]) + ")")
    summary["Missing_prob"] = (null_counts / n_rows * 100).round(2)
    return summary.sort_values(by="Missing_prob", ascending=False)

## Show missing rows

In [7]:
null_data = data[data.isnull().any(axis=1)]

## Show total missing number

In [8]:
data.isnull().any(axis = 1).sum()

# Check value %

## check how much any value present in data

In [2]:
def check_perc(X, vals=None):
    """Report how often selected values occur in ``X``.

    Parameters
    ----------
    X : Series to inspect.
    vals : list, optional
        Values to report on. When omitted or empty, every distinct value
        returned by ``X.value_counts()`` is reported.

    Returns
    -------
    DataFrame indexed by value with ``Percent`` (share of all rows, rounded
    to two decimals) and ``Total`` (occurrence count). Requested values that
    never occur keep NaN in both columns.
    """
    counts = dict(X.value_counts())
    n_rows = X.shape[0]
    if not vals:
        vals = counts.keys()

    percent = pd.Series(index=vals, name="Percent", dtype=float)
    total = pd.Series(index=vals, name="Total", dtype=float)
    for val in vals:
        if val not in counts:
            continue  # leave NaN for values absent from X
        occurrences = counts[val]
        percent[val] = round(occurrences / n_rows * 100, 2)
        total[val] = occurrences
    return pd.concat([percent, total], axis=1)

# Pass the values of interest as a list via `vals`, e.g. check_perc(data["Outcome"], vals=[0, 1])

# Imputation

In [None]:
# Class-wise imputation: missing values are filled separately for the rows
# with Outcome == 1 and Outcome == 0, using that class's own statistic.
# Glucose and BloodPressure use the mean; the other columns use the median.
_fill_statistic = {
    "Glucose": "mean",
    "BloodPressure": "mean",
    "SkinThickness": "median",
    "Insulin": "median",
    "BMI": "median",
}

positive_data = data[data.Outcome == 1].copy()
negative_data = data[data.Outcome == 0].copy()

for _frame in (positive_data, negative_data):
    for _column, _stat in _fill_statistic.items():
        # Assign the filled column back instead of calling
        # frame[col].fillna(..., inplace=True): the chained in-place form acts
        # on a temporary under pandas Copy-on-Write and leaves the frame as is.
        _frame[_column] = _frame[_column].fillna(_frame[_column].agg(_stat))

temp_data = pd.concat([positive_data, negative_data])

# Pandas

In [None]:
data.describe(include="all")