# **🗃️ Data Lab**

Useful datasets:

- [Students Performance](https://drive.google.com/file/d/1YwqZvaf0B7gW0cutfgD7berjkBSRVzPk/view?usp=sharing)
- [Heart Disease](https://drive.google.com/file/d/1lQ-3-dmVpJBq0eXcQp3nRgKNnn-TAP_n/view?usp=sharing)
- [Medical Insurance](https://drive.google.com/file/d/1n_An4atBisD6FlO8k467Iz2sZjsujVF5/view?usp=sharing)

## Generate samples 🎯

In [None]:
import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.preprocessing import StandardScaler

# @markdown \

# ============
# Parameters
# ============

n_samples = 500 # @param {type:"integer"}
type_dataset = "blobs" # @param ["noisy_circles", "noisy_moons", "blobs", "no_structure", "anisotropic", "varied_var"]
noise = 0.04 # @param {type:"slider", min:0, max:0.5, step:0.01}
angle_aniso = 100 # @param {type:"slider", min:0, max:180, step:10}
random_state = 2 # @param {type:"integer"}


# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============

# Dedicated seeded generator: the jitter and the "no_structure" samples are
# drawn from it so that the whole dataset is reproducible for a given
# random_state (the scikit-learn generators already receive random_state).
rng = np.random.RandomState(random_state)

if type_dataset == "noisy_circles":
  X, _ = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=noise, random_state=random_state)

elif type_dataset == "noisy_moons":
  X, _ = datasets.make_moons(n_samples=n_samples, noise=noise, random_state=random_state)

elif type_dataset == "blobs":
  X, _ = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
  # NOTE(review): the jitter is scaled by X.min(), which can be negative, so
  # it shifts points in one direction instead of adding zero-mean noise --
  # confirm this is intended.
  X += rng.rand(n_samples, 2)*noise*X.min()

elif type_dataset == "no_structure":
  # Uniform samples in the unit square: no cluster structure at all.
  X = rng.rand(n_samples, 2)

elif type_dataset == "anisotropic":
  X, _ = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
  # Shear the blobs along the horizontal axis by the requested angle.
  t = np.tan(np.radians(angle_aniso))
  transformation = np.array(((1, t), (0, 1))).T
  X = np.dot(X, transformation)
  X += rng.rand(n_samples, 2)*noise*X.min()

elif type_dataset == "varied_var":
  X, _ = datasets.make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state)
  X += rng.rand(n_samples, 2)*noise*X.min()

else:
  # Fail early with a clear message instead of a NameError on X further down.
  raise ValueError("Unknown type_dataset: {0!r}".format(type_dataset))

# Bring both features to zero mean / unit variance before plotting.
X = StandardScaler().fit_transform(X)

_, ax = plt.subplots(figsize=(5,4))
ax.scatter(X[:, 0], X[:, 1], edgecolors='k')

print ("\nData shape: {0} \n".format(X.shape))

## Load a dataset 📑

In [None]:
# @markdown ---

# @markdown \
# @markdown ### 🔼 Upload your file (first)
# @markdown \

# @markdown ---
# @markdown ### Enter path to **.csv** file:
# Path of the CSV file that is read with pandas further down in this cell.
file_path = "/content/heart_disease_uci.csv" # @param {type:"string"}

# Column names shown on the horizontal / vertical axis of the scatter plot.
var_h = "age" # @param {type:"string"}
var_v = "chol" # @param {type:"string"}
# Optional column used to colour the points; an empty string disables it.
labels = "" # @param {type:"string"}
# Scaling applied to X before plotting; "None" leaves the values untouched.
normalization = "None" # @param ["MinMax [0,1]", "MinMax [-1,1]", "Z-Score", "None"]
# True: X is built from every column; False: only from var_h and var_v.
Load_all_data = True # @param {type:"boolean"}
# True: rows containing missing values are dropped right after loading.
Remove_missing = True # @param {type:"boolean"}

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

# Load the CSV selected in the form and, if requested, drop incomplete rows.
data = pd.read_csv(file_path)

if Remove_missing:
  data = data.dropna()

# Raw label values, read before any scaling is applied. They are not used by
# the seaborn plot below; kept as a notebook-level variable.
y = np.array(data[labels]) if labels != "" else None

# Scaler requested in the form; None means "leave the values untouched".
if   normalization == "MinMax [0,1]":
  scaler = MinMaxScaler()
elif normalization == "MinMax [-1,1]":
  scaler = MinMaxScaler(feature_range=(-1,1))
elif normalization == "Z-Score":
  scaler = StandardScaler()
else:
  scaler = None

if Load_all_data:
  # Scalers only accept numbers, so restrict scaling to the numeric columns.
  # The label column is left raw so the hue categories stay meaningful.
  scaled_columns = [c for c in data.select_dtypes(include="number").columns if c != labels]
else:
  scaled_columns = [var_h, var_v]

# Scale in place, and only when a scaler was requested: writing unchanged
# values back through an object array could silently alter column dtypes.
if scaler is not None and scaled_columns:
  scaled = scaler.fit_transform(data[scaled_columns].to_numpy())
  for j, column in enumerate(scaled_columns):
    data[column] = scaled[:, j]

# X mirrors the (possibly scaled) frame: every column, or just the two
# variables selected for the plot.
if Load_all_data:
  X = np.array(data)
else:
  X = np.c_[np.array(data[var_h]), np.array(data[var_v])]

_, ax = plt.subplots (figsize=(5,4))
sns.scatterplot(ax=ax,data=data,x=var_h,y=var_v, hue=labels if labels != "" else None, palette='colorblind')
print ("\nData Loaded! ✅")
print (" - Shape: {0}\n".format(X.shape))

# **📋 Finding anomalies**

In [None]:
# @markdown ### Boxplot

var_name = "chol" # @param {type:"string"}

def out_iqr(df , column):
    """Report the outliers of ``df[column]`` using the 1.5 * IQR rule.

    The bounds are stored in the module-level names ``lower`` and ``upper``
    so that the plotting code of the notebook can reuse them.

    Args:
        df: DataFrame holding the data.
        column: Name of the numeric column to analyse.

    Returns:
        None. The bounds and the outlier count are printed.
    """
    global lower,upper
    # NaN-aware quantiles: a single missing value would otherwise turn both
    # bounds into NaN and silently report zero outliers.
    q25, q75 = np.nanquantile(df[column], [0.25, 0.75])
    iqr = q75 - q25
    # Points further than 1.5 * IQR from the quartiles are outliers.
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    print('The lower bound value is: {0}'.format(lower))
    print('The upper bound value is: {0}\n'.format(upper))
    # Count the records above the upper bound or below the lower bound
    # (comparisons with NaN are False, so missing values are not counted).
    values = df[column]
    n_outliers = int(((values > upper) | (values < lower)).sum())
    print('Total number of outliers are: {0} \n'.format(n_outliers))

# Analyse the column selected in the form so that the printed bounds match
# the variable plotted below.
out_iqr(data, var_name)

_, axes = plt.subplots(1,2,figsize = (8,4))
sns.boxplot(y = data[var_name], ax=axes[0])

sns.distplot(data[var_name], kde=True, ax=axes[1])
# Shade the outlier regions on the distribution plot, and only when the data
# actually extends beyond the corresponding bound.
if data[var_name].min() < lower:
  axes[1].axvspan(xmin = data[var_name].min(), xmax = lower, alpha=0.2, color='red')
if data[var_name].max() > upper:
  axes[1].axvspan(xmin = upper, xmax = data[var_name].max(), alpha=0.2, color='red')

In [None]:
# @markdown ### Standard Deviation

var_name = "chol" # @param {type:"string"}

def out_std(df, column):
    """Report the outliers of ``df[column]`` using the three-sigma rule.

    The bounds (mean -/+ 3 standard deviations) are stored in the
    module-level names ``lower`` and ``upper``; the bounds and the number of
    records outside them are printed.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to analyse.

    Returns:
        None.
    """
    global lower,upper
    series = df[column]
    center = series.mean()
    spread = series.std()
    # Half-width of the accepted interval: three standard deviations.
    margin = spread * 3
    lower = center - margin
    upper = center + margin
    print('The lower bound value is: {0}'.format(lower))
    print('The upper bound value is: {0}\n'.format(upper))
    # Records strictly above the upper bound plus records strictly below the
    # lower bound.
    n_above = int((series > upper).sum())
    n_below = int((series < lower).sum())
    print('Total number of outliers are: {0} \n'.format(n_above + n_below))
    return None

# Compute the three-sigma bounds (stored in lower / upper) for the selected column.
out_std(data, var_name)

_, axes = plt.subplots(1,2,figsize = (8,4))
sns.distplot(data[var_name], ax=axes[0])

sns.distplot(data[var_name], kde=True, ax=axes[1])
# Shade the outlier regions on the right-hand plot, and only when the data
# actually extends beyond the corresponding bound.
if data[var_name].min() < lower:
  axes[1].axvspan(xmin = data[var_name].min(), xmax = lower, alpha=0.2, color='red')
if data[var_name].max() > upper:
  axes[1].axvspan(xmin = upper, xmax = data[var_name].max(), alpha=0.2, color='red')

In [None]:
# @markdown ### Z-Score

var_name = "chol" # @param {type:"string"}
threshold = 3.0 # @param {type:"number"}

def out_zscore(data, threshold):
    """Flag the values of ``data`` whose absolute z-score exceeds ``threshold``.

    The z-score of every value is stored in the module-level list ``zscore``
    and the flagged values in the module-level list ``outliers``; the number
    of flagged values is printed.

    Args:
        data: Iterable of numeric values (it is traversed more than once).
        threshold: Absolute z-score above which a value is an outlier.

    Returns:
        None.
    """
    global outliers,zscore
    center = np.mean(data)
    spread = np.std(data)
    # One z-score per value, in the original order.
    zscore = [(value - center)/spread for value in data]
    # Keep the raw values (not their scores) that fall outside the threshold.
    outliers = [value for value, score in zip(data, zscore) if np.abs(score) > threshold]
    print("Total number of outliers are",len(outliers))
    return None

# Compute the z-scores (stored in zscore) and the flagged values (outliers).
out_zscore(data[var_name], threshold)

_, axes = plt.subplots(1,2,figsize = (8,4))
sns.distplot(data[var_name], ax=axes[0])

sns.distplot(zscore, kde=True, ax=axes[1])
# out_zscore flags |z| > threshold, so shade both tails, and only when some
# score actually lies beyond the threshold. NaN scores are ignored.
z_min, z_max = np.nanmin(zscore), np.nanmax(zscore)
if z_max > threshold:
  axes[1].axvspan(xmin = threshold, xmax = z_max, alpha=0.2, color='red')
if z_min < -threshold:
  axes[1].axvspan(xmin = z_min, xmax = -threshold, alpha=0.2, color='red')

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# @markdown ### DBSCAN (KMeans)

var1_name = "chol" # @param {type:"string"}
var2_name = "age" # @param {type:"string"}
var3_name = "" # @param {type:"string"}
eps = 20.0 # @param {type:"number"}


# Keep only the variables that were filled in. A comprehension is used so
# that no loop variable overwrites the notebook-wide var_name.
list_var = [name for name in (var1_name, var2_name, var3_name) if name != ""]

if len(list_var) < 2:
  # The scatter plot below needs two coordinates.
  raise ValueError("DBSCAN plot needs at least two variable names")

# X is used unscaled, so eps is expressed in the raw units of the columns.
X = data[list_var].values

db = DBSCAN(eps=eps, min_samples=10).fit(X)
labels = db.labels_

# Size of every cluster; scikit-learn gives noise points the label -1.
print(pd.Series(labels).value_counts())

_, ax = plt.subplots(figsize = (5,4))

# Points assigned to any cluster are drawn in blue and noise points (the
# anomalies) in red, whatever the number of clusters found. Only the first
# two selected variables are drawn.
noise_mask = labels == -1
ax.plot(X[~noise_mask, 0], X[~noise_mask, 1], 'o', color='blue', label='clustered')
ax.plot(X[noise_mask, 0], X[noise_mask, 1], 'o', color='red', label='anomaly')
ax.legend()

# Label the axes with the columns actually plotted.
ax.set_xlabel(list_var[0])
ax.set_ylabel(list_var[1])