# Imbalanced Data

Imbalanced data typically refers to classification problems in which the classes are not represented equally.

For example, you may have a 2-class (binary) classification problem with 100 instances (rows). A total of 80 instances are labeled with Class-1 and the remaining 20 instances are labeled with Class-2.

This is an imbalanced dataset and the ratio of Class-1 to Class-2 instances is 80:20 or more concisely 4:1.

You can have a class imbalance problem on two-class classification problems as well as multi-class classification problems. Most techniques can be used on either.
https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/

![](https://miro.medium.com/max/450/1*zsyN08VVrgHbAEdvv27Pyw.png)medium.com

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
INPUT_ROOT = '/kaggle/input'
for dirname, _, filenames in os.walk(INPUT_ROOT):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Upper bound on the number of CSV rows to load; use None to read the whole file.
nRowsRead = 1000
csv_path = "../input/cusersmarildownloadsfostercsv/foster.csv"

# The file is semicolon-separated and read as UTF-8.
data = pd.read_csv(csv_path, sep=";", encoding="utf8", nrows=nRowsRead)
data.dataframeName = "foster.csv"

# Report the size of what was actually loaded.
nRow = len(data.index)
nCol = len(data.columns)
print(f"There are {nRow} rows and {nCol} columns")

# Last expression of the cell: shows the first rows in the notebook output.
data.head()

# Code by Marco Carujo: https://www.kaggle.com/mcarujo/churn-prediction-ann-over-under-sampling

In [None]:
!pip install nb_black -q

In [None]:
%load_ext nb_black

In [None]:
import plotly.express as px
import plotly.figure_factory as ff
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
def plot_hist(data, column):
    """Show a histogram of a numerical column and a table of its summary statistics.

    Args:
        data: DataFrame holding ``column`` and a ``RegionID`` column.
        column: Name of the column to plot on the x axis.

    The histogram bars are coloured by ``RegionID``; the table below it is the
    transposed output of ``describe()`` for the same column.
    """
    histogram = px.histogram(data, x=column, color="RegionID")
    histogram.show()

    summary = data[column].describe().to_frame().T
    summary_table = ff.create_table(summary)
    summary_table.show()


def plot_count(data, column):
    """Show a grouped bar chart of RegionID counts for each value of a categorical column.

    Args:
        data: DataFrame holding ``column`` and a ``RegionID`` column.
        column: Name of the categorical column placed on the x axis.
    """
    # Count RegionID occurrences within each group, then flatten the
    # (column, RegionID) index into ordinary columns next to "Count".
    counts = (
        data.groupby(column)["RegionID"]
        .value_counts()
        .rename("Count")
        .reset_index()
    )

    bar_chart = px.bar(
        counts,
        x=column,
        y="Count",
        color="RegionID",
        text="Count",
        barmode="group",
    )
    bar_chart.show()

In [None]:
# Histogram of RegionID (coloured by RegionID) plus its describe() summary table.
plot_hist(data, "RegionID")

In [None]:
# Grouped bar chart: RegionID counts for each AreaID value.
plot_count(data, "AreaID")

In [None]:
# Grouped bar chart: RegionID counts for each Area value.
plot_count(data, "Area")

In [None]:
# Histogram and summary table of the "Jan" column, coloured by RegionID.
plot_hist(data, "Jan")

In [None]:
# Histogram and summary table of the "Dec" column, coloured by RegionID.
plot_hist(data, "Dec")

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

enc = OneHotEncoder(handle_unknown="ignore")
stander_scaler = StandardScaler()
label_encoder = LabelEncoder()

X = np.concatenate(
    (
        ## OneHotEncoder
        enc.fit_transform(data[["Area"]]).toarray(),
        ## Stander Scaler
        stander_scaler.fit_transform(
            data[
                [
                    "Jan",
                    "Feb",
                    "Mar",
                    "Apr",
                    "May",
                    "Jun",
                    "Jul",
                    "Aug",
                    "Sep",
                    "Oct",
                    "Nov",
                    "Dec",
                ]
            ]
        ),
        ## LabelEncoder
        label_encoder.fit_transform(data[["Area"]]).reshape(-1, 1),
        ## No formatation
        data[["RegionID", "AreaID"]].values,
    ),
    axis=1,
)

y = data.RegionID.values
X.shape

In [None]:
columns = (
    [el for el in enc.categories_[0]]
    + [
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "May",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Oct",
        "Nov",
        "Dec",
    ]
    + ["Area"]
    + ["RegionID", "AreaID"]
    + ["RegionID"]
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Put the features and the target side by side, label the columns, and
# reduce the result to its pairwise correlation matrix.
features_and_target = np.hstack([X, y.reshape(-1, 1)])
table = pd.DataFrame(features_and_target)
table.columns = columns
table = table.corr()

with sns.axes_style("white"):
    # Mask the upper triangle (diagonal included) so each pair is drawn once.
    mask = np.triu(np.ones(table.shape))

    # Colour scale spans the observed correlation range.
    highest = table.max().max()
    lowest = table.min().min()

    plt.figure(figsize=(10, 10))
    heatmap = sns.heatmap(
        table.round(2),
        cmap="Reds",
        mask=mask,
        vmax=highest,
        vmin=lowest,
        linewidths=0.5,
        annot=True,
        annot_kws={"size": 12},
    )
    heatmap.set_title("Correlation Matrix App behavior dataset")

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D
import keras


def get_model(input_dim=12):
    """Build the (uncompiled) fully connected network used by ``train_ann``.

    Args:
        input_dim: Number of feature columns the first layer accepts.
            Defaults to 12, the value that used to be hard-coded.

    Returns:
        A ``Sequential`` model: a stack of ReLU ``Dense`` layers with
        ``Dropout(0.2)`` in between, ending in a single sigmoid unit.
    """
    return Sequential(
        [
            Dense(units=200, input_dim=input_dim, activation="relu"),
            Dense(150, activation="relu"),
            Dropout(0.2),
            Dense(100, activation="relu"),
            Dense(100, activation="relu"),
            Dropout(0.2),
            Dense(100, activation="relu"),
            Dense(100, activation="relu"),
            Dense(100, activation="relu"),
            Dropout(0.2),
            Dense(100, activation="relu"),
            Dense(1, activation="sigmoid"),
        ]
    )


def train_ann(X, y):
    """Train the network on a 90/10 split of ``X``/``y`` and display its metrics.

    Args:
        X: 2-D feature matrix (rows are samples, columns are features).
        y: Target values, one per row of ``X``.

    Returns:
        None. Loss, MSE and accuracy on the held-out 10% are shown as a
        plotly table.

    Notes:
        The held-out split is used both as ``validation_data`` during
        fitting and for the final ``evaluate`` call.
        NOTE(review): the sigmoid output with binary cross-entropy assumes
        ``y`` holds 0/1 labels - confirm this for the target being passed.
        NOTE(review): the split happens inside this function, so when the
        caller passes resampled data the held-out rows are resampled too.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42
    )

    # Size the input layer from the data instead of assuming 12 features;
    # a fixed width fails as soon as X has a different number of columns.
    model = get_model(input_dim=X_train.shape[1])

    model.compile(
        optimizer="adam", loss="binary_crossentropy", metrics=["mse", "accuracy"],
    )

    # Train silently, then evaluate on the held-out split.
    model.fit(
        X_train,
        y_train,
        batch_size=10,
        epochs=50,
        verbose=0,
        validation_data=(X_test, y_test),
    )
    loss, mse, acc = model.evaluate(X_test, y_test, verbose=0)

    # Present the three metrics as a one-row table.
    fig = ff.create_table(
        pd.DataFrame([(loss, mse, acc)], columns=["Loss", "MSE", "Accuracy"]),
    )
    fig.show()

In [None]:
from imblearn.over_sampling import SMOTE

# Keep the resampled data in its own variables: overwriting X and y here
# would make every later sampler run on SMOTE's output instead of the
# original data.
X_smote, y_smote = SMOTE(random_state=42).fit_resample(X, y)

train_ann(X_smote, y_smote)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Keep the resampled data in its own variables so X and y are not
# overwritten and other samplers can still start from the same input.
X_ros, y_ros = RandomOverSampler(random_state=42).fit_resample(X, y)

train_ann(X_ros, y_ros)

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

# Keep the resampled data in its own variables so X and y are not
# overwritten and other samplers can still start from the same input.
X_bsmote, y_bsmote = BorderlineSMOTE(random_state=42).fit_resample(X, y)

train_ann(X_bsmote, y_bsmote)

In [None]:
from imblearn.over_sampling import ADASYN

# Keep the resampled data in its own variables so X and y are not
# overwritten and other samplers can still start from the same input.
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_resample(X, y)

train_ann(X_adasyn, y_adasyn)

In [None]:
from imblearn.over_sampling import KMeansSMOTE

# Keep the resampled data in its own variables so X and y are not
# overwritten and other samplers can still start from the same input.
X_kmeans, y_kmeans = KMeansSMOTE(random_state=42).fit_resample(X, y)

train_ann(X_kmeans, y_kmeans)

In [None]:
from imblearn.over_sampling import SVMSMOTE

# Keep the resampled data in its own variables so X and y are not
# overwritten and other samplers can still start from the same input.
X_svm, y_svm = SVMSMOTE(random_state=42).fit_resample(X, y)

train_ann(X_svm, y_svm)

In [None]:
from imblearn.under_sampling import ClusterCentroids

# Keep the under-sampled data in its own variables so X and y are not
# overwritten and other samplers can still start from the same input.
X_cc, y_cc = ClusterCentroids(random_state=42).fit_resample(X, y)

train_ann(X_cc, y_cc)

In [None]:
from imblearn.under_sampling import AllKNN

# Keep the under-sampled data in its own variables so X and y are not
# overwritten and other samplers can still start from the same input.
X_allknn, y_allknn = AllKNN().fit_resample(X, y)

train_ann(X_allknn, y_allknn)

In [None]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Keep the under-sampled data in its own variables so X and y are not
# overwritten and other samplers can still start from the same input.
X_ncr, y_ncr = NeighbourhoodCleaningRule().fit_resample(X, y)

train_ann(X_ncr, y_ncr)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Keep the under-sampled data in its own variables so X and y are not
# overwritten. The seed is fixed, as for the other randomised samplers in
# this notebook, so that reruns drop the same rows.
X_rus, y_rus = RandomUnderSampler(random_state=42).fit_resample(X, y)

train_ann(X_rus, y_rus)

Added to my list: learn how to deal with imbalanced data.