# Data description

Notebook used for removing redundant columns. Please note that the data-dependent cells will not run, because the data were not provided.

In [3]:
from utils import *

In [4]:
# Switch guarding every data-dependent cell in this notebook; it is left
# False because the data files were not provided.
DATA_AVAILABLE = False

In [11]:
if DATA_AVAILABLE:
    # Load the two raw CSV files into DataFrames.
    # NOTE(review): the paths use Windows-style backslash separators, so they
    # presumably only resolve to the 'data' directory on Windows - confirm.
    charged_off = pd.read_csv('data\\Charged_Off.csv')
    marked_frauds = pd.read_csv('data\\Frauds.csv')

In [189]:
import networkx as nx
from itertools import combinations
def reduce_by_cluster(clusters, df):
    """
    Collapse each cluster of redundant features into a single column.

    For every cluster one feature is kept and the remaining features are
    dropped. Before dropping, missing values of the kept feature are filled
    (row by row) from the features that are about to be removed.

    Parameters
    ----------
    clusters : iterable of iterables of column names
        Groups of redundant features. For ordered clusters (e.g. lists) the
        first element is kept. For unordered clusters (``set``/``frozenset``,
        such as those returned by ``nx.connected_components``) the member
        that appears first in ``df.columns`` is kept, so the result does not
        depend on set iteration order.
    df : pandas.DataFrame
        Dataset to reduce. It is NOT modified; a reduced copy is returned.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one column left per cluster.
    """
    # Work on a copy so the caller's frame is never altered as a side effect.
    result = df.copy()
    column_position = {name: pos for pos, name in enumerate(result.columns)}
    unknown_position = len(column_position)

    to_drop = set()
    for cluster in clusters:
        if isinstance(cluster, (set, frozenset)):
            # Sets have no stable order: fall back to the column order.
            members = sorted(
                cluster,
                key=lambda name: column_position.get(name, unknown_position),
            )
        else:
            members = list(cluster)
        if not members:
            continue

        keeper, *redundant = members
        for feature in redundant:
            # Only fill rows where the kept feature is missing.
            result[keeper] = result[keeper].fillna(result[feature])
            to_drop.add(feature)

    # Single drop instead of re-creating the frame once per column.
    return result.drop(columns=list(to_drop))


def get_idential(df):
    """
    Return all pairs of features whose values are identical.

    Every unordered pair of columns is compared with ``Series.equals``;
    matching pairs are returned as ``(first, second)`` tuples of column
    names, in the order the pairs are generated from the column order.
    """
    matching_pairs = []
    for left, right in combinations(df, 2):
        if df[left].equals(df[right]):
            matching_pairs.append((left, right))
    return matching_pairs


def get_almost_identical(df, threshold=0.9):
    """
    Get pairs of numeric features with almost identical values.

    Two features are reported when, among the rows where both are present,
    the share of rows whose difference rounds to zero is strictly greater
    than ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose columns are compared pairwise.
    threshold : float, default 0.9
        Minimum (exclusive) fraction of matching rows for a pair to count
        as almost identical.

    Returns
    -------
    list of tuple
        ``(first, second)`` column-name pairs, in column order.

    Notes
    -----
    * Only numeric columns are compared; subtraction is not defined for
      other dtypes (e.g. strings) and would raise.
    * The check is on *zero* differences. Testing the most frequent
      difference instead would also match columns that differ by a constant
      offset, which are not interchangeable.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    pairs = []
    for first, second in combinations(numeric_columns, 2):
        # Rows where either value is missing carry no evidence; ignore them.
        difference = df[first].sub(df[second]).dropna()
        if difference.empty:
            continue
        matching_share = (difference.round(0) == 0).mean()
        if matching_share > threshold:
            pairs.append((first, second))
    return pairs


def reduce_dataset(df):
    """
    Reduce the dataset in two passes.

    First, identical features are clustered and merged; then the same is
    done for almost identical features of the already reduced dataset.
    Returns the dataset produced by the second pass.
    """
    # Pass 1: exact duplicates.
    identical_clusters = make_clusters(get_idential(df))
    deduplicated = reduce_by_cluster(identical_clusters, df)

    # Pass 2: near duplicates among the remaining features.
    near_clusters = make_clusters(get_almost_identical(deduplicated))
    return reduce_by_cluster(near_clusters, deduplicated)


def make_clusters(identicals):
    """
    Group pairs of identical features into clusters.

    The pairs are used as the edges of an undirected graph; each connected
    component of that graph is one cluster. Returns a list with one set of
    feature names per component.
    """
    pair_graph = nx.Graph(identicals)
    components = nx.connected_components(pair_graph)
    return list(components)

In [17]:
if DATA_AVAILABLE:
    # Merge identical and almost identical features of the charged-off data.
    charged_off = reduce_dataset(charged_off)

In [192]:
if DATA_AVAILABLE:
    # Remove the 'Stmt_Header_Id' column, then write the reduced data to CSV.
    # NOTE(review): to_csv is called with its default index setting, so the
    # row index is written as an extra first column - confirm that readers
    # of this file expect it.
    charged_off = charged_off.drop('Stmt_Header_Id', axis=1)
    charged_off.to_csv('data\\Charged_Off_Reduced.csv')

In [16]:
if DATA_AVAILABLE:
    # Merge identical and almost identical features of the frauds data.
    marked_frauds = reduce_dataset(marked_frauds)

In [195]:
if DATA_AVAILABLE:
    # Remove the 'Stmt_Header_Id' column, then write the reduced data to CSV.
    # NOTE(review): to_csv is called with its default index setting, so the
    # row index is written as an extra first column - confirm that readers
    # of this file expect it.
    marked_frauds = marked_frauds.drop('Stmt_Header_Id', axis=1)
    marked_frauds.to_csv('data\\Frauds_Reduced.csv')