In [1]:
import pandas as pd
from collections import Counter
import numpy as np
# Globally silence NumPy's divide-by-zero and invalid-operation warnings: the
# chi-square computation further down divides by expected counts that can be zero.
# np.seterr returns the previous settings, which is the dict echoed as this cell's output.
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# Load the Iris data set directly from the UCI repository (needs network access).
# The file has no header row, so header=None keeps the first record as data and
# leaves the columns numbered 0..4.
# Fetched over HTTPS rather than plain HTTP so the download is not sent in clear text.
iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)

In [3]:
# Add our own header (column names), as described for iris.data at the reference link.
# The last column, 'type', holds the class label; the other four are measurements.
iris.columns = ['sepal_l', 'sepal_w', 'petal_l', 'petal_w', 'type']

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the measurement columns.
iris.describe()

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [10]:
# Function for calculating new merge intervals.
def calculateMergeIntervals(merge_intervals,min_index):
    """Return a new interval list in which one adjacent pair has been fused.

    The interval at position ``min_index`` is combined with its right-hand
    neighbour into a single ``[low, high]`` pair spanning both. All other
    intervals are carried over unchanged (the same objects, in order). The
    input list itself is not modified.

    If ``min_index`` matches no position in the list, the result is simply a
    shallow copy. If ``min_index`` is the last position there is no
    right-hand neighbour and an IndexError is raised.
    """
    fused_intervals = []
    interval_count = len(merge_intervals)
    position = 0
    while position < interval_count:
        current = merge_intervals[position]
        if position == min_index:
            # Concatenate both [low, high] pairs and keep the overall extremes.
            bounds = current + merge_intervals[position + 1]
            fused_intervals.append([min(bounds), max(bounds)])
            position += 2  # the right-hand neighbour has been consumed
        else:
            fused_intervals.append(current)
            position += 1
    return fused_intervals

"""Chi Merge function that merge the adjacent least chi-square values till the 
    required stopping criterion (a maximum number of intervals) is met.
    
    In this algorithm, each pass merges only the one pair of adjacent intervals with the smallest chi-square value.
"""      

def _interval_class_counts(input, features, data_label, interval, empty_values_counter):
    """Return per-class row counts for the rows whose feature lies in `interval`.

    `interval` is a closed [low, high] pair (`Series.between` includes both
    ends). `empty_values_counter` maps every class label to 0; merging the
    observed counts into it pads absent classes with 0 and keeps the counts
    in the same label order for every interval.
    """
    rows = input[input[features].between(interval[0], interval[1])]
    padded_counts = {**empty_values_counter, **Counter(rows[data_label])}
    return np.array(list(padded_counts.values()))


def _adjacent_chi_square(count1, count2):
    """Return the chi-square statistic for two intervals' class-count vectors.

    Each term is (observed - expected)**2 / expected, where the expected
    count of a class in an interval is (class total) * (interval total) /
    (grand total).
    """
    count_total = count1 + count2
    rows1, rows2 = sum(count1), sum(count2)
    total = rows1 + rows2
    # A class absent from both intervals has an expected count of 0 and gives
    # 0/0. Silence that here instead of depending on a global np.seterr call
    # made elsewhere in the notebook.
    with np.errstate(divide='ignore', invalid='ignore'):
        expected1 = count_total * rows1 / total
        expected2 = count_total * rows2 / total
        chiValue = (count1 - expected1)**2/expected1 + (count2 - expected2)**2/expected2
    # NaN terms (the 0/0 cases) contribute nothing to the statistic.
    return sum(np.nan_to_num(chiValue))


def chi_merge(input, features, data_label, maximum_intervals):
    """Discretise one column with ChiMerge, print the intervals and return them.

    Every distinct value of `input[features]` starts as its own interval.
    While there are more than `maximum_intervals` intervals, the chi-square
    statistic of every adjacent pair is computed against the class labels in
    `input[data_label]`, and the pair with the smallest statistic is merged
    (ties go to the leftmost pair).

    Parameters
    ----------
    input : DataFrame holding both the feature and the label column.
    features : name of the column to discretise.
    data_label : name of the class-label column.
    maximum_intervals : stop merging once at most this many intervals remain;
        must be at least 1.

    Returns
    -------
    list of [low, high] pairs, in ascending order. Each interval is also
    printed as ``[low,high]`` on its own line.

    Raises
    ------
    ValueError
        If `maximum_intervals` is smaller than 1 (a single interval cannot be
        merged any further).
    """
    if maximum_intervals < 1:
        raise ValueError(
            "maximum_intervals must be at least 1, got {!r}".format(maximum_intervals))
    # Sorted distinct values of the feature; each one seeds an interval.
    dist_values = sorted(set(input[features]))
    # All class labels, e.g. ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'].
    data_labels = sorted(set(input[data_label]))
    # Zero count for every label, used to pad the per-interval counts.
    empty_values_counter = {label: 0 for label in data_labels}
    merge_intervals = [[value, value] for value in dist_values]
    while len(merge_intervals) > maximum_intervals:
        # Count each interval once per pass: every interior interval takes
        # part in two adjacent pairs.
        class_counts = [
            _interval_class_counts(input, features, data_label, interval, empty_values_counter)
            for interval in merge_intervals
        ]
        chiSquare = [
            _adjacent_chi_square(class_counts[k], class_counts[k + 1])
            for k in range(len(merge_intervals) - 1)
        ]
        # list.index returns the first occurrence, so ties go to the leftmost pair.
        min_index = chiSquare.index(min(chiSquare))
        # Calculating the new intervals.
        merge_intervals = calculateMergeIntervals(merge_intervals, min_index)
    for interval in merge_intervals:
        print('[', interval[0], ',', interval[1], ']', sep='')
    return merge_intervals

In [11]:
# Run ChiMerge on each of the four measurement columns, using the 'type'
# column as the class label and stopping at 6 intervals per attribute.
banner = "======================================"
for attribute in ('sepal_l', 'sepal_w', 'petal_l', 'petal_w'):
    print(banner)
    print('After CHIMERGE intervals for', attribute.upper())
    print(banner)
    chi_merge(input=iris, features=attribute, data_label='type', maximum_intervals=6)

After CHIMERGE intervals for SEPAL_L
[4.3,4.8]
[4.9,4.9]
[5.0,5.4]
[5.5,5.7]
[5.8,7.0]
[7.1,7.9]
After CHIMERGE intervals for SEPAL_W
[2.0,2.2]
[2.3,2.4]
[2.5,2.8]
[2.9,2.9]
[3.0,3.3]
[3.4,4.4]
After CHIMERGE intervals for PETAL_L
[1.0,1.9]
[3.0,4.4]
[4.5,4.7]
[4.8,4.9]
[5.0,5.1]
[5.2,6.9]
After CHIMERGE intervals for PETAL_W
[0.1,0.6]
[1.0,1.3]
[1.4,1.6]
[1.7,1.7]
[1.8,1.8]
[1.9,2.5]
