---

# Load Data

In [None]:
from pandas import read_csv
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = "../input/mushrooms.csv"
data = read_csv(DATA_PATH)

---

# Instantiate features

In [None]:
# Name of the label column.
target = "class"
# Feature columns: every column except the label and "veil-type"
# (presumably dropped because it carries no information -- verify against `lengths`).
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
categorical_features = data.drop([target, "veil-type"], axis=1).columns

---

# Sort Categories by Number of Values

In [None]:
# Group column names by how many distinct values each column holds:
# {number of distinct values: [column name, ...]}.
lengths = {}

for category in data.columns:
    length = len(data[category].unique())
    # setdefault creates the bucket on first sight of this count.
    lengths.setdefault(length, []).append(category)

In [None]:
# Display the {distinct-value count: [column names]} mapping built in the previous cell.
lengths

---

# Transform Categorical Values as Binary

In [None]:
from sklearn.preprocessing import LabelEncoder, normalize
from pandas import Series, DataFrame, to_numeric

In [None]:
# Label-encode every column, then collapse the integer codes to two signed levels:
# code 1 -> +magnitude, any other code -> -magnitude
# (magnitude is 10 for the target column and 25 for all other columns).
transformed = DataFrame()
labelencoder_by_category = {}

for category in data.columns:
    label_encoder = LabelEncoder().fit(data[category])
    # Keep the fitted encoder for each column.
    labelencoder_by_category[category] = label_encoder
    codes = Series(label_encoder.transform(data[category]))
    magnitude = 10 if category == target else 25
    transformed[category] = codes.apply(
        lambda code: magnitude if code == 1 else -magnitude
    )

---

# Biclustering vs Coclustering

In [None]:
# Both estimators are exported from the public `sklearn.cluster` namespace.
# The `sklearn.cluster.bicluster` submodule path was deprecated in scikit-learn 0.22
# and later removed, so importing from it fails on current releases.
from sklearn.cluster import SpectralBiclustering, SpectralCoclustering

In [None]:
from numpy import argsort
from matplotlib.pyplot import show, imshow, figure, subplot, suptitle, tight_layout, xticks, xlabel

def pipeline(n, input_df, random_state=None):
    """Fit spectral co-clustering and bi-clustering and plot both results side by side.

    Each estimator is fitted on ``input_df.values``. The matrix is then
    rearranged by sorting its rows on the fitted ``row_labels_`` and its
    columns on the fitted ``column_labels_``, so entries sharing a label are
    adjacent, and the rearranged matrix is drawn with ``imshow``.

    Parameters
    ----------
    n : int
        Passed as ``n_clusters`` to both ``SpectralCoclustering`` and
        ``SpectralBiclustering``.
    input_df : DataFrame
        Data to cluster; only ``input_df.values`` is used.
    random_state : optional
        Forwarded to both estimators. The default ``None`` keeps the previous
        unseeded behaviour; pass an int to get a reproducible figure.

    Returns
    -------
    None
        The figure is shown via ``matplotlib.pyplot.show``.
    """
    values = input_df.values

    def reorder(model):
        # Fit, then sort rows first and columns second by their cluster labels.
        model.fit(values)
        by_row = values[argsort(model.row_labels_)]
        return by_row[:, argsort(model.column_labels_)]

    # Fit order (co-clustering, then bi-clustering) is kept as in the original
    # so unseeded runs consume the global random state in the same sequence.
    cocluster_fit_data = reorder(
        SpectralCoclustering(n_clusters=n, random_state=random_state)
    )
    bicluster_fit_data = reorder(
        SpectralBiclustering(n_clusters=n, random_state=random_state)
    )

    figure(figsize=(16, 16))

    # Left panel: bi-clustering result.
    left_plot = subplot(121)
    imshow(bicluster_fit_data, aspect='auto', cmap="bone")
    xlabel("Biclustering")

    # Right panel: co-clustering result, sharing the row axis with the left panel.
    subplot(122, sharey=left_plot)
    imshow(cocluster_fit_data, aspect='auto', cmap="bone")
    xlabel("Coclustering")

    tight_layout()

    show()

---

# Binary Categories Only

In [None]:
# Keep only the columns that have exactly two distinct values.
binaries = transformed.loc[:, lengths[2]]

In [None]:
# Use one cluster per two-valued column.
pipeline(n=len(lengths[2]), input_df=binaries)

* Clear stacks are emerging, which is great for group identification.
* Class column:
    * You can see distinct interactions between categorical values within each stack.
    * More importantly, you can see which combinations of binary values make a mushroom more or less likely to be poisonous.

---

# One-Hot Encoded Categories

In [None]:
# Disabled cell: the code is wrapped in a string literal, so none of it executes.
# As written, it would collect into `non_binaries` every column listed in
# `lengths` under a key greater than 2.
'''
non_binaries = []
for i in [j for j in lengths.keys() if j > 2]:
    non_binaries += lengths[i]
'''

In [None]:
# Disabled cell: the code is wrapped in a string literal, so none of it executes.
# As written, it would one-hot encode the `non_binaries` columns, append the
# label-encoded two-valued columns, and rescale 0/1 to -30/30 (-10/10 for the target).
# NOTE(review): `concat(..., 1)` passes `axis` positionally; check this against the
# installed pandas version before re-enabling the cell.
'''
from pandas import get_dummies,concat
binarized = concat([get_dummies(data[non_binaries]), DataFrame([LabelEncoder().fit_transform(data[category]) for category in lengths[2]], index = lengths[2]).T],1)

for category in binarized.columns:
    
    if category == target: continue
        
    binarized[category] = binarized[category].apply(lambda x: -30 if x == 0 else 30)

binarized[target] = binarized[target].apply(lambda x: -10 if x == 0 else 10)
'''

In [None]:
from numpy import argsort
from matplotlib.pyplot import show, imshow, figure, subplot, suptitle, tight_layout, xticks, xlabel

def pipeline(n, input_df, random_state=None):
    """Fit spectral co-clustering and bi-clustering and plot both results stacked.

    Note: this redefines ``pipeline`` and shadows the earlier side-by-side
    version in this notebook; here the two panels are stacked vertically and
    drawn with the "cubehelix" colormap.

    Each estimator is fitted on ``input_df.values``. The matrix is then
    rearranged by sorting its rows on the fitted ``row_labels_`` and its
    columns on the fitted ``column_labels_``, so entries sharing a label are
    adjacent, and the rearranged matrix is drawn with ``imshow``.

    Parameters
    ----------
    n : int
        Passed as ``n_clusters`` to both ``SpectralCoclustering`` and
        ``SpectralBiclustering``.
    input_df : DataFrame
        Data to cluster; only ``input_df.values`` is used.
    random_state : optional
        Forwarded to both estimators. The default ``None`` keeps the previous
        unseeded behaviour; pass an int to get a reproducible figure.

    Returns
    -------
    None
        The figure is shown via ``matplotlib.pyplot.show``.
    """
    values = input_df.values

    def reorder(model):
        # Fit, then sort rows first and columns second by their cluster labels.
        model.fit(values)
        by_row = values[argsort(model.row_labels_)]
        return by_row[:, argsort(model.column_labels_)]

    # Fit order (co-clustering, then bi-clustering) is kept as in the original
    # so unseeded runs consume the global random state in the same sequence.
    cocluster_fit_data = reorder(
        SpectralCoclustering(n_clusters=n, random_state=random_state)
    )
    bicluster_fit_data = reorder(
        SpectralBiclustering(n_clusters=n, random_state=random_state)
    )

    figure(figsize=(16, 25))

    # Top panel: bi-clustering result.
    subplot(211)
    imshow(bicluster_fit_data, aspect='auto', cmap="cubehelix")
    xlabel("Biclustering")

    # Bottom panel: co-clustering result.
    subplot(212)
    imshow(cocluster_fit_data, aspect='auto', cmap="cubehelix")
    xlabel("Coclustering")

    tight_layout()

    show()

In [None]:
#pipeline(len(binarized.columns),binarized)

---

# Label Encoded, Non-binary Transformation

In [None]:
# Rebuild `transformed` with the full label encoding instead of a two-level one:
# feature columns keep their integer code shifted up by one, while the target
# column becomes 12 where its code is 1 and 2 otherwise.
transformed = DataFrame()
labelencoder_by_category = {}

for category in data.columns:
    label_encoder = LabelEncoder().fit(data[category])
    # Keep the fitted encoder for each column.
    labelencoder_by_category[category] = label_encoder
    codes = Series(label_encoder.transform(data[category]))
    if category != target:
        transformed[category] = codes.apply(lambda code: code + 1)
    else:
        transformed[category] = codes.apply(lambda code: 12 if code == 1 else 2)

In [None]:
# Use one cluster per column of the label-encoded frame.
pipeline(n=transformed.shape[1], input_df=transformed)