In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA, KernelPCA
from IPython.display import display
import matplotlib as mp
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib notebook

## Útiles

In [2]:
from itertools import cycle
from matplotlib import colors
from matplotlib.pyplot import cm
from matplotlib.font_manager import FontProperties

def plot_cls(column, data, legends, alpha=.5, centers=()):
    """Draw a 3D scatter of the samples, coloured by the values of ``column``.

    The axes are fixed: x is ``"Temperatura"``, y is ``"sustrato"`` and z is
    ``"profundidad(m)"``. One scatter series (and one legend entry) is drawn
    for every group produced by ``data.groupby(column)``.

    Parameters
    ----------
    column : str
        Name of the column of ``data`` used to group and colour the points.
    data : pandas.DataFrame
        Must contain ``column`` and the three axis columns named above.
    legends : mapping
        Maps each group key of ``column`` to the text shown in the legend.
    alpha : float, optional
        Marker opacity forwarded to ``scatter``.
    centers : sequence, optional
        Points indexable as ``c[0], c[1], c[2]``; each one is annotated in
        the plot with the text ``"c <index>"``. Empty by default.
    """
    name_plot = "Coloración segun {}".format(column)

    # One colour per distinct value of the grouping column.
    palette = cm.rainbow(np.linspace(0, 1, len(np.unique(data[column]))))

    fig = plt.figure(name_plot, figsize=(11, 6))
    # Create the 3D axes through the figure so that it is actually attached
    # to it; a bare ``Axes3D(fig)`` is no longer added automatically by
    # recent Matplotlib releases. The rect fills the whole figure.
    ax = fig.add_axes((0, 0, 1, 1), projection="3d")
    ax.set_title(name_plot)

    for color, (name, group) in zip(palette, data.groupby(column)):
        # ``color=`` (not ``c=``): a single RGBA row passed as ``c`` is
        # ambiguous and gets colormapped when the group has 4 points.
        ax.scatter(group["Temperatura"], group["sustrato"], group["profundidad(m)"],
                   marker='o', color=color, alpha=alpha,
                   label="{0} ({1} ejemplares)".format(legends[name], group.shape[0]))

    ax.set_xlabel("Temperatura (ºC)")
    ax.set_ylabel("Sustrato (ver tabla)")
    ax.set_zlabel("Profundidad (m)")

    label_box = dict(facecolor='white', alpha=.5, edgecolor='black', boxstyle='round')
    for i, center in enumerate(centers):
        ax.text(center[0], center[1], center[2], "c %d" % i, bbox=label_box)

    # Shrink the axes horizontally to leave room for the legend on the right.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * .7, box.height])

    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc="right", prop=fontP, bbox_to_anchor=(1.4, .5))
    plt.show()

### Loading Data

In [3]:
# Read the ';'-separated survey table.
df = pd.read_csv("test_1.csv", sep=";")

# Identification columns kept aside under their own name; the working copy
# `df_test` is what the following cells transform.
df_respaldo = df.loc[:, ["Estacion", "Transecto", "Fecha", "genero", "especie"]]
df_test = df.copy()

# Preview of the first two rows.
df_test.head(2)

Unnamed: 0,Estacion,Estrata,Transecto,Fecha,Salinidad(ppm),Temperatura,pH,profundidad(m),sustrato,genero,especie,total
0,E1,1,I.1.1,12/11/16,33,28.33,8.5,0.5,Roca/Arena,,,0
1,E1,1,I.1.2,12/11/16,33,28.33,8.5,0.5,Roca/Arena,,,0


In [4]:
# Merge "genero" and "especie" into a single "gen/esp" label. The result is
# NaN wherever either part is missing. The stray space in
# "Diadema /antillarum" is normalised so both spellings share one label.
gen_esp = (
    df["genero"]
    .str.cat(df["especie"], sep="/")
    .replace("Diadema /antillarum", "Diadema/antillarum")
)

# Rebuild the working frame from the raw table instead of mutating it in
# place: the cell can be re-run safely, and the replacement above is really
# stored (an in-place `replace` on `df_test["gen/esp"]` acts on a temporary
# under pandas copy-on-write and is silently lost).
df_test = (
    df.drop(columns=["Fecha", "Estacion", "Transecto", "genero", "especie"])
    .assign(**{"gen/esp": gen_esp})
)

In [5]:
# Rows without any specimen get an explicit label. Assign the result back:
# an in-place `fillna` on `df_test["gen/esp"]` operates on a temporary under
# pandas copy-on-write and would leave the NaNs in place.
df_test["gen/esp"] = df_test["gen/esp"].fillna("Sin Ejemplares")

df_t1 = df_test.copy()

# Label -> integer code (and the reverse, used for plot legends).
maper_g = {k: v for v, k in enumerate(np.unique(df_test["gen/esp"]))}
legends_g = {v: k for k, v in maper_g.items()}

# Use the same NaN-filled series to build the mapping and to apply it, so a
# missing substrate is encoded as "-" instead of silently becoming NaN.
sustrato_filled = df_test["sustrato"].fillna("-")
maper_s = {k: v for v, k in enumerate(np.unique(sustrato_filled))}
legends_s = {v: k for k, v in maper_s.items()}

df_t1["gen/esp"] = df_test["gen/esp"].map(maper_g)
df_t1["sustrato"] = sustrato_filled.map(maper_s)

# Legend text per temperature. Missing values are dropped rather than filled
# with "-": mixing a string into a float column makes `np.unique` fail, and
# a groupby on the column never yields a NaN key anyway.
maper_t = {k: str(k) + "ºC" for k in np.unique(df_t1["Temperatura"].dropna())}

In [6]:
# Table of substrate names, in the order stored in `legends_s`
# (displayed transposed, as a single row).
t_sust = pd.DataFrame({"sustratos": list(legends_s.values())})
print("Tabla equivalencia Sustratos")
display(t_sust.T)

# Same 3D scatter three times, coloured by species, temperature and substrate.
for columna, leyenda in (("gen/esp", legends_g),
                         ("Temperatura", maper_t),
                         ("sustrato", legends_s)):
    plot_cls(columna, df_t1, leyenda)


Tabla equivalencia Sustratos


Unnamed: 0,0,1,2,3,4,5,6
sustratos,Arena,Arrecife,Gravilla,Pradera,Pradera/Arena,Roca,Roca/Arena


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Preparando Datos para ExtraTrees

In [7]:
# One-hot encode the substrate (columns come out prefixed with "sustrato_"),
# append the indicators to the working frame and remove the columns that
# are not used as model inputs.
sustrato_dummies = pd.get_dummies(df_test[["sustrato"]])
final = pd.concat([df_test, sustrato_dummies], axis=1).drop(
    columns=["Estrata", "sustrato", "gen/esp"]
)

# Allow wide frames to be displayed without column truncation.
pd.set_option("display.max_columns", 100)

In [8]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

# Target: number of specimens collected; features: everything else.
Y = final["total"]
X = final.drop(columns=["total"])

# Single-step pipeline that standardises every feature.
# (A VarianceThreshold(.8*(1-.8)) step was tried and is left disabled.)
pipe = Pipeline(steps=[("sc", preprocessing.StandardScaler())])

# Wrap the scaled array back into a frame carrying the original column names.
x_data = pd.DataFrame(pipe.fit_transform(X), columns=X.columns)

final.shape, x_data.values.shape

((98, 12), (98, 11))

In [9]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

# pca = PCA(n_components=3, random_state=10)
# t_data = pca.fit_transform(x_data)
# print(np.sum(pca.explained_variance_ratio_))

# x_train, x_test, y_train, y_test = train_test_split(x_data, Y, test_size=.25)

# Extra-trees model on a binary target: `Y.astype(bool)` is True for every
# row whose "total" is non-zero.
forest = ExtraTreesClassifier(random_state=10, n_estimators=250)
forest.fit(X=x_data, y=Y.astype(bool))

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=1, oob_score=False, random_state=10,
           verbose=0, warm_start=False)

In [10]:
# Bar chart of the feature importances of `forest`.
titulo = "feature importances (según total de ejemplares colectados)"
fig = plt.figure(titulo)

# Work on ONE explicit Axes. Calling `plt.axes()` repeatedly creates a new
# Axes each time on current Matplotlib, so the position and title would be
# applied to empty axes stacked on top of the bar chart.
ax = fig.gca()

posiciones = list(range(len(forest.feature_importances_)))
ax.bar(posiciones, forest.feature_importances_, align='center')

# Lift the bottom of the axes to make room for the rotated tick labels.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * .3, box.width, box.height * .7])

ax.set_xticks(posiciones)
ax.set_xticklabels(x_data.columns, rotation=90)
ax.set_title(titulo)
plt.show()

<IPython.core.display.Javascript object>

### Preparando para estimar importancia por especie

In [11]:
# Feature frame for the per-species models: the numeric measurements plus
# one indicator column per substrate (missing substrates are filled with 0
# before encoding).
sustrato_onehot = pd.get_dummies(df_test["sustrato"].fillna(0))
prb = pd.concat(
    [df_test.drop(columns=["Estrata", "total", "sustrato", "gen/esp"]), sustrato_onehot],
    axis=1,
)
prb.head(2)

Unnamed: 0,Salinidad(ppm),Temperatura,pH,profundidad(m),Arena,Arrecife,Gravilla,Pradera,Pradera/Arena,Roca,Roca/Arena
0,33,28.33,8.5,0.5,0,0,0,0,0,0,1
1,33,28.33,8.5,0.5,0,0,0,0,0,0,1


In [12]:
# One indicator column per species label.
clases = pd.get_dummies(df_test["gen/esp"])

# Standardised features, keeping the column names of `prb`.
std = preprocessing.StandardScaler()
data = pd.DataFrame(std.fit_transform(prb), columns=prb.columns)

# Species present in fewer rows than this are skipped.
MIN_EJEMPLARES = 4

for c in clases.columns:
    presente = clases[c].astype(bool)
    n_ejemplares = int(presente.sum())
    if n_ejemplares < MIN_EJEMPLARES:
        continue

    forest_c = ExtraTreesClassifier(n_estimators=250, random_state=10)
    forest_c.fit(data, presente)

    index = list(range(len(forest_c.feature_importances_)))
    fig = plt.figure("{} feature importances".format(c))
    # Single explicit Axes: repeated `plt.axes()` calls create new, empty
    # axes on current Matplotlib instead of returning the one with the bars.
    ax = fig.gca()
    ax.bar(index, forest_c.feature_importances_, align='center')

    # Lift the bottom of the axes to make room for the rotated tick labels.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * .3, box.width, box.height * .7])
    ax.set_title("Feature Importances {} (ejemplares: {})".format(c, n_ejemplares))

    # Label the bars with the columns the model was actually fitted on.
    ax.set_xticks(index)
    ax.set_xticklabels(data.columns, rotation=90)
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Stacked Bars

In [15]:
# Normalise the species label. The result is assigned back because an
# in-place `replace` on `df_test["gen/esp"]` is lost under pandas
# copy-on-write.
df_test["gen/esp"] = df_test["gen/esp"].replace("Diadema /antillarum", "Diadema/antillarum")
data_2 = df_test.copy()

fig = plt.figure("Abundancia por Sustrato", figsize=(12, 6))
# One explicit Axes for the whole chart. `plt.axes()` would create a new,
# empty Axes on current Matplotlib, and the title and y ticks would end up
# on it instead of on the bars.
ax = fig.gca()
width = .8

# Fixed colour per species so that a species looks the same in every bar.
especies = np.unique(data_2["gen/esp"])
colores = dict(zip(especies, cm.rainbow(np.linspace(0, 1, len(especies)))))

legends = {}   # species -> bar container used as its legend handle
sust = []      # substrate names, in bar order

for ind, (name, grupo) in enumerate(data_2.groupby("sustrato")):
    sust.append(name)

    # Total individuals per species within this substrate, largest first
    # (ties broken by species name, descending). Plain Python sorting keeps
    # the full species names; a fixed-width "U30" array would truncate long
    # ones and break the colour lookup below.
    totales = grupo.groupby("gen/esp")["total"].sum()
    ordenados = sorted(totales.items(), key=lambda par: (par[1], par[0]), reverse=True)

    base = 0
    for esp, cant in ordenados:
        bar = ax.barh(ind, cant, width, color=colores[esp], left=base, align="center")
        legends[esp] = bar
        base = base + cant

ax.set_xlabel('Nro de Individuos')
ax.set_title("Abundancia por Sustrato")
ax.set_yticks(list(range(len(sust))))
ax.set_yticklabels(sust)

fontP = FontProperties()
ax.legend(list(legends.values()), list(legends.keys()),
          loc="right", prop=fontP, bbox_to_anchor=(1, .7))
plt.show()

<IPython.core.display.Javascript object>

## Intentando Clustering

In [None]:
def desagrupar(df):
    """Expand aggregated rows into one row per individual.

    Every row of ``df`` is repeated ``int(row["total"])`` times; rows whose
    total is zero (or negative) produce no output rows. The ``"total"``
    column is removed from the result and the index is renumbered from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"total"`` column convertible to integers.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns of ``df``; ``df`` itself is
        not modified.
    """
    # Number of copies per row; truncation matches ``int(...)`` and negative
    # counts are treated as zero, as ``range(n)`` would.
    repeticiones = df["total"].astype(int).clip(lower=0).to_numpy()

    # Repeat by position (robust to duplicated index labels) in a single
    # vectorised step instead of appending row by row, which is quadratic
    # and relies on ``DataFrame.append`` (removed in pandas 2.0).
    posiciones = np.repeat(np.arange(len(df)), repeticiones)
    return df.iloc[posiciones].reset_index(drop=True).drop(columns=["total"])

In [29]:
from sklearn import preprocessing

# One row per collected individual.
test_cls = desagrupar(df)

# Merge "genero" and "especie" into one label and normalise the stray
# space. The value is assigned back explicitly: an in-place `replace` on
# `test_cls["gen/esp"]` is lost under pandas copy-on-write.
test_cls["gen/esp"] = (
    test_cls["genero"]
    .str.cat(test_cls["especie"], sep="/")
    .replace("Diadema /antillarum", "Diadema/antillarum")
)
test_cls = test_cls.drop(
    columns=["Estacion", "Estrata", "Fecha", "Transecto", "genero", "especie"]
)

# Integer codes for species and substrate.
maper_esp = {k: n for n, k in enumerate(np.unique(test_cls["gen/esp"]))}
maper_sus = {k: n for n, k in enumerate(np.unique(test_cls["sustrato"]))}
test_cls["sustrato"] = test_cls["sustrato"].map(maper_sus)

# Species codes are kept apart; they are not part of the clustering input.
clases = test_cls["gen/esp"].map(maper_esp)
test_cls = test_cls.drop(columns=["gen/esp"])

std = preprocessing.StandardScaler()
dat = std.fit_transform(test_cls)

# NOTE(review): n_clusters=9 is hard-coded; presumably chosen to match the
# number of species labels - confirm.
mod = KMeans(random_state=20, n_clusters=9)
cls = mod.fit_predict(dat)

In [33]:
# Plot the clustering result: scaled features `dat`, coloured by the cluster
# labels `cls`.
# NOTE(review): `dibujar` is defined in a cell that appears AFTER this one,
# so on a fresh top-to-bottom run this call raises NameError unless that
# cell is executed first.
dibujar(dat, cls)

10


<IPython.core.display.Javascript object>

In [31]:
def dibujar(dat, lab):
    """Show a 3D scatter of the first three columns of ``dat``, coloured by label.

    Parameters
    ----------
    dat : array-like indexable as ``dat[:, k]``
        Data to plot; columns 0, 1 and 2 are used as x, y and z.
    lab : array-like
        One label per row of ``dat``. Points sharing a label share a colour.
    """
    name_plot = "Clusters"

    # Map every label to its rank among the distinct labels, so that labels
    # that are not exactly 0..k-1 (gaps, negative "noise" labels, ...) still
    # index the palette correctly. For labels 0..k-1 the rank is the label.
    etiquetas, indices = np.unique(np.asarray(lab).ravel(), return_inverse=True)
    colores = cm.rainbow(np.linspace(0, 1, len(etiquetas) + 1))

    fig = plt.figure(name_plot, figsize=(11, 6))
    # Create the 3D axes through the figure so that it is actually attached
    # to it; a bare ``Axes3D(fig)`` is no longer added automatically by
    # recent Matplotlib releases. The rect fills the whole figure.
    ax = fig.add_axes((0, 0, 1, 1), projection="3d")
    ax.set_title(name_plot)

    # One RGBA row per point.
    ax.scatter(dat[:, 0], dat[:, 1], dat[:, 2], marker='o', c=colores[indices], alpha=.5)
    plt.show()

In [32]:
from sklearn.metrics import confusion_matrix

# NOTE(review): `x_data` comes from the feature-scaling cell earlier in the
# notebook; the NameError recorded below this cell means that cell had not
# been run in the kernel at the time.
# `x_data` was already standardised there, so this second scaling pass is
# presumably redundant - confirm before removing.
std = preprocessing.StandardScaler()
std_data = std.fit_transform(x_data)

pca = PCA(n_components=8, random_state=10)
pca_data = pca.fit_transform(std_data)
# Fraction of the total variance retained by the 8 components.
print(np.sum(pca.explained_variance_ratio_))

# Fixed seed so that the split, and every metric computed from it, is
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    pca_data, Y.astype(bool), test_size=.25, random_state=10
)

NameError: name 'x_data' is not defined

In [40]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, trained on the PCA
# components of the training split.
model = SVC().fit(x_train, y_train)

# Predictions for the held-out split, compared against the true labels.
cl = model.predict(x_test)
confusion_matrix(y_test, cl)

array([[15,  1],
       [ 1,  8]])