El dataset "Covertype" (https://archive.ics.uci.edu/ml/datasets/covertype) está preprocesado para redes neuronales y tiene las variables categóricas codificadas con one-hot encoding. Vamos a deshacer ese preprocesado y a guardar el dataset en disco, de manera que se pueda usar tanto desde Python como desde R.

In [1]:
%matplotlib inline
import csv
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import pandas as pd
import feather

In [2]:
# Path of the raw Covertype data file that is read with pandas further down.
file_name = "covtype.data"

In [3]:
# NOTE(review): total_inst is not referenced by any other cell visible in
# this notebook — confirm whether it is still needed.
total_inst = 50000

In [4]:
# Column layout of the raw file: n_quant quantitative columns, followed by two
# one-hot encoded blocks of n_p1 and n_p2 columns, followed by one target
# column.
n_quant = 10
n_p1 = 4
n_p2 = 40

# Starting position of each group, derived from the block sizes so that the
# index lists below stay consistent if any size above is changed.
p1_start = n_quant
p2_start = p1_start + n_p1
target_start = p2_start + n_p2

# Positional column indices of each group (used with DataFrame.iloc).
quant_ind = list(range(n_quant))
p1_ind = list(range(p1_start, p2_start))
p2_ind = list(range(p2_start, target_start))
target_ind = [target_start]


In [5]:
# The raw UCI file has no header row, so header=None is required: with the
# default header inference pandas would consume the first instance as column
# names and silently drop it from the data. Later cells select columns by
# position, so the default integer column labels are sufficient.
a = pd.read_csv(file_name, header=None)

In [6]:
# Split the raw frame column-wise into its four groups: quantitative columns,
# the two one-hot blocks, and the target column.
good_array, p1_array, p2_array, target_array = (
    a.iloc[:, positions]
    for positions in (quant_ind, p1_ind, p2_ind, target_ind)
)

In [7]:
# One independent encoder per one-hot block; they are fitted in a later cell.
enc1, enc2 = (OneHotEncoder(categories='auto') for _ in range(2))

In [8]:
# Column vectors holding the integer codes 0..n-1 of each categorical
# variable, shaped (n, 1).
class_p1 = np.arange(n_p1)[:, np.newaxis]
class_p2 = np.arange(n_p2)[:, np.newaxis]

In [9]:
# Fit each encoder on the column vector 0..n-1 so that its learned categories
# are exactly the integer codes; inverse_transform can then map a one-hot row
# back to the index of its active column.
enc1.fit(class_p1)
enc2.fit(class_p2)

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [10]:
# Undo the one-hot encoding: each block of indicator columns collapses into a
# single column of integer category codes.
p1_cat, p2_cat = (
    encoder.inverse_transform(one_hot).astype(int)
    for encoder, one_hot in ((enc1, p1_array), (enc2, p2_array))
)

In [11]:
# Subtract 1 from every label in the target column (presumably turning
# 1-based class ids into 0-based ones — confirm against the dataset docs).
target = target_array - 1

In [12]:
# Put the quantitative columns, the two decoded categorical columns and the
# shifted target side by side in one 2-D array.
data = np.concatenate([good_array, p1_cat, p2_cat, target], axis=1)

In [13]:
# Wrap the stacked array in a DataFrame; column names are assigned in a later
# cell.
df = pd.DataFrame(data)

In [14]:
# Names given to the ten leading (quantitative) columns.
quantitative_cols = [
    "elevation",
    "aspect",
    "slope",
    "hd_to_hydrology",
    "vd_to_hydrology",
    "hd_to_roadways",
    "hillshade_9am",
    "hillshade_noon",
    "hillshade_3pm",
    "hd_to_firepoint",
]

# The two decoded categorical columns and the class label come last.
cols = quantitative_cols + ["wilderness", "soil", "target"]

In [15]:
# Replace the default column labels with the names listed in cols.
df.columns = cols

In [16]:
# Mark the decoded categorical variables and the class label as pandas
# categoricals.
for column in ("soil", "wilderness", "target"):
    df[column] = df[column].astype('category')

In [17]:
def subset(df, n_inst):
    """Return a shuffled sample with at most ``n_inst`` rows per target class.

    For every distinct value in ``df.target`` the positions of the matching
    rows are shuffled and the first ``n_inst`` of them are kept. The kept
    positions of all classes are then concatenated, shuffled once more and
    used to select rows from ``df``.

    Args:
        df: DataFrame with a ``target`` column holding the class labels.
        n_inst: Maximum number of rows to keep for each class.

    Returns:
        A DataFrame with the selected rows of ``df`` in random order. An
        empty frame with the same columns is returned when ``df`` has no
        rows.

    Note:
        Randomness comes from the global ``np.random`` state; call
        ``np.random.seed`` beforehand for reproducible output. A warning is
        printed for every class that has fewer than ``n_inst`` rows.
    """
    labels = df.target
    per_class = []
    for c in np.unique(labels):
        # Positional (not label-based) indices of the rows of class c.
        ind = np.flatnonzero(labels == c)
        np.random.shuffle(ind)
        ind = ind[:n_inst]
        if len(ind) < n_inst:
            print("warning: not so many instances of class", c, "just", len(ind))
        per_class.append(ind)

    if not per_class:
        # np.concatenate raises ValueError on an empty sequence, so a frame
        # without any class is answered with an empty selection instead.
        return df.iloc[:0]

    h = np.concatenate(per_class)
    # Mix the classes so the result is not grouped by label.
    np.random.shuffle(h)
    return df.iloc[h]

In [18]:
# Output paths for the full cleaned dataset and for the per-class subset.
# NOTE(review): "4900" presumably reflects 7 classes x 700 rows per class —
# confirm and keep in sync with the subset size used below.
csv_full_name = "full_covertype_cleaned.csv"
csv_partial_name = "4900_covertype_cleaned.csv"

In [19]:
# Shuffle all rows: sample(frac=1) returns every row in random order. No
# random_state is passed, so the order differs between runs.
df = df.sample(frac=1)

In [20]:
# Draw at most 700 rows per target class; subset prints a warning for any
# class that has fewer rows than that.
subs = subset(df, 700)

In [21]:
# Write both frames as CSV without the index column: first the full dataset,
# then the per-class subset.
for frame, path in ((df, csv_full_name), (subs, csv_partial_name)):
    frame.to_csv(path, index=False)