# Librerías básicas #

In [None]:
import pandas as pd
import numpy as np

## One Hot encoder ##
(artículo:
https://towardsdatascience.com/what-is-one-hot-encoding-and-how-to-use-pandas-get-dummies-function-922eb9bd4970)

Cuándo usar one-hot o LabelEncoder:
https://datascience.stackexchange.com/questions/9443/when-to-use-one-hot-encoding-vs-labelencoder-vs-dictvectorizor


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode `y`, then convert the encoder's output to a dense array.
onehot_enc = OneHotEncoder()
y_sparse = onehot_enc.fit_transform(y)
y_onehot = y_sparse.toarray()

#### Otra forma más sencilla: ####

In [None]:
# One indicator column per distinct FUELTYPE value, each named with the 'FUELTYPE' prefix.
fuel_type = pd.get_dummies(df['FUELTYPE'], prefix='FUELTYPE')

**Luego:**

In [None]:
# Append the indicator columns to the right of the original frame.
df = pd.concat([df, fuel_type], axis='columns')

### Con etiquetas: ###
**Otra forma (asigna número a cada categoría):**


In [None]:
def encode_label(df):
    """Return the integer category codes of ``df``.

    The input is cast to the pandas ``category`` dtype and the code of each
    element's category is returned.
    """
    as_category = df.astype('category')
    return as_category.cat.codes

**Luego:**

In [None]:
# Replace the 'education' column with the codes returned by encode_label.
df['education'] = encode_label(df['education'])

**Otra forma (si necesitamos recodificar después):**

In [None]:
from sklearn import preprocessing

# Keep the fitted encoder around so the integer codes can be decoded later.
encoder = preprocessing.LabelEncoder()
workclass_values = df['workclass']
encoded = encoder.fit(workclass_values).transform(workclass_values)
df['workclass'] = encoded.astype('int')

- Podemos decodificar con:

In [None]:
# Map the integer codes back to the original labels.
encoder.inverse_transform(df['workclass'])

## Máximo número de columnas al imprimir un dataframe ##

In [None]:
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

**Ancho máximo de columna:**

In [None]:
# Allow up to 100 characters per cell before the display truncates it.
pd.set_option('display.max_colwidth', 100)

## Unir dos dataframes con mismo número de observaciones / características ##

In [None]:
# Join the columns of otherDF onto df (DataFrame.join matches rows by index).
df = df.join(other=otherDF)

## Eliminar características / observaciones ##

In [None]:
# Remove the MAKE and MODEL columns from df in place.
df.drop(columns=['MAKE', 'MODEL'], inplace=True)

## Eliminar características / observaciones basadas en condición ##

In [None]:
# Collect the index labels of the rows whose 'myFeature' is a single space, then drop them in place.
blank_rows = df.index[df['myFeature'] == ' ']
df.drop(index=blank_rows, inplace=True)

## Filtrar filas por condición ##

In [None]:
# Keep only the rows with at least 40 hours per week.
df = df.loc[df['hours-per-week'] >= 40]

## Eliminar la primera fila ##

In [None]:
# Skip positional row 0 and keep every column.
df = df.iloc[1:, :]

## Averiguar el número de nan en cada columna de un Dataframe ##

In [None]:
# Number of missing values in each column (isnull and isna are aliases).
df.isnull().sum(axis=0)
# or
df.isna().sum(axis=0)

## Obtener las filas con valor Nan en una columna ##

In [None]:
# Rows whose 'column name' value is missing.
df.loc[df['column name'].isna()]
# or
df.loc[df['column name'].isnull()]

**En cualquier columna:**

In [None]:
# Rows that have a missing value in at least one column.
df[df.isna().any(axis='columns')]
# or
df[df.isnull().any(axis='columns')]

## Dividir dataframe en train y test ##

In [None]:
# Run this after the dataframe has been split into X and y.
# One uniform random number is drawn per row; rows whose draw is below 0.8 go to
# the training set and the remaining rows go to the test set.
msk = np.random.rand(len(df)) < 0.8
X_train, X_test = X[msk], X[~msk]
y_train, y_test = y[msk], y[~msk]

**Otra forma:**

In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2) ## hold out 20% for testing; add random_state=5 for a reproducible split
# Afterwards, split train and test into X and y

**Otra forma:**

In [None]:
# Draw 80% of the rows for training (random_state is the seed) ...
train = df.sample(frac=0.8, random_state=200)
# ... and use every row that was not drawn as the test set.
test = df.drop(index=train.index)
# Afterwards, split train and test into X and y.

## Barajar (Shuffle) un dataframe ##

In [None]:
# Sampling the full fraction of rows without replacement returns them in random order.
df.sample(frac=1.0)

**Para resetear los índices:**

In [None]:
# Shuffle the rows, then discard the old index and renumber from 0.
shuffled = df.sample(frac=1)
df = shuffled.reset_index(drop=True)

## Matriz de Confusión ##

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns are the predicted classes.
confusion = confusion_matrix(y_test, y_pred)
# With labels (optional). NOTE(review): confusion_matrix orders classes by sorted
# label value, so confirm y_test.unique() yields the same order before relying on it.
# pd_conf = pd.DataFrame(confusion,
#                        columns='pred ' + y_test.unique(),
#                        index='true ' + y_test.unique())
# pd_conf

**Plot:**

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments.
ConfusionMatrixDisplay.from_estimator(drugTree, X_test, y_test)

**Otra forma de Plot:**

In [None]:
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap and label both axes.
class_names = ['apples', 'oranges']
ax = sns.heatmap(confusion, annot=True, cmap='YlGnBu')
ax.set_title('Seaborn Confusion Matrix with labels!!')
ax.set_xlabel('Predicted class')
ax.set_ylabel('Real Class')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

## Matriz de correlación ##

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation of the dataframe's columns, drawn as an annotated heatmap.
cor = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

## Semilla para random en numpy: ##

In [None]:
# Seed NumPy's global random generator so later draws are reproducible.
np.random.seed(seed=42)

In [None]:
# Alternatively, keep the seed in a variable so it can be reused elsewhere.
seed = 42
np.random.seed(seed=seed)

## Semilla para random en Tensorflow: ##

In [None]:
# NOTE(review): `tf` is not imported in the visible part of this notebook;
# run `import tensorflow as tf` before this cell.
tf.random.set_seed(42)

In [None]:
# Alternatively, keep the seed in a variable so it can be reused elsewhere.
# NOTE(review): `tf` is not imported in the visible part of this notebook.
seed = 42
tf.random.set_seed(seed)

## Conversión de categoría a números: ##

In [None]:
df["PoolQC"].unique()  # inspect which values the column takes
# Map every category (and missing values) to a number; a mapping must be a dict, not a list.
poolQCValues = {"Excelent": 3, "Good": 2, "Bad": 1, np.nan: 0}
df["PoolQC"] = df["PoolQC"].map(poolQCValues)

**Otra forma:**

In [None]:
# Assign the result back to the column: calling replace(..., inplace=True) on a
# selected column is chained assignment and may leave df itself unchanged.
df['sex'] = df['sex'].replace({'Male': 1, 'Female': 0})

**Otra forma:**

In [None]:
# Cast to the 'category' dtype and keep only the integer code of each category.
df["PoolQC"] = df["PoolQC"].astype('category').cat.codes

**Otra forma:**

In [None]:
from sklearn import preprocessing

# Fit the encoder on the two known labels, then overwrite column 1 of X with its codes.
le_sex = preprocessing.LabelEncoder().fit(['F', 'M'])
X[:, 1] = le_sex.transform(X[:, 1])

## Número de Na en una columna: ##

In [None]:
# Count the missing values in the PoolQC column.
print("Número de Na en la columna", df["PoolQC"].isna().sum())

## Accuracy ##

In [None]:
from sklearn import metrics

# Accuracy of the predictions measured against the true test labels.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

# Models #

## Linear regression ##

In [None]:
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator with default settings.
model = LinearRegression()

## Stochastic Gradient Descent (Linear Regression) ##

In [None]:
from sklearn.linear_model import SGDRegressor
# Unfitted SGD-based linear regressor with default settings.
lin_reg_SGD = SGDRegressor()

## Con minibatch ##

In [None]:
from sklearn.linear_model import SGDRegressor

lin_reg_SGD = SGDRegressor()
# lin_reg = LinearRegression()
# range(1, 1000) performs 999 incremental updates, one mini-batch each.
# NOTE(review): _get_batch is not defined in the visible notebook; it is presumably
# a helper returning a random (X, y) batch of `batch_size` rows -- confirm.
for i in range(1, 1000):
    XX, YY = _get_batch(X_poly, y, batch_size=10)
    lin_reg_SGD.partial_fit(XX, YY.ravel())

## Logistic regression ##

In [None]:
from sklearn.linear_model import LogisticRegression
# Unfitted logistic regression classifier with default settings.
logreg = LogisticRegression()

## KNN ##


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier  # was missing: the class is used below

# k-nearest-neighbours classifier that votes among the 4 closest samples.
knnclassifier = KNeighborsClassifier(n_neighbors=4)

# Cross Validation #

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier  # was missing: the class is used below

knnclassifier = KNeighborsClassifier(n_neighbors=4)
# cv is the number of folds; print the mean accuracy over the 10 folds.
print(cross_val_score(knnclassifier, x, y, cv=10, scoring='accuracy').mean())

## Example to compare 2 models: ##

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier  # was missing: the class is used below
from sklearn.linear_model import LogisticRegression

# Mean 10-fold accuracy of a 4-nearest-neighbours classifier ...
knnclassifier = KNeighborsClassifier(n_neighbors=4)
print(cross_val_score(knnclassifier, x, y, cv=10, scoring='accuracy').mean())

# ... compared with the same metric for logistic regression.
logreg = LogisticRegression()
print(cross_val_score(logreg, x, y, cv=10, scoring='accuracy').mean())
# A number (0.6499937460913072) was fused onto the line above in the original;
# presumably pasted cell output -- kept here only as a reference value.

## Guardar / Cargar Modelo ##

1. Guardar Modelo

In [None]:
# Guardar modelo
import pandas as pd
def save_model(model, filename):
    """Save ``model`` in two formats, zip the first one and export its history.

    Writes, in this order:
      * ``filename``               -- via ``model.save(filename)``
      * ``filename + '.h5'``       -- via ``model.save(filename + '.h5')``
      * ``filename + '.zip'``      -- archive of ``filename`` (entries are stored
                                      relative to its parent directory)
      * ``filename + '.history.csv'`` -- ``model.history.history`` as a CSV table

    Args:
        model: object exposing ``save(path)`` and ``history.history``.
        filename: base path (without extension) for every output.
    """
    import os
    import shutil

    model.save(filename)
    model.save(filename + '.h5')

    # Pure-Python replacement for `!zip -r {filename}.zip {filename}`: it also works
    # outside IPython and with paths that contain spaces or shell metacharacters.
    target = os.path.abspath(os.path.normpath(filename))
    shutil.make_archive(target, 'zip',
                        root_dir=os.path.dirname(target),
                        base_dir=os.path.basename(target))

    history_df = pd.DataFrame(model.history.history)
    history_df.to_csv(filename + '.history.csv')


2. Cargar Modelo

In [None]:
# Cargar modelo
import pandas as pd
def load_model(filename):
    """Load a model (and its training history, if present) saved by ``save_model``.

    ``filename`` may be an ``.h5`` file, a ``.zip`` archive, or a base name whose
    ``.zip`` archive exists. Archives are extracted into the current working
    directory and the model is then loaded from the extracted entry named after
    the archive's base name.

    Expects ``tf`` (TensorFlow) and ``pd`` (pandas) to be available in the
    enclosing scope.

    Returns:
        ``(model, history)`` where ``history`` is the DataFrame read from
        ``<model name>.history.csv`` in the current directory, or ``None`` when
        that file cannot be read.
    """
    import os  # was used without being imported
    import shutil

    modelName = os.path.basename(filename)
    if modelName.endswith('.zip'):
        modelName = modelName[:-4]
    elif modelName.endswith('.h5'):
        modelName = modelName[:-3]

    if filename.endswith('.h5'):
        model = tf.keras.models.load_model(filename)
    else:
        archive = filename if filename.endswith('.zip') else filename + '.zip'
        # Pure-Python replacement for `!unzip`: extracts into the current directory.
        shutil.unpack_archive(archive, format='zip')
        model = tf.keras.models.load_model(modelName)

    try:
        history = pd.read_csv(modelName + '.history.csv')
    except Exception:
        # The history is optional; unlike a bare `except:`, this does not swallow
        # KeyboardInterrupt or SystemExit.
        history = None
    return model, history

## Guardar (salvar) un modelo en tensorflow ##

https://www.tensorflow.org/guide/keras/save_and_serialize

Hay 2 formatos (tensorflow y h5). Tensorflow genera un directorio con ficheros, pero guarda más información.

1. En formato h5

In [None]:
# The .h5 extension selects the single-file HDF5 format.
model.save('name.h5')

2. En formato tensorflow

In [None]:
# Without an extension the model is written as a directory of files.
model.save('name_directory')

Y se finaliza comprimiendo los archivos con:

In [None]:
# Shell command: recursively compress name_directory into name.zip.
!zip -r name.zip name_directory

## Cargar un modelo tensorflow ##

1. En formato h5

In [None]:
# NOTE(review): `tensorflow` is not imported in the visible part of this notebook;
# run `import tensorflow` before this cell.
tensorflow.keras.models.load_model('name.h5')

2. En formato tensorflow

In [None]:
!unzip name.zip
tensorflow.keras.models.load_model(‘name_directory’)

## Z-score ##

In [None]:
from scipy import stats  # `From` (capitalised) is a SyntaxError

# z-score of every element: (value - mean) / standard deviation of the array.
data = np.array([6, 7, 7, 12, 13, 13, 15, 16, 19, 22])
stats.zscore(data)

**En estadística, un z-score nos dice a cuántas desviaciones estándar se encuentra un valor respecto de la media. Usamos la siguiente fórmula para calcular un z-score:**

(In statistics, a z-score tells us how many standard deviations away a value is from the mean. We use the following formula to calculate a z-score:)

`z = (X - μ) / σ`

## Seaborn pair plot ##

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Grid of pairwise plots for the columns of df; height is the size of each panel.
sns.pairplot(df, height=2.5)
plt.show()

## Escalado / normalización ##

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() ## MinMaxScaler para valores entre 0 y 1
X_std = scaler.fit_transform(X_train)

**scaler.fit()** calcula la media y la varianza (para después aplicar esa misma media y varianza a los conjuntos de entrenamiento y de validación).

**transform()** aplica la normalización con esa media y varianza

**fit_transform()** hace las dos cosas a la vez.

**IMPORTANTE:** Hacer **scaler.fit_transform()** sobre el train set.

Y **transform()** sobre el test set, en otro caso **el modelo “aprende” cómo debe ser la distribución del test set.**

## Métrica en modelos de clasificación ##

In [None]:
from sklearn.metrics import classification_report

# Build the per-class precision / recall / f1 text report, then print it.
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

#### EJEMPLO ####

|  | precision| recall | f1-score | support|
| -- | -- | -- |  -- |  -- |
| Ideal| 0.58 | 0.39 |0.46 |57|
|Premium | 0.49| 0.60 |0.54 |45|
| Good| 0.68 | 0.73|0.70 |74 |
| Fair | 0.61 | 0.64 |0.62 |74|
|  |  |  |   |  |
|accuracy| |  |0.60 |250|
| macro avg| 0.59 | 0.59 |0.58 |250 |
| weighted avg| 0.60 | 0.60 |0.59|250 |


## Dibujar gráfico de un árbol de decisión ##

In [None]:
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline
dot_data = StringIO()
filename = "drugtree.png"
featureNames = my_data.columns[0:5]
targetNames = my_data["Drug"].unique().tolist()
out=tree.export_graphviz(drugTree,feature_names=featureNames,
                         out_file=dot_data,
                         class_names= np.unique(y_trainset),
                         filled=True,
                         special_characters=True,
                         rotate=False)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

## To use autocompletion in Kaggle ##

In [None]:
# IPython setting: turn off the jedi-based completer.
%config Completer.use_jedi = False

## Para saber si una variable sigue una distribución normal (gaussiana) ##

In [None]:
from scipy import stats

# Print the kurtosis and skewness of the DTI column.
dti = df['DTI']
print('Kurtosis:', stats.kurtosis(dti))
print('Skewness:', stats.skew(dti))

**Un valor de curtosis y/o coeficiente de asimetría entre -1 y 1 se considera generalmente una ligera desviación de la normalidad (Bulmer, 1979), (Brown, n.d.).**

**Entre -2 y 2 hay una evidente desviación de la normal pero no extrema.**

## ¿Cuántos valores tenemos de cada clase? ##
**(Contar el número de valores diferentes)**

In [None]:
# Number of occurrences of each distinct value in the Ticket column.
df['Ticket'].value_counts()

## Plotear los vectores de soporte en SVM (SVC): ##

In [None]:
# Scatter the first two columns of X coloured by Y, then overlay the contours of
# the classifier's decision function and circle its support vectors.
ax = plt.gca()
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=Y, s=50, cmap='autumn')
# Read the axis limits after the scatter so the grid covers the plotted data.
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# 30 x 30 grid of points spanning the current axes.
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
# One (x, y) row per grid point, evaluated by the classifier and reshaped back to the grid.
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = clf.decision_function(xy).reshape(XX.shape)
# Contours at -1, 0 and 1 of the decision function (dashed, solid, dashed).
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])
# Hollow black circles around the support vectors.
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
           linewidth=1, facecolors='none', edgecolors='k')
plt.show()

## Change dataframe to numeric type ##

In [None]:
# Convert every column to a numeric dtype by applying pd.to_numeric to each one,
# then show the resulting dtypes.
df = df.apply(pd.to_numeric)
df.dtypes

## Concatenar dos datasets ##

In [None]:
# Place df2's columns to the right of df1's columns.
df = pd.concat([df1, df2], axis='columns')

## PCA ##

In [None]:
from sklearn.decomposition import PCA
# Fit a 2-component PCA on X_std and project X_std onto those components.
pca = PCA(n_components = 2)
X_pca = pca.fit_transform(X_std)

## TSNE ##

In [None]:
from sklearn.manifold import TSNE
# Project the data: this step will take several seconds
# 2-D embedding with random initialisation; random_state fixes the seed.
tsne = TSNE(n_components=2, init='random', random_state=42)
X_tsne = tsne.fit_transform(X_std)

# GOOGLE COLAB  #

## Liberar recursos al terminar un ejercicio: ##

In [None]:
import os, signal
# Send SIGKILL to the current process (the one running this cell).
os.kill(os.getpid(), signal.SIGKILL)

## Subir ficheros: ##

In [None]:
from google.colab import files
# Keep the return value: the example that follows iterates over uploaded.keys().
uploaded = files.upload()

**Ejemplo:**

In [None]:
# NOTE(review): `image` and `model` are not defined in the visible notebook;
# `image` is presumably the Keras image-preprocessing module -- confirm.
for fn in uploaded.keys():
    # predicting images
    path = '/content/' + fn
    img = image.load_img(path, target_size=(300, 300))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)  # add a leading batch axis
    images = np.vstack([x])
    classes = model.predict(images, batch_size=10)
    print(classes[0])
    # A score above 0.5 is reported as "human", otherwise as "horse".
    if classes[0] > 0.5:
        print(fn + " is a human")
    else:
        print(fn + " is a horse")

## Datasets interactivos ##

In [None]:
from google.colab import data_table
# Enable Colab's dataframe formatter, then display df as the cell output.
data_table.enable_dataframe_formatter()
df

## Montar google drive ##

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive.
drive.mount('/content/gdrive')

## Subir fichero ##

In [None]:
from google.colab import files
# Upload files from the local machine; the return value is not kept here.
files.upload()

## Descargar fichero ##

In [None]:
from google.colab import files
# Download the file at the given path to the local machine.
files.download('path/to/your/file')