In [1]:
import sys
sys.path.append('./../')

# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

#import src
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

from src.features.build_features import clean
from src.models.train_model import get_data, evaluate


from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import monotonically_increasing_id, countDistinct, approxCountDistinct, when

from pyspark.ml.feature import OneHotEncoder, StringIndexer, Imputer, VectorAssembler, StandardScaler, PCA
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

from collections import defaultdict

import pandas as pd

In [2]:
def imputa_categoricos(df, ignore,data_types):
    strings_used = [var for var in data_types["StringType"] if var not in ignore]

    missing_data_fill = {}
    for var in strings_used:
        missing_data_fill[var] = "missing"

    df = df.fillna(missing_data_fill)
    return df

def ignore_list(df, data_types):
    counts_summary = df.agg(*[countDistinct(c).alias(c) for c in data_types["StringType"]])
    counts_summary = counts_summary.toPandas()

    counts = pd.Series(counts_summary.values.ravel())
    counts.index = counts_summary.columns

    sorted_vars = counts.sort_values(ascending = False)
    ignore = list((sorted_vars[sorted_vars >100]).index)
    return ignore

def get_data_types(df):
    data_types = defaultdict(list)
    for entry in df.schema.fields:
        data_types[str(entry.dataType)].append(entry.name)
    return data_types



In [3]:
def create_pipeline(df, ignore):
    # Esto lo ponemos aqui para poder modificar las 
    #variables de los estimadores/transformadores
    data_types = get_data_types(df)    
    #--------------------------------------
    
    # -------------- STRING --------------
    strings_used = [var for var in data_types["StringType"] if var not in ignore]

    # -------------- DOUBLE --------------
    numericals_double = [var for var in data_types["DoubleType"] if var not in ignore]
    numericals_double_imputed = [var + "_imputed" for var in numericals_double]

    # -------------- INTEGERS --------------
    from pyspark.sql.types import IntegerType, DoubleType
    numericals_int = [var for var in data_types["IntegerType"] if var not in ignore]
    
    for c in numericals_int:
        df = df.withColumn(c, df[c].cast(DoubleType()))
        df = df.withColumn(c, df[c].cast("double"))
        
    numericals_int_imputed = [var + "_imputed" for var in numericals_int]
    # =======================================

    ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    ##            P I P E L I N E
    ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    # ============= ONE HOT ENCODING ================
    from pyspark.ml.feature import OneHotEncoder, StringIndexer
    stage_string = [StringIndexer(inputCol= c, outputCol= c+"_string_encoded") for c in strings_used]
    stage_one_hot = [OneHotEncoder(inputCol= c+"_string_encoded", outputCol= c+ "_one_hot") for c in strings_used]

    # =============== IMPUTADORES ====================
    from pyspark.ml.feature import Imputer
    stage_imputer_double = Imputer(inputCols = numericals_double, 
                                   outputCols = numericals_double_imputed) 
    stage_imputer_int = Imputer(inputCols = numericals_int, 
                                outputCols = numericals_int_imputed) 

    # ============= VECTOR ASESEMBLER ================
    from pyspark.ml.feature import VectorAssembler

    features =  numericals_double_imputed \
              + [var + "_one_hot" for var in strings_used]
    stage_assembler = VectorAssembler(inputCols = features, outputCol= "assem_features")

    # ==================== SCALER =======================
    from pyspark.ml.feature import StandardScaler
    stage_scaler = StandardScaler(inputCol= stage_assembler.getOutputCol(), 
                                  outputCol="scaled_features", withStd=True, withMean=True)

    # ================== PIPELINE ===================
    stages= stage_string + stage_one_hot +  [             # Categorical Data
                               stage_imputer_double,
                               stage_imputer_int,        # Data Imputation
                               stage_assembler,          # Assembling data
                               stage_scaler] 
                          
    ## Tenemos que regesar el df porque las variables int las combierte en double
    return  stages , df

In [4]:
def get_models_params_dic():
    stage_pca = PCA(k = 15,inputCol = "scaled_features",
                        outputCol = "features")


    lr = LogisticRegression()

    lr_paramGrid = ParamGridBuilder() \
    .addGrid(stage_pca.k, [1]) \
    .addGrid(lr.maxIter, [1]) \
    .build()

    dt = DecisionTreeClassifier()

    dt_paramGrid = ParamGridBuilder() \
    .addGrid(stage_pca.k, [1]) \
    .addGrid(dt.maxDepth, [2]) \
    .build()

    paramGrid_dic= {"LR":lr_paramGrid,"DT":dt_paramGrid}
    model_dic = {"LR":lr,"DT":dt}

    return model_dic,paramGrid_dic

In [5]:
def prepare_data(df):
    data_types = get_data_types(df)
    ignore =   ignore_list(df, data_types) 
    illegal = [s for s in df.columns if "del" in s]
    extra_illegal = ['cancelled', 'rangoatrasohoras']
    legal = [var for var in df.columns if (var not in ignore and var not in illegal and var not in extra_illegal)]
    lista_objetivos = df.select('rangoatrasohoras').distinct().rdd.map(lambda r: r[0]).collect()

    df = imputa_categoricos(df, ignore,data_types)
    
    df_legal = df[legal]
    y = df[['rangoatrasohoras']]
    
    df_legal = df_legal.withColumn("id", monotonically_increasing_id())
    y = y.withColumn("id", monotonically_increasing_id())
    
    stages, df_new = create_pipeline(df_legal, ignore)

    df_junto = df_new.join(y, "id", "outer").drop("id")

    return df_junto, stages
    

    
    
def run_model(objetivo, model_name, hyperparams, luigi= False):
    df = get_data(luigi)
    df, first_stages = prepare_data(df)
  
    df = df.withColumn("label",  when(df.rangoatrasohoras == objetivo, 1.0).otherwise(0.0))
    
    # Selecciona el modelo
    model_dic, paramGrid_dic  = get_models_params_dic()
    clr_model = model_dic[model_name]
    
    # Parametros especificos
    num_it = int(hyperparams["iter"])
    if num_it > 0:
        clr_model.setMaxIter(num_it)
    
    # Adds new stages
    num_pca = int(hyperparams["pca"])
    if num_pca > 0:
        stage_pca = PCA(k = num_pca,inputCol = "scaled_features", 
                            outputCol = "features")
    else:
        stage_pca = PCA(k = 8,inputCol = "scaled_features", 
                    outputCol = "features")
    
    # Checar que no se haya corrido este modelo 
    
    print("Modelo evaluado: ", clr_model, "con params: ", clr_model.explainParams())
    
    # Creates Pipeline
    pipeline = Pipeline(stages= first_stages + [stage_pca, clr_model])

    df_train, df_test = df.randomSplit([0.8,0.2], 123)

    cvModel  = pipeline.fit(df_train)
    prediction = cvModel.transform(df_test)
    evaluate(prediction)
    
    #Sacar metadatos
    #Insertar metadatos
    #Guardar modelos
    

In [39]:
objetivo = "cancelled"
model = "LR"
hyperparams = {"pca": 1, "iter":1}

run_model(objetivo, model, hyperparams)

/home/jovyan/work/src/models
Modelo evaluado:  LogisticRegression_da34f1229d49 con params:  aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on interc

In [6]:
objetivo = "cancelled"
model_name = "LR"
hyperparams = {"pca": 1, "iter":1}

df = get_data(False)
df, first_stages = prepare_data(df)

df = df.withColumn("label",  when(df.rangoatrasohoras == objetivo, 1.0).otherwise(0.0))

# Selecciona el modelo
model_dic, paramGrid_dic  = get_models_params_dic()
clr_model = model_dic[model_name]

# Parametros especificos
num_it = int(hyperparams["iter"])
if num_it > 0:
    clr_model.setMaxIter(num_it)

# Adds new stages
num_pca = int(hyperparams["pca"])
if num_pca > 0:
    stage_pca = PCA(k = num_pca,inputCol = "scaled_features", 
                        outputCol = "features")
else:
    stage_pca = PCA(k = 8,inputCol = "scaled_features", 
                outputCol = "features")

# Checar que no se haya corrido este modelo 

print("Modelo evaluado: ", clr_model, "con params: ", clr_model.explainParams())

# Creates Pipeline
pipeline = Pipeline(stages= first_stages + [stage_pca, clr_model])

df_train, df_test = df.randomSplit([0.8,0.2], 123)

cvModel  = pipeline.fit(df_train)
prediction = cvModel.transform(df_test)
evaluate(prediction)

/home/jovyan/work/src/models
Modelo evaluado:  LogisticRegression_f943fad70126 con params:  aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on interc

{'AUROC': '0.623870',
 'AUPR': '0.017011',
 'precision': '0.9858980702622464',
 'recall': '0.9858980702622464',
 'F1 Measure': '0.9858980702622464',
 0.0: {'precision': '0.9858980702622464',
  'recall': '1.0',
  'F1 Measure': '0.9886865479085'},
 1.0: {'precision': '0.0', 'recall': '0.0', 'F1 Measure': '0.0'}}

## Save model

In [57]:
from src.models.save_model import save_upload



Nueva cubeta


In [59]:
save_upload(cvModel, objetivo, model_name, hyperparams,bucket_name = "models-dpa")

22042020_cancelled_LR_=#pca#-%1$%#iter#-%1&.model.zip file is created successfully!


In [62]:
new_saved_model = "./22042020_cancelled_LR_=#pca#-%1$%#iter#-%1&.model.zip"
folder = new_saved_model[:-4]
folder

'./22042020_cancelled_LR_=#pca#-%1$%#iter#-%1&.model'

In [63]:
import shutil
new_saved_model = "./22042020_cancelled_LR_=#pca#-%1$%#iter#-%1&.model.zip"
folder = new_saved_model[:-4]
shutil.rmtree(folder, ignore_errors=True)
os.remove(new_saved_model) 

In [64]:
os.remove(new_saved_model) 

## Experimentos

In [19]:
!pip install boto3
from src.utils.s3_utils import describe_s3, get_s3_objects

Collecting boto3
  Downloading boto3-1.12.43-py2.py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 701 kB/s eta 0:00:01
[?25hCollecting s3transfer<0.4.0,>=0.3.0
  Downloading s3transfer-0.3.3-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 1.0 MB/s eta 0:00:01
[?25hCollecting botocore<1.16.0,>=1.15.43
  Downloading botocore-1.15.43-py2.py3-none-any.whl (6.1 MB)
[K     |████████████████████████████████| 6.1 MB 74 kB/s  eta 0:00:01
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.9.5-py2.py3-none-any.whl (24 kB)
Collecting docutils<0.16,>=0.10
  Downloading docutils-0.15.2-py3-none-any.whl (547 kB)
[K     |████████████████████████████████| 547 kB 47 kB/s  eta 0:00:01
Installing collected packages: docutils, jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.12.43 botocore-1.15.43 docutils-0.15.2 jmespath-0.9.5 s3transfer-0.3.3
Nueva cubeta
models-dpa
models-dpa


In [53]:
trained_model = cvModel.stages[-1]

In [54]:
type(trained_model)

pyspark.ml.classification.LogisticRegressionModel

In [12]:
import json 

In [32]:
import json 
from datetime import date

def parse_filename(objetivo, model_name, hyperparams):    
    para_string = json.dumps(hyperparams) 
    para_string = para_string.replace(" ", "%")
    para_string = para_string.replace('"', "#")
    para_string = para_string.replace('}', "&")
    para_string = para_string.replace('{', "=")
    para_string = para_string.replace(':', "-")
    para_string = para_string.replace(',', "$")
    
    today = date.today()
    d1 = today.strftime("%d%m%Y")

    saved_model_name = "./" + d1 + "_" + objetivo + "_" + model_name + "_" + para_string 

    return saved_model_name


def save_upload(cvModel, objetivo, model_name, hyperparams,bucket_name = "models-dpa"):
    trained_model = cvModel.stages[-1]
    
    saved_model_name = parse_filename(objetivo, model_name, hyperparams) + ".model"
    key_name = saved_model_name[2:]
    
    # Save model
    trained_model.save(saved_model_name)

    # Zip model 
    zip_model(key_name)
    
    new_saved_model = saved_model_name +".zip"
    new_key_name = new_saved_model[2:] 

    # Upload file
    upload_file_to_bucket(new_saved_model, bucket_name, new_key_name)
        

In [33]:
save_and_zip(cvModel, objetivo, model_name, hyperparams)

In [34]:
saved_model_name = parse_filename(objetivo, model_name, hyperparams)
saved_model_name

'./22042020_cancelled_LR_=#pca#-%1$%#iter#-%1&.model'

## Upload to S3

In [21]:
import boto3
from src import (
    BUCKET,
    MY_REGION,
    MY_REGION2,
    MY_PROFILE,
    MY_KEY,
    MY_AMI ,
    MY_VPC ,
    MY_GATEWAY,
    MY_SUBNET,
    MY_GROUP
)


ses = boto3.session.Session(profile_name=MY_PROFILE, region_name=MY_REGION,)
s3 = ses.resource('s3')
bucket_name = BUCKET

In [54]:
bucket_name = "models-dpa"
#my_bucket = s3_resource.Bucket(bucket_name)

model_dir = parse_filename(objetivo, model_name, hyperparams)
key_name = saved_model_name[2:]

zip_model(key_name)

22042020_cancelled_LR_=#pca#-%1$%#iter#-%1&.model.zip file is created successfully!


In [53]:
import os
import zipfile
 
#Declare the function to return all file paths of the particular directory
def retrieve_file_paths(dirName):
    # setup file paths variable
    filePaths = []
    # Read all directory, subdirectories and file lists
    for root, directories, files in os.walk(dirName):
        for filename in files:
            # Create the full filepath by using os module.
            filePath = os.path.join(root, filename)
            filePaths.append(filePath)
    # return all paths
    return filePaths
  
def zip_model(dir_name):
    #https://linuxhint.com/python_zip_file_directory/
    # Call the function to retrieve all files and folders of the assigned directory
    filePaths = retrieve_file_paths(dir_name)
    # writing files to a zipfile
    zip_file = zipfile.ZipFile(dir_name+'.zip', 'w')
    with zip_file:
        # writing each file one by one
        for file in filePaths:
            zip_file.write(file)
    print(dir_name+'.zip file is created successfully!')

In [55]:
bucket_name = "models-dpa"
#my_bucket = s3_resource.Bucket(bucket_name)

model_dir = parse_filename(objetivo, model_name, hyperparams) +".zip"
key_name = saved_model_name[2:] 

ses = boto3.session.Session(profile_name=MY_PROFILE, region_name=MY_REGION,)
s3 = ses.resource('s3')

# Escribimos el archivo al bucket, usando el binario
s3.meta.client.upload_file(model_dir, bucket_name, key_name)

In [113]:
trained_model.save(saved_model_name)

In [114]:
from pyspark.ml.classification import RandomForestClassificationModel, LogisticRegressionModel
import os 

prueba = LogisticRegressionModel.load(saved_model_name)

In [115]:
prueba

LogisticRegressionModel: uid = LogisticRegression_2e27ecb108eb, numClasses = 2, numFeatures = 1

In [80]:
CURRENT_DIR 

'/home/jovyan/work/notebooks'

In [7]:
from datetime import date

In [9]:
# dd/mm/YY
today = date.today()
d1 = today.strftime("%d%m%Y")
print("d1 =", d1)

d1 = 22042020
