# PFC - ETL and Model Training Notebook - Rafael S. de Almeida

## Download python packages and import libraries

In [1]:
# %matplotlib inline
# %pip install psycopg2-binary

In [2]:
import numpy as np
import requests
import psycopg2
import pandas
import json
from scipy import spatial
from sklearn.linear_model import LogisticRegression
import joblib 

  """)


## Call lightning forecast API

In [3]:
response = requests.get("<INPE-API-URL>")
lightningOccurrences = pandas.read_json(response.text)
# print (lightningOccurrences[['latitude','longitude']])

## Connect to DB and select towers info

In [4]:
PORT=              ""
DATABASE_USER=     ""
DATABASE_PASSWORD= ""
DATABASE_HOST=     ""
DATABASE_NAME=     ""
DATABASE_PORT=     ""

In [5]:
con = psycopg2.connect(host=DATABASE_HOST, database=DATABASE_NAME,user=DATABASE_USER, password=DATABASE_PASSWORD,
                       port=DATABASE_PORT)
cur = con.cursor()
sql = 'SELECT "ds_linha_transmissao", "coord_y", "coord_x" FROM "torres_completas" WHERE "coord_y" IS NOT NULL OR "coord_x" IS NOT NULL'
cur.execute(sql)
results = cur.fetchall()
towers = pandas.DataFrame (results,columns=['linha','latitude','longitude'])

## Define closest tower to each lightning and calculate the distance

In [6]:
tree = spatial.KDTree(towers[['latitude','longitude']])
distances = tree.query(lightningOccurrences[['latitude','longitude']])
distances = pandas.DataFrame(list(distances)).T
distances.columns = ['distance','towerIndex']

## Verify electrical overcurrent

In [7]:
results = []
for index, row in distances.iterrows():
    if row['distance'] <= 0.0005 and abs(int(lightningOccurrences[['corrente']].iloc[index])) >= 32.9:
        results.append(1)
    else:
        results.append(0)
results = pandas.DataFrame(list(results))

## Create training dataset

In [8]:
dataset = pandas.DataFrame({
    'distance': distances['distance'],
    'current' : lightningOccurrences['corrente'],
    'results' : results[0]
})

dataset

Unnamed: 0,distance,current,results
0,0.000382,45,1
1,0.000134,5,0
2,0.000342,45,1
3,0.000466,5,0
4,0.000127,45,1
...,...,...,...
324,0.097551,10,0
325,0.891642,4,0
326,0.273662,-5,0
327,1.862817,6,0


In [9]:
dataset.to_csv('train.csv')

import boto3
bucket = 'pfc-bucket'
region = 'sa-east-1'
s3_session = boto3.Session().resource('s3')
# s3_session.create_bucket(Bucket=bucket, 
#                          CreateBucketConfiguration=
#                          {'LocationConstraint': region})
s3_session.Bucket(bucket).Object('train/train.csv').upload_file('train.csv')

## Train model using Linear Regression

In [10]:
features = dataset.to_numpy()[:,:2]
features[:,0] = features[:,0] * 100000
result = dataset.to_numpy()[:,2]

reg = LogisticRegression().fit(features,result)

In [11]:
reg.score(features,result)

1.0

In [12]:
features

array([[ 3.82184622e+01,  4.50000000e+01],
       [ 1.33635059e+01,  5.00000000e+00],
       [ 3.42165002e+01,  4.50000000e+01],
       [ 4.65904695e+01,  5.00000000e+00],
       [ 1.26856929e+01,  4.50000000e+01],
       [ 3.71400217e+01,  5.00000000e+00],
       [ 4.47275223e+01,  4.50000000e+01],
       [ 3.47116219e+01,  5.00000000e+00],
       [ 2.56187749e+01,  4.50000000e+01],
       [ 0.00000000e+00,  5.00000000e+00],
       [ 0.00000000e+00,  4.50000000e+01],
       [ 0.00000000e+00,  5.00000000e+00],
       [ 4.53929730e+00,  4.50000000e+01],
       [ 0.00000000e+00,  5.00000000e+00],
       [ 0.00000000e+00,  4.50000000e+01],
       [ 0.00000000e+00,  5.00000000e+00],
       [ 0.00000000e+00,  4.50000000e+01],
       [ 0.00000000e+00,  5.00000000e+00],
       [ 0.00000000e+00,  4.50000000e+01],
       [ 0.00000000e+00,  5.00000000e+00],
       [ 0.00000000e+00,  4.50000000e+01],
       [ 0.00000000e+00,  5.00000000e+00],
       [ 0.00000000e+00,  4.50000000e+01],
       [ 0.

In [13]:
reg.coef_

array([[-0.3471059,  0.6101031]])

In [14]:
reg.intercept_

array([-9.31236859])

## Save the model

In [15]:
joblib.dump(reg, 'model.pkl')

['model.pkl']

## Prediction Tests

In [16]:
# Descarga em cima da torre (positivo)

reg.predict(np.array([[0.0, 59]]))

array([1.])

In [17]:
# Descarga a 47.9m de 59kA (positivo)

reg.predict(np.array([[47.9, 59]]))

array([1.])

In [18]:
# Descarga no limiar (50m de distância, 32.9kA de corrente)

reg.predict(np.array([[50, 32.9]]))

# Considerar 0.57066839 como o limite para definir entre resultados positivos e negativos

array([0.])

In [19]:
# Descarga a 60.9m de 32.9kA (negativo)

reg.predict(np.array([[60.9, 32.9]]))

array([0.])

In [20]:
# Descarga a 30m de 30kA (negativo)

reg.predict(np.array([[30, 30]]))

array([0.])

In [21]:
reg.predict(np.array([[375, 32.9]]))

array([0.])

In [22]:
# Descarga a 10000m de 100kA (negativo)

reg.predict(np.array([[10000, 100]]))

# Isso é uma anomalia. DISCUTIR COM O FERNANDO

array([0.])

## Model Deployment

In [28]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role
import sagemaker

sagemaker_session = sagemaker.Session()
role = get_execution_role()

# role = 'AmazonSageMaker-ExecutionRole-20200923T223932'

# Create the SKLearn Object by directing it to the aws_sklearn_main.py script
aws_sklearn = SKLearn(entry_point='train.py',
                      train_instance_type='ml.m4.xlarge',
                      role=role,
                      source_dir='sources',
                      output_path='s3://pfc-bucket/train')


This is not the latest supported version. If you would like to use version 0.23-1, please add framework_version=0.23-1 to your constructor.


In [30]:

# Train the model using by passing the path to the S3 bucket with the training data
aws_sklearn.fit({'train': 's3://pfc-bucket/train'})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-10-07 02:43:19 Starting - Starting the training job...
2020-10-07 02:43:21 Starting - Launching requested ML instances......
2020-10-07 02:44:23 Starting - Preparing the instances for training...
2020-10-07 02:45:17 Downloading - Downloading input data...
2020-10-07 02:45:49 Training - Downloading the training image...
2020-10-07 02:46:20 Uploading - Uploading generated training model[34m2020-10-07 02:46:09,921 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-10-07 02:46:09,924 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-10-07 02:46:09,936 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-10-07 02:46:10,227 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-10-07 02:46:10,227 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-10-07 02:46:10,227 sagemaker-containers 

In [31]:
# Deploy model
aws_sklearn_predictor = aws_sklearn.deploy(instance_type='ml.m4.xlarge', 
                                           initial_instance_count=1)

# Print the endpoint to test in next step
print(aws_sklearn_predictor.endpoint)

# Uncomment and run to terminate the endpoint after you are finished
#predictor.delete_endpoint()

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------------!sagemaker-scikit-learn-2020-10-07-02-43-19-460


In [40]:
aws_sklearn_predictor.predict(np.array([[50, 50]]))

array([1.])