In [1]:
# Shell command run from the notebook. Its recorded output reports that Refit
# is installed and can then be imported with `from refit import Refit`.
!refit_init

Refit Installed, You can now run the following to import REFIT
from refit import Refit


In [1]:
#Example Notebook
from sklearn.datasets import load_iris
from datetime import datetime
from refit import Refit
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from refit.enums.model_format import ModelFormat
from skl2onnx.common.data_types import FloatTensorType, BooleanTensorType
from feature_extractor import FeatureExtractor

In [2]:
# GUID handed to the Refit constructor to select the project.
project_guid = "b6ee5bab-08dd-49b0-98b6-45cd0a28b12f"
# Client object used later to fetch sensor data and to save the trained model.
refit = Refit(project_guid)

# Time window passed as start/end to the sensor data request.
start = datetime(2020, 6, 26)
end = datetime(2020, 6, 30)

# Sensor ids as strings: 5163 followed by 8620 through 8624
# (the upper bound of range() is exclusive).
sensors = [str(sensor_id) for sensor_id in (5163, *range(8620, 8625))]

sensors

['5163', '8620', '8621', '8622', '8623', '8624']

In [3]:
# Fetch sensor readings for the selected sensors between start and end,
# handing a FeatureExtractor instance to the Refit client.
# NOTE(review): the original comment said the training window flag is
# incorporated, but include_flag=False is passed — confirm which is intended.
df = refit.sensor_data(
    start=start, 
    end=end, 
    sensors=sensors, 
    feature_extractor=FeatureExtractor(), 
    include_flag=False
)
# Display the returned frame.
df

Unnamed: 0,sensorid,pressure,timestamp,temperature,wind
0,8620,1128.0,2020-06-28 17:10:22,65.0,21.0
1,8620,1044.0,2020-06-28 17:10:27,58.0,63.0
2,8620,961.0,2020-06-28 17:10:32,66.0,58.0
3,8620,1199.0,2020-06-28 17:10:37,31.0,20.0
4,8620,953.0,2020-06-28 17:10:42,78.0,32.0
...,...,...,...,...,...
95,8624,923.0,2020-06-28 13:23:02,69.0,64.0
96,8624,1022.0,2020-06-28 13:23:07,57.0,66.0
97,8624,994.0,2020-06-28 13:23:12,70.0,30.0
98,8624,1196.0,2020-06-28 13:23:17,74.0,43.0


In [4]:
# Column names that are excluded from the model's feature list.
# NOTE(review): 'sensor_id' does not match the 'sensorid' column shown in the
# recorded output, so 'sensorid' stays in feature_columns — confirm whether the
# sensor id is really meant to be a model feature.
skip_columns = [
    'project_guid',
    'sensor_id',
    'partition_key',
    'timestamp',
    'operable',
    'data',
    'prediction',
]

# Keep every dataframe column that is not in the skip list, in frame order.
feature_columns = [name for name in df.columns.array if name not in skip_columns]
feature_columns

['sensorid', 'pressure', 'temperature', 'wind']

In [5]:
# Cast the three sensor measurements to float.
for measurement in ('temperature', 'pressure', 'wind'):
    df[measurement] = df[measurement].astype(float)

# Label column: 1 when the temperature is below 75.0, otherwise 0.
df['operable'] = (df['temperature'] < 75.0).astype(int)
df

Unnamed: 0,sensorid,pressure,timestamp,temperature,wind,operable
0,8620,1128.0,2020-06-28 17:10:22,65.0,21.0,1
1,8620,1044.0,2020-06-28 17:10:27,58.0,63.0,1
2,8620,961.0,2020-06-28 17:10:32,66.0,58.0,1
3,8620,1199.0,2020-06-28 17:10:37,31.0,20.0,1
4,8620,953.0,2020-06-28 17:10:42,78.0,32.0,0
...,...,...,...,...,...,...
95,8624,923.0,2020-06-28 13:23:02,69.0,64.0,1
96,8624,1022.0,2020-06-28 13:23:07,57.0,66.0,1
97,8624,994.0,2020-06-28 13:23:12,70.0,30.0,1
98,8624,1196.0,2020-06-28 13:23:17,74.0,43.0,1


In [6]:
# Feature matrix: the dataframe columns listed in feature_columns.
x = df[feature_columns]
# Target vector: the 'operable' label column.
y = df['operable']

In [7]:
# Hold out 30% of the rows for testing (70% for training). random_state pins
# the shuffle so that a Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [8]:
# Random forest with 100 trees. random_state fixes the estimator's internal
# randomness so the fitted model is the same on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

RandomForestClassifier()

In [9]:
# Predict labels for the held-out test rows and display them.
y_pred=clf.predict(X_test)
y_pred

array([0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1])

In [10]:
# Report the accuracy score of the predictions against the true test labels.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 1.0


In [11]:
# Declares a single input named 'input': a float tensor whose second dimension
# is the number of feature columns. The first dimension is left as None
# (presumably a variable batch size — confirm against the skl2onnx docs).
initial_types = [('input', FloatTensorType([None,len(feature_columns)]))]

In [15]:
# Save the fitted classifier through the Refit client, together with the
# feature column names, the SK_LEARN model format and the input declaration.
# NOTE(review): how Refit uses initial_types for an SK_LEARN model is not
# visible in this notebook — confirm against the Refit client.
refit.save(clf, feature_columns, ModelFormat.SK_LEARN, initial_types=initial_types)

<Response [200]>