In [12]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.metrics import get_scorer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import compute_sample_weight

from helper_func import train_pre_data, DFToDictTransformer


In [2]:
# experiment_metadata = dict(
#     prediction_column='y_track1',
#     holdout_size=0.1,
#     scoring='roc_auc',
#     random_state=33,
#     include_only_estimators=['RandomForestClassifierEstimator', 'DecisionTreeClassifierEstimator', 'LogisticRegressionEstimator', 'ExtraTreesClassifierEstimator', 'XGBClassifierEstimator', 'LGBMClassifierEstimator', 'SnapDecisionTreeClassifierEstimator', 'SnapRandomForestClassifierEstimator', 'SnapLogisticRegressionEstimator', 'SnapSVMClassifierEstimator', 'GradientBoostingClassifierEstimator'],
#     train_sample_columns_index_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
#     positive_label='1.0',
# )

# Tunable inputs that are passed to `train_pre_data` when the training data is prepared.
# The trailing `#@param` annotations look like notebook form-field metadata — keep them intact.
# RSSI_value_selection: which RSSI signal variant to use (allowed choices are listed in the annotation).
RSSI_value_selection = "Average" #@param ["RSSI_Left","RSSI_Right","Min","Max","Average"]
# window_size / window_stride: windowing parameters for feature extraction.
# NOTE(review): equal size and stride presumably means non-overlapping windows — confirm in helper_func.
window_size = 360 #@param {type:"integer"}
window_stride = 360 #@param {type:"integer"}


In [3]:
# Read the raw training CSV and let the project helper turn it into features,
# labels and the raw signal it returns alongside them.
TRAINING_CSV = '../../data/mafat_wifi_challenge_training_set_v1.csv'
data = pd.read_csv(TRAINING_CSV, low_memory=False)
data_train_x, data_train_y, raw_x = train_pre_data(
    data,
    RSSI_value_selection,
    window_size,
    window_stride,
)

# Track-1 target: work on a copy so `data_train_y` keeps its original values,
# then overwrite every label greater than 0 with 1 (other labels are untouched).
positive_rows = data_train_y > 0
data_train_y_track1 = data_train_y.copy()
data_train_y_track1.loc[positive_rows] = 1

In [4]:
# Display the names of the feature columns produced by `train_pre_data`.
data_train_x.columns


Index(['max_RSSI', 'std_RSSI', 'skew_RSSI', 'max_sub_min_RSSI',
       'max_RSSI_diffs', 'min_RSSI_diffs', 'mean_RSSI_diffs', 'std_RSSI_diffs',
       'skew_RSSI_diffs', 'max_sub_min_RSSI_diffs', 'mean_RSSI_diffs_abs',
       'median_RSSI_diffs_abs', 'std_RSSI_diffs_abs',
       'max_sub_min_RSSI_diffs_abs', 'max_RSSI_median_dist',
       'mean_RSSI_median_dist', 'median_RSSI_median_dist',
       'std_RSSI_median_dist', 'max_sub_min_RSSI_median_dist',
       'max_count_same_value_RSSI', 'RSSI_peaks', 'RSSI_diffs_peaks',
       'peak_ratio_diffs_RSSI', 'RSSI_values_count'],
      dtype='object')

In [5]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
train_X, test_X, train_y, test_y = train_test_split(
    data_train_x,
    data_train_y_track1,
    test_size=0.25,
    random_state=42,
)

In [6]:
# Column transformer that standard-scales every numeric column.
# It is defined here but is currently commented out of the pipeline below.
numeric_columns = make_column_selector(dtype_include=np.number)
ct = make_column_transformer((StandardScaler(), numeric_columns))

In [7]:

# union = make_union(pipeline_0, pipeline_1)
# numpy_permute_array = NumpyPermuteArray(
#     axis=0,
#     permutation_indices=[
#         0, 3, 4, 5, 9, 11, 13, 18, 20, 21, 23, 1, 2, 6, 7, 8, 10, 12, 14, 15,
#         16, 17, 19, 22,
#     ],
# )
# ta1 = TA1(
#     fun=np.square,
#     name="square",
#     datatypes=["numeric"],
#     feat_constraints=[autoai_libs.utils.fc_methods.is_not_categorical],
#     col_names=[
#         "max_RSSI", "std_RSSI", "skew_RSSI", "max_sub_min_RSSI",
#         "max_RSSI_diffs", "min_RSSI_diffs", "mean_RSSI_diffs",
#         "std_RSSI_diffs", "skew_RSSI_diffs", "max_sub_min_RSSI_diffs",
#         "mean_RSSI_diffs_abs", "median_RSSI_diffs_abs", "std_RSSI_diffs_abs",
#         "max_sub_min_RSSI_diffs_abs", "max_RSSI_median_dist",
#         "mean_RSSI_median_dist", "median_RSSI_median_dist",
#         "std_RSSI_median_dist", "max_sub_min_RSSI_median_dist",
#         "max_count_same_value_RSSI", "RSSI_peaks", "RSSI_diffs_peaks",
#         "peak_ratio_diffs_RSSI", "RSSI_values_count",
#     ],
#     col_dtypes=[
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#     ],
# )
# fs1_0 = FS1(
#     cols_ids_must_keep=range(0, 24),
#     additional_col_count_to_keep=20,
#     ptype="classification",
# )
# t_gen = TGen(
#     fun=autoai_libs.cognito.transforms.transform_extras.NXOR,
#     name="nxor",
#     arg_count=2,
#     datatypes_list=[["numeric"], ["numeric"]],
#     feat_constraints_list=[
#         [autoai_libs.utils.fc_methods.is_not_categorical],
#         [autoai_libs.utils.fc_methods.is_not_categorical],
#     ],
#     col_names=[
#         "max_RSSI", "std_RSSI", "skew_RSSI", "max_sub_min_RSSI",
#         "max_RSSI_diffs", "min_RSSI_diffs", "mean_RSSI_diffs",
#         "std_RSSI_diffs", "skew_RSSI_diffs", "max_sub_min_RSSI_diffs",
#         "mean_RSSI_diffs_abs", "median_RSSI_diffs_abs", "std_RSSI_diffs_abs",
#         "max_sub_min_RSSI_diffs_abs", "max_RSSI_median_dist",
#         "mean_RSSI_median_dist", "median_RSSI_median_dist",
#         "std_RSSI_median_dist", "max_sub_min_RSSI_median_dist",
#         "max_count_same_value_RSSI", "RSSI_peaks", "RSSI_diffs_peaks",
#         "peak_ratio_diffs_RSSI", "RSSI_values_count", "square(max_RSSI)",
#         "square(std_RSSI)", "square(skew_RSSI)", "square(max_sub_min_RSSI)",
#         "square(max_RSSI_diffs)", "square(mean_RSSI_diffs)",
#         "square(std_RSSI_diffs)", "square(skew_RSSI_diffs)",
#         "square(max_sub_min_RSSI_diffs)", "square(mean_RSSI_diffs_abs)",
#         "square(max_sub_min_RSSI_diffs_abs)", "square(max_RSSI_median_dist)",
#         "square(mean_RSSI_median_dist)", "square(std_RSSI_median_dist)",
#         "square(max_sub_min_RSSI_median_dist)",
#         "square(max_count_same_value_RSSI)", "square(RSSI_peaks)",
#         "square(RSSI_diffs_peaks)", "square(peak_ratio_diffs_RSSI)",
#         "square(RSSI_values_count)",
#     ],
#     col_dtypes=[
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"),
#     ],
# )
# fs1_1 = FS1(
#     cols_ids_must_keep=range(0, 24),
#     additional_col_count_to_keep=20,
#     ptype="classification",
# )
# LightGBM candidate: balanced class weights, 100 trees, fixed seed.
# It is kept available but is commented out of the pipeline further down.
lgbm_params = dict(class_weight="balanced", n_estimators=100, random_state=33)
lgbm_classifier = LGBMClassifier(**lgbm_params)

# XGBoost candidate built with the library's default hyper-parameters.
xgbc = xgb.XGBClassifier()



In [8]:
# Helpers for a dict-record based feature path; neither is part of the active
# pipeline (the corresponding step is commented out there).
# The project transformer's behaviour is defined in helper_func.
df_to_dict_transformer = DFToDictTransformer()
# DictVectorizer configured to emit a dense array instead of a sparse matrix.
dict_vec_transformer = DV(sparse=False)



In [9]:
# Only the XGBoost classifier is active. The optional steps `ct`,
# `dict_vec_transformer` and `lgbm_classifier` are intentionally left out.
# make_pipeline names the step 'xgbclassifier', which the fit cell relies on
# when it routes `sample_weight` to the estimator.
pipeline = make_pipeline(xgbc)

In [10]:
# Look up scikit-learn's scorer for the ROC-AUC metric by its registered name.
scoring_name = 'roc_auc'
scorer = get_scorer(scoring_name)

In [13]:
# Per-row weights computed with the 'balanced' class-weight scheme from the
# training labels; they are passed to the classifier at fit time.
sample_weight = compute_sample_weight(class_weight='balanced', y=train_y)

In [23]:
# pipeline.fit(train_X.to_dict('records'), train_y.values.ravel())


# Fit on the training split. The `xgbclassifier__` prefix routes the weights
# to the pipeline step of that name.
pipeline.fit(
    train_X,
    train_y,
    xgbclassifier__sample_weight=sample_weight,
)
# pipeline.fit(train_X, train_y)

Pipeline(steps=[('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=0, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=6, max_leaves=0, min_child_weight=1,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_tree=1,
                               predictor='auto', random_state=0, reg_alpha=0,
                               reg_lambda=1, ...))])

In [24]:
# Evaluate ROC-AUC on the held-out split.
# The pipeline was fitted on a DataFrame, so score it on a DataFrame too.
# Passing `.values` strips the column names, which bypasses feature-name
# checks and would break a dtype-based column selector (e.g. `ct`) if that
# step were re-enabled in the pipeline.
score = scorer(pipeline, test_X, test_y)
print(score)

0.8273572453481534


In [22]:
# Predict labels for the held-out split.
# Pass the DataFrame (not `.values`) so the input has the same type and column
# names as the data the pipeline was fitted on.
pipeline.predict(test_X)

array([0, 1, 1, ..., 0, 1, 1])

<a id="summary_and_next_steps"></a>
# Summary and next steps
You successfully completed this notebook!
You learned how to use an AutoAI pipeline definition to train the model.
Check out our [Online Documentation](https://www.ibm.com/cloud/watson-studio/autoai) for more samples, tutorials, documentation, how-tos, and blog posts.

In [None]:
import joblib

In [None]:
# Persist the fitted pipeline next to the notebook (current working directory).
# `os` comes from the notebook's top import cell.
model_filename = os.path.join(os.getcwd(), 'mafat_processed.joblib')
# Call dump as a plain statement: binding its return value to `_` would shadow
# IPython's last-output variable without serving any purpose here.
joblib.dump(pipeline, model_filename)
print(f"New model has been written to {model_filename}")

In [None]:
from ibm_botocore.client import Config
import ibm_boto3

def upload_file_cos(credentials, local_file_name, key):
    """Upload a local file to an IBM Cloud Object Storage bucket.

    Args:
        credentials: Mapping providing the keys 'API_KEY_ID', 'AUTH_ENDPOINT',
            'INSTANCE_ID', 'COS_ENDPOINT' and 'BUCKET'.
        local_file_name: Path of the local file to upload.
        key: Object key under which the file is stored in the bucket.

    Returns:
        True once the client's `upload_file` call has returned. Any exception
        raised by the client propagates to the caller.
    """
    cos = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=credentials['API_KEY_ID'],
        ibm_auth_endpoint=credentials['AUTH_ENDPOINT'],
        ibm_service_instance_id=credentials['INSTANCE_ID'],
        config=Config(signature_version='oauth'),
        endpoint_url=credentials['COS_ENDPOINT'],
    )

    print("Connected. Uploading file: " + key)
    # The bucket is deliberately not listed first: that leftover debug call
    # discarded its result, cost an extra round-trip, and needed list
    # permission that a plain upload does not require.
    cos.upload_file(Filename=local_file_name, Bucket=credentials['BUCKET'], Key=key)
    print('File Uploaded')
    return True


In [None]:
# Credential field -> environment variable that supplies it, so that no secret
# is written into the notebook itself.
cos_env_vars = {
    'API_KEY_ID': 'COS_API_KEY_ID',
    'INSTANCE_ID': 'COS_CRN_SERVICE_ID',
    'COS_ENDPOINT': 'COS_ENDPOINT',
    'BUCKET': 'COS_BUCKET',
}

# Fail early with a readable message: an unset variable would otherwise reach
# the storage client as None and surface as an obscure error there.
missing_env_vars = [name for name in cos_env_vars.values() if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

credentials = {field: os.environ[name] for field, name in cos_env_vars.items()}
credentials['AUTH_ENDPOINT'] = 'https://iam.cloud.ibm.com/oidc/token'

upload_file_cos(credentials, model_filename, 'mafat_processed.joblib')


<a id="copyrights"></a>
### Copyrights

Licensed Materials - Copyright © 2022 IBM. This notebook and its source code are released under the terms of the ILAN License. Use, duplication or disclosure restricted by GSA ADP Schedule Contract with IBM Corp.

**Note:** The auto-generated notebooks are subject to the International License Agreement for Non-Warranted Programs (or equivalent) and License Information document for Watson Studio Auto-generated Notebook (License Terms), such agreements located in the link below. Specifically, the Source Components and Sample Materials clause included in the License Information document for Watson Studio Auto-generated Notebook applies to the auto-generated notebooks.  

By downloading, copying, accessing, or otherwise using the materials, you agree to the <a href="http://www14.software.ibm.com/cgi-bin/weblap/lap.pl?li_formnum=L-AMCU-BYC7LF">License Terms</a>

___