In [1]:
import os, joblib
import warnings

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.metrics import get_scorer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import compute_sample_weight

from lightgbm.sklearn import LGBMClassifier
import xgboost as xgb

from helper_func import train_pre_data, DFToDictTransformer

warnings.filterwarnings("ignore")


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  from pandas import MultiIndex, Int64Index


In [2]:
# experiment_metadata = dict(
#     prediction_column='y_track1',
#     holdout_size=0.1,
#     scoring='roc_auc',
#     random_state=33,
#     include_only_estimators=['RandomForestClassifierEstimator', 'DecisionTreeClassifierEstimator', 'LogisticRegressionEstimator', 'ExtraTreesClassifierEstimator', 'XGBClassifierEstimator', 'LGBMClassifierEstimator', 'SnapDecisionTreeClassifierEstimator', 'SnapRandomForestClassifierEstimator', 'SnapLogisticRegressionEstimator', 'SnapSVMClassifierEstimator', 'GradientBoostingClassifierEstimator'],
#     train_sample_columns_index_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
#     positive_label='1.0',
# )

# Settings handed to `train_pre_data` further down. The `#@param` suffixes are
# form annotations and have to stay on the same line as their assignment.
RSSI_value_selection = "Average" #@param ["RSSI_Left","RSSI_Right","Min","Max","Average"]
window_size = 360 #@param {type:"integer"}
window_stride = 360 #@param {type:"integer"}

# When True, the rows of one room are split off as a validation set.
create_validation_set = True

# Training table, read in a single pass (low_memory=False).
training_csv_path = '../../data/mafat_wifi_challenge_training_set_v1.csv'
data = pd.read_csv(training_csv_path, low_memory=False)


In [3]:
# Split off the validation rows (Room_Num == 2) and run `train_pre_data` on each
# subset. `data` itself is never modified: the previous in-place `drop` removed
# the validation rows from `data`, so running this cell a second time produced
# an empty validation set.
if create_validation_set:
    # Build the room mask once and derive both subsets from it.
    is_val_room = data['Room_Num'] == 2
    data_val = data.loc[is_val_room].copy()
    data_train = data.loc[~is_val_room].copy()

    data_val_x, data_val_y, raw_val_x = train_pre_data(data_val, RSSI_value_selection, window_size, window_stride)
    # *_track1 target: every positive label is collapsed to 1.
    data_val_y_track1 = data_val_y.copy()
    data_val_y_track1.loc[data_val_y > 0] = 1
else:
    # No hold-out requested: train on every row.
    data_train = data

data_train_x, data_train_y, raw_train_x = train_pre_data(data_train, RSSI_value_selection, window_size, window_stride)

# Same label collapse as for the validation target.
data_train_y_track1 = data_train_y.copy()
data_train_y_track1.loc[data_train_y > 0] = 1

In [19]:
def _welch_psd_frame(raw_x):
    """Spread the sequences stored in raw_x['welch_psd'] into one column per element."""
    return pd.DataFrame(raw_x['welch_psd'].to_list())


# PSD-only tables for the training and validation rows.
psd_train_x = _welch_psd_frame(raw_train_x)
psd_val_x = _welch_psd_frame(raw_val_x)

# Base feature tables with the PSD columns appended on the right.
full_train_x = pd.concat([data_train_x, psd_train_x], axis=1)
full_val_x = pd.concat([data_val_x, psd_val_x], axis=1)


In [20]:
# Display the combined training table: the `data_train_x` columns followed by
# the integer-named `welch_psd` columns.
full_train_x

Unnamed: 0,max_RSSI,std_RSSI,skew_RSSI,max_sub_min_RSSI,max_RSSI_diffs,min_RSSI_diffs,mean_RSSI_diffs,std_RSSI_diffs,skew_RSSI_diffs,max_sub_min_RSSI_diffs,...,119,120,121,122,123,124,125,126,127,128
0,-47,2.170765,0.103749,8,6,-7,-0.011142,1.370151,-0.259331,13,...,0.190392,0.425976,0.616699,0.290657,0.087704,2.641103,1.557546,1.397299,1.292080,0.383345
1,-46,2.282779,0.071451,9,8,-8,0.002786,1.428907,-0.182479,16,...,0.377927,0.103902,1.362534,1.329103,1.492960,0.559410,0.163794,0.434123,0.374232,0.025034
2,-46,2.288011,0.556602,9,8,-8,-0.013928,1.562915,0.206952,16,...,2.169169,4.490885,1.908750,2.193313,4.214770,1.476586,1.102125,2.791901,0.415098,0.316030
3,-46,1.760679,-0.669183,8,5,-5,0.011142,1.180099,0.212353,10,...,0.910645,0.867930,0.449275,0.701611,0.060048,0.308813,0.443623,0.109405,0.734417,0.603340
4,-45,3.135219,-4.529094,43,37,-35,-0.011142,2.917604,0.881796,72,...,9.397167,14.600796,17.817423,20.198586,16.253170,11.802013,18.987401,20.572254,19.691207,8.779420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5103,-43,0.511322,0.020181,3,2,-3,-0.002786,0.190273,-7.638982,5,...,0.017398,0.017396,0.017395,0.017394,0.017393,0.017392,0.017392,0.017391,0.017391,0.008695
5104,-46,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5105,-44,0.105263,18.894517,2,2,-2,0.000000,0.149279,0.000000,4,...,0.004115,0.004115,0.004115,0.004115,0.004115,0.004115,0.004115,0.004115,0.004115,0.002057
5106,-46,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [4]:
# train_X, test_X, train_y, test_y = train_test_split(data_train_x, data_train_y_track1, test_size=0.25, random_state=42)

In [5]:
# Standard-scale every numeric column; columns that are not selected fall under
# the transformer's default `remainder` handling.
numeric_columns = make_column_selector(dtype_include=np.number)
ct = make_column_transformer((StandardScaler(), numeric_columns))

In [6]:


# LightGBM classifier with balanced class weights and a fixed seed.
lgbm_params = dict(class_weight="balanced", n_estimators=100, random_state=33)
lgbm_classifier = LGBMClassifier(**lgbm_params)

# XGBoost classifier left at its library defaults.
xgbc = xgb.XGBClassifier()



In [7]:
# ROC-AUC scorer, plus per-sample weights that balance the classes of the
# binarized training labels.
scorer = get_scorer('roc_auc')
sample_weight = compute_sample_weight(class_weight='balanced', y=data_train_y_track1)

# Optional pre-processing steps: DataFrame -> dict records (project helper,
# presumably; see helper_func) and dict records -> dense array.
df_to_dict_transformer = DFToDictTransformer()
dict_vec_transformer = DV(sparse=False)

In [25]:
# Only the LightGBM classifier is an active step. The column transformer `ct`,
# `dict_vec_transformer` and `xgbc` are alternatives that are currently left out.
pipeline = make_pipeline(lgbm_classifier)

In [28]:
# Fit on the base feature table against the binarized labels.
# Variants tried earlier and left disabled:
#   pipeline.fit(train_X.to_dict('records'), train_y.values.ravel())
#   pipeline.fit(train_X, train_y)
#   pipeline.fit(data_train_x, data_train_y_track1, xgbclassifier__sample_weight=sample_weight)
pipeline.fit(X=data_train_x, y=data_train_y_track1)

Pipeline(steps=[('lgbmclassifier',
                 LGBMClassifier(class_weight='balanced', random_state=33))])

In [17]:
# score = scorer(pipeline, test_X, test_y)
# print(score)

In [24]:
# `pipeline` is fitted on `data_train_x`, whose columns differ from the PSD-only
# table, so it cannot score `psd_val_x` after a fresh top-to-bottom run.
# Fit an independent, unfitted copy on the matching PSD training features; the
# shared `pipeline` object is left untouched for the other cells.
psd_pipeline = clone(pipeline).fit(psd_train_x, data_train_y_track1)
score = scorer(psd_pipeline, psd_val_x, data_val_y_track1)
print(score)

0.6591415046059366


In [27]:
# `pipeline` is fitted on `data_train_x` only, while `full_val_x` also carries
# the PSD columns, so the feature sets do not match after a fresh run.
# Fit an independent, unfitted copy on the combined training table; the shared
# `pipeline` object is left untouched for the other cells.
full_pipeline = clone(pipeline).fit(full_train_x, data_train_y_track1)
score = scorer(full_pipeline, full_val_x, data_val_y_track1)
print(score)

0.7625319856704197


In [29]:
# ROC-AUC of the fitted `pipeline` on the held-out validation rows, using the
# same base feature table layout it was trained on.
score = scorer(pipeline, X=data_val_x, y_true=data_val_y_track1)
print(score)

0.7589837086318663


In [19]:
# pipeline.predict(test_X)

In [20]:
# model_filename = os.path.join(os.getcwd(), 'mafat_xgb_no_sw.joblib')
# _ = joblib.dump(pipeline, model_filename)
# print(f"New model has been written to {model_filename}")

In [21]:
# union = make_union(pipeline_0, pipeline_1)
# numpy_permute_array = NumpyPermuteArray(
#     axis=0,
#     permutation_indices=[
#         0, 3, 4, 5, 9, 11, 13, 18, 20, 21, 23, 1, 2, 6, 7, 8, 10, 12, 14, 15,
#         16, 17, 19, 22,
#     ],
# )
# ta1 = TA1(
#     fun=np.square,
#     name="square",
#     datatypes=["numeric"],
#     feat_constraints=[autoai_libs.utils.fc_methods.is_not_categorical],
#     col_names=[
#         "max_RSSI", "std_RSSI", "skew_RSSI", "max_sub_min_RSSI",
#         "max_RSSI_diffs", "min_RSSI_diffs", "mean_RSSI_diffs",
#         "std_RSSI_diffs", "skew_RSSI_diffs", "max_sub_min_RSSI_diffs",
#         "mean_RSSI_diffs_abs", "median_RSSI_diffs_abs", "std_RSSI_diffs_abs",
#         "max_sub_min_RSSI_diffs_abs", "max_RSSI_median_dist",
#         "mean_RSSI_median_dist", "median_RSSI_median_dist",
#         "std_RSSI_median_dist", "max_sub_min_RSSI_median_dist",
#         "max_count_same_value_RSSI", "RSSI_peaks", "RSSI_diffs_peaks",
#         "peak_ratio_diffs_RSSI", "RSSI_values_count",
#     ],
#     col_dtypes=[
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#     ],
# )
# fs1_0 = FS1(
#     cols_ids_must_keep=range(0, 24),
#     additional_col_count_to_keep=20,
#     ptype="classification",
# )
# t_gen = TGen(
#     fun=autoai_libs.cognito.transforms.transform_extras.NXOR,
#     name="nxor",
#     arg_count=2,
#     datatypes_list=[["numeric"], ["numeric"]],
#     feat_constraints_list=[
#         [autoai_libs.utils.fc_methods.is_not_categorical],
#         [autoai_libs.utils.fc_methods.is_not_categorical],
#     ],
#     col_names=[
#         "max_RSSI", "std_RSSI", "skew_RSSI", "max_sub_min_RSSI",
#         "max_RSSI_diffs", "min_RSSI_diffs", "mean_RSSI_diffs",
#         "std_RSSI_diffs", "skew_RSSI_diffs", "max_sub_min_RSSI_diffs",
#         "mean_RSSI_diffs_abs", "median_RSSI_diffs_abs", "std_RSSI_diffs_abs",
#         "max_sub_min_RSSI_diffs_abs", "max_RSSI_median_dist",
#         "mean_RSSI_median_dist", "median_RSSI_median_dist",
#         "std_RSSI_median_dist", "max_sub_min_RSSI_median_dist",
#         "max_count_same_value_RSSI", "RSSI_peaks", "RSSI_diffs_peaks",
#         "peak_ratio_diffs_RSSI", "RSSI_values_count", "square(max_RSSI)",
#         "square(std_RSSI)", "square(skew_RSSI)", "square(max_sub_min_RSSI)",
#         "square(max_RSSI_diffs)", "square(mean_RSSI_diffs)",
#         "square(std_RSSI_diffs)", "square(skew_RSSI_diffs)",
#         "square(max_sub_min_RSSI_diffs)", "square(mean_RSSI_diffs_abs)",
#         "square(max_sub_min_RSSI_diffs_abs)", "square(max_RSSI_median_dist)",
#         "square(mean_RSSI_median_dist)", "square(std_RSSI_median_dist)",
#         "square(max_sub_min_RSSI_median_dist)",
#         "square(max_count_same_value_RSSI)", "square(RSSI_peaks)",
#         "square(RSSI_diffs_peaks)", "square(peak_ratio_diffs_RSSI)",
#         "square(RSSI_values_count)",
#     ],
#     col_dtypes=[
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
#         np.dtype("float32"), np.dtype("float32"),
#     ],
# )
# fs1_1 = FS1(
#     cols_ids_must_keep=range(0, 24),
#     additional_col_count_to_keep=20,
#     ptype="classification",
# )