In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer

from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import auxiliary
from auxiliary import form_sequences_dynamic

In [5]:
'''LOAD DATASET'''
print("Loading Dataset")
dataset_csv = 'merged_dataset_final_correlation.csv'
df = pd.read_csv(dataset_csv)

# Quick inspection of what was loaded: the distinct UE identifiers and the column index.
# NOTE(review): the `imeisv_ue` values look like device identifiers -- confirm that it
# is acceptable for them to be stored in the notebook output.
print(df['imeisv_ue'].unique())

print(df.columns)

# Columns handed to the sequence builder, listed one per line.
# The ORDER is significant: a later cell slices the aggregated feature matrix by
# position (first two columns vs. the rest) -- presumably relying on this ordering
# being preserved by `form_sequences_dynamic`; verify before reordering anything.
feature_columns = [
    # --- `total_*_bytes_ue` counters (positions 0 and 1) ---
    'total_dl_bytes_ue',
    'total_ul_bytes_ue',
    # --- `cell_X_*_ue` columns ---
    'cell_X_cqi_ue',
    'cell_X_dl_bitrate_ue',
    'cell_X_dl_mcs_ue',
    'cell_X_dl_retx_ue',
    'cell_X_dl_tx_ue',
    'cell_X_epre_ue',
    'cell_X_initial_ta_ue',
    'cell_X_p_ue_ue',
    'cell_X_pusch_snr_ue',
    'cell_X_turbo_decoder_avg_ue',
    'cell_X_turbo_decoder_max_ue',
    'cell_X_turbo_decoder_min_ue',
    'cell_X_ul_bitrate_ue',
    'cell_X_ul_path_loss_ue',
    'cell_X_ul_phr_ue',
    'cell_X_ul_retx_ue',
    'cell_X_ul_tx_ue',
    # --- `cell_X_*_cell` columns ---
    'cell_X_dl_bitrate_cell',
    'cell_X_dl_retx_cell',
    'cell_X_dl_sched_users_avg_cell',
    'cell_X_dl_sched_users_max_cell',
    'cell_X_dl_tx_cell',
    'cell_X_ue_active_count_avg_cell',
    'cell_X_ul_bitrate_cell',
    'cell_X_ul_retx_cell',
    'duration_cell',
    # --- `msg_*_cell` columns ---
    'msg_ng_initial_context_setup_request_cell',
    'msg_ng_initial_context_setup_response_cell',
    'msg_ng_initial_ue_message_cell',
    'msg_ng_pdu_session_resource_release_command_cell',
    'msg_ng_pdu_session_resource_release_response_cell',
    'msg_ng_pdu_session_resource_setup_request_cell',
    'msg_ng_pdu_session_resource_setup_response_cell',
    'msg_ng_ue_context_release_command_cell',
    'msg_ng_ue_context_release_complete_cell',
    'msg_ng_ue_context_release_request_cell',
    'msg_ng_uplink_nas_transport_cell',
    'msg_xn_setup_request_recv_cell',
    'msg_xn_setup_request_sent_cell',
    # --- `rf_*_cell` columns ---
    'rf_rxtx_delay_avg_cell',
    'rf_samples_rx1_count_cell',
    'rf_samples_tx1_max_cell',
    'rf_samples_tx1_rms_cell',
]

Loading Dataset
[3557821101183501 8609960468879056 8609960480666910 8609960480859056
 8628490433231156 8628490443809956 8642840401594200 8642840401612300
 8642840401624200 8677660403123800]
Index(['imeisv_ue', 'cell_X_cqi_ue', 'cell_X_dl_bitrate_ue',
       'cell_X_dl_mcs_ue', 'cell_X_dl_retx_ue', 'cell_X_dl_tx_ue',
       'cell_X_epre_ue', 'cell_X_initial_ta_ue', 'cell_X_p_ue_ue',
       'cell_X_pusch_snr_ue', 'cell_X_turbo_decoder_avg_ue',
       'cell_X_turbo_decoder_max_ue', 'cell_X_turbo_decoder_min_ue',
       'cell_X_ul_bitrate_ue', 'cell_X_ul_path_loss_ue', 'cell_X_ul_phr_ue',
       'cell_X_ul_retx_ue', 'cell_X_ul_tx_ue', 'cell_id',
       'cell_X_dl_bitrate_cell', 'cell_X_dl_retx_cell',
       'cell_X_dl_sched_users_avg_cell', 'cell_X_dl_sched_users_max_cell',
       'cell_X_dl_tx_cell', 'cell_X_ue_active_count_avg_cell',
       'cell_X_ul_bitrate_cell', 'cell_X_ul_retx_cell', 'duration_cell',
       'msg_ng_initial_context_setup_request_cell',
       'msg_ng_initial_context_

In [6]:
"""CREATE SEQUENCES OF CONSECUTIVE INSTANCES"""
# Sequence length passed to `form_sequences_dynamic` for every group.
seq_len = 6
print("Create Sequences with length: ", seq_len)

# Accumulators filled across all groups; both are plain Python lists.
sequences_all, labels_all = [], []

# Work on a copy so the loaded frame `df` itself is never handed to the helper.
data = df.copy()

# One group per (UE, period) combination; sequences are built inside each group only.
group_keys = ['imeisv_ue', 'period_start', 'period_end']
for _, group in data.groupby(group_keys):
    # The helper also returns timestamps; they are not used in this cell.
    sequences, labels, timestamps = form_sequences_dynamic(group, seq_len, feature_columns)
    sequences_all.extend(sequences)
    labels_all.extend(labels)

print("Number of sequence created for training and test data: ", len(sequences_all))

Create Sequences with length:  6
Number of sequence created for training and test data:  148879


In [7]:
# Collapse every sequence into a single feature vector.
if seq_len == 1:
    # A length-1 sequence has no rate of change: only the per-sequence mean is used.
    data_complete = np.mean(sequences_all, axis=1)
else:
    # Rate of change within each sequence (absolute, not percentage), averaged over axis 1.
    roc_sequences = auxiliary.calculate_rate_of_change(sequences_all, percentage=False)
    roc_avg = np.mean(roc_sequences, axis=1)

    # Per-sequence mean of the raw values.
    sequence_avg = np.mean(sequences_all, axis=1)

    # Final layout: the mean of every column except the first two, followed by the
    # mean rate of change of the first two columns.
    data_complete = np.concatenate((sequence_avg[:, 2:], roc_avg[:, :2]), axis=1)

# Stratified 80/20 train/test split with a fixed seed.
# NOTE(review): the split is made per sequence; if `form_sequences_dynamic` produces
# overlapping windows, near-duplicate samples can land on both sides -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    data_complete,
    labels_all,
    test_size=0.2,
    random_state=42,
    stratify=labels_all,
)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

'''Class imbalance/resampling'''
# Oversample class 1 up to 5000 samples with SMOTE, then clean up with Tomek links.
# NOTE(review): the grid-search cells visible in this notebook fit on
# `X_train_scaled` / `y_train`, not on the resampled arrays produced here -- confirm
# whether that is intentional.
smote = SMOTE(sampling_strategy={1: 5000}, random_state=42)
smote_tomek = SMOTETomek(smote=smote, random_state=0)
print(sorted(Counter(y_train).items()))
X_train_smotetomek, y_train_smotetomek = smote_tomek.fit_resample(X_train_scaled, y_train)
print(sorted(Counter(y_train_smotetomek).items()))

Training set shape: (119103, 45)
Test set shape: (29776, 45)
[(0, 118151), (1, 952)]
[(0, 118147), (1, 4996)]


In [10]:
"""GRID SEARCH CV DECISION TREE"""
# Search space for the decision tree: 2 * 8 * 3 * 3 * 3 = 432 candidates.
# NOTE(review): a saved output reporting 576 candidates and failed fits does not
# match this grid -- re-run the cell to refresh it.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 15, 20, 25, 30, 40, 45],
    min_samples_split=[2, 4, 8],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

# Model selection is driven by the F1 score.
scorer = make_scorer(f1_score)

# Base estimator: class weights are balanced, seed is fixed.
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# Exhaustive 5-fold cross-validated search using all available cores.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration.
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# `best_estimator_` is the model GridSearchCV already refit with the best
# parameters (default refit=True); no further training happens here.
best_dt = grid_search.best_estimator_

# Score the selected model on the held-out test split.
y_pred = best_dt.predict(X_test_scaled)
f1 = f1_score(y_test, y_pred)
print("F1-score of the best model: ", f1)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


720 fits failed out of a total of 2880.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
455 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sc02449\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sc02449\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\sc02449\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\sc02449\AppData\Local\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", l

Best parameters found:  {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
F1-score of the best model:  0.976545842217484


In [12]:
''' GRID SEARCH CV RANDOM FOREST '''
# Search space for the random forest: 5 * 8 * 3 * 3 * 3 = 1080 candidates.
param_grid_rf = dict(
    n_estimators=[20, 40, 80, 100, 200],
    max_depth=[None, 10, 15, 20, 25, 30, 40, 45],
    min_samples_split=[2, 4, 8],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

# Model selection is driven by the F1 score.
scorer = make_scorer(f1_score)

# Base estimator: class weights are balanced, seed is fixed.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Exhaustive 5-fold cross-validated search; verbose=2 prints progress per fit.
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train_scaled, y_train)

# Report the winning configuration.
best_params_rf = grid_search_rf.best_params_
print("Best parameters found for Random Forest: ", best_params_rf)

# `best_estimator_` is the forest GridSearchCV already refit with the best
# parameters (default refit=True); no further training happens here.
best_rf = grid_search_rf.best_estimator_

# Score the selected model on the held-out test split.
y_pred_rf = best_rf.predict(X_test_scaled)
f1_rf = f1_score(y_test, y_pred_rf)
print("F1-score of the best Random Forest model: ", f1_rf)

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits
Best parameters found for Random Forest:  {'max_depth': 45, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 200}
F1-score of the best Random Forest model:  0.9895615866388309


In [14]:
# Define the parameter grid for XGBoost (5 * 7 * 3 * 3 * 3 = 945 candidates)
param_grid_xgb = {
    'n_estimators': [20, 40, 80, 100, 200],
    'max_depth': [None, 8, 12, 16, 20, 30, 40],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Define the XGBoost classifier.
# `use_label_encoder` is deliberately not passed: the argument is deprecated and no
# longer recognised by current XGBoost releases, where it only produces a
# "parameters not used" warning on every fit. It does not affect the results for
# integer 0/1 labels.
# NOTE(review): the decision-tree and random-forest cells use class_weight='balanced',
# while no class weighting (e.g. `scale_pos_weight`) is configured here -- confirm
# that this difference is intended.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# Define the scoring method: model selection is driven by the F1 score
scorer = make_scorer(f1_score)

# Set up the 5-fold grid search with verbose=2 to see the progress
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit the grid search to the (scaled, non-resampled) training data
grid_search_xgb.fit(X_train_scaled, y_train)

# Get the best parameters
best_params_xgb = grid_search_xgb.best_params_
print("Best parameters found for XGBoost: ", best_params_xgb)

# `best_estimator_` is the model GridSearchCV already refit with the best
# parameters (default refit=True); no further training happens here.
best_xgb = grid_search_xgb.best_estimator_

# Predict on the test set
y_pred_xgb = best_xgb.predict(X_test_scaled)

# Evaluate the performance
f1_xgb = f1_score(y_test, y_pred_xgb)
print("F1-score of the best XGBoost model: ", f1_xgb)

Fitting 5 folds for each of 945 candidates, totalling 4725 fits
Best parameters found for XGBoost:  {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 16, 'n_estimators': 200, 'subsample': 1.0}
F1-score of the best XGBoost model:  0.9916317991631799


In [17]:
# Preview the first 10 rows whose `imeisv_ue` equals the chosen identifier.
df.loc[df['imeisv_ue'] == 3557821101183501].head(10)

Unnamed: 0,imeisv_ue,cell_X_cqi_ue,cell_X_dl_bitrate_ue,cell_X_dl_mcs_ue,cell_X_dl_retx_ue,cell_X_dl_tx_ue,cell_X_epre_ue,cell_X_initial_ta_ue,cell_X_p_ue_ue,cell_X_pusch_snr_ue,...,rf_samples_tx1_rms_cell,total_dl_bytes_ue,total_ul_bytes_ue,period_index,chunk_index,chunk_start,chunk_end,period_start,period_end,label
0,3557821101183501,14.0,5887.0,21.8,3.0,20.0,-111.1,5.0,-17.0,8.4,...,-36.212135,59239931.0,37562931.0,0,0,2024-01-24 14:00:00+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0
1,3557821101183501,15.0,5718.0,21.8,6.0,20.0,-112.2,5.0,-18.0,6.7,...,-36.614845,59240351.0,37563351.0,0,1,2024-01-24 14:00:05+00:00,2024-01-24 14:00:05+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0
2,3557821101183501,14.0,5492.0,21.7,5.0,20.0,-112.0,5.0,-18.0,6.8,...,-36.659306,59240771.0,37563771.0,0,2,2024-01-24 14:00:10+00:00,2024-01-24 14:00:10+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0
3,3557821101183501,14.0,6091.0,22.0,6.0,21.0,-111.6,5.0,-18.0,7.3,...,-36.667698,59241191.0,37564191.0,0,3,2024-01-24 14:00:15+00:00,2024-01-24 14:00:15+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0
4,3557821101183501,14.0,5765.0,21.9,5.0,20.0,-110.0,5.0,-18.0,9.1,...,-36.692703,59241611.0,37564611.0,0,4,2024-01-24 14:00:20+00:00,2024-01-24 14:00:20+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0
5,3557821101183501,14.0,5678.0,21.6,7.0,20.0,-110.5,5.0,-18.0,9.2,...,-36.165726,59242031.0,37565031.0,0,5,2024-01-24 14:00:25+00:00,2024-01-24 14:00:25+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0
6,3557821101183501,14.0,6355.0,22.0,7.0,22.0,-112.8,5.0,-18.0,5.0,...,-35.837372,59242451.0,37565535.0,0,6,2024-01-24 14:00:30+00:00,2024-01-24 14:00:30+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0
7,3557821101183501,15.0,5908.0,21.5,6.0,21.0,-110.8,5.0,-18.0,8.9,...,-36.6716,59242955.0,37565955.0,0,7,2024-01-24 14:00:35+00:00,2024-01-24 14:00:35+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0
8,3557821101183501,14.0,6478.0,21.2,4.0,24.0,-111.9,5.0,-18.0,6.3,...,-36.672909,59243467.0,37566475.0,0,8,2024-01-24 14:00:40+00:00,2024-01-24 14:00:40+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0
9,3557821101183501,15.0,5649.0,21.7,3.0,20.0,-112.5,5.0,-18.0,6.9,...,-36.624199,59243887.0,37566895.0,0,9,2024-01-24 14:00:45+00:00,2024-01-24 14:00:45+00:00,2024-01-24 14:00:00+00:00,2024-01-24 14:01:25+00:00,0


In [23]:
# Rows of one UE (by `imeisv_ue`) restricted to `period_index` 0.
ue_mask = df['imeisv_ue'].eq(3557821101183501)
period_mask = df['period_index'].eq(0)
sample_df = df.loc[ue_mask & period_mask]

In [25]:
# Export the sample to the working directory; the DataFrame index is written as the
# first (unnamed) column, which is pandas' default behaviour made explicit here.
sample_df.to_csv('sample_df.csv', index=True)