<a href="https://colab.research.google.com/github/Turkcell/ITU-AIMLin5GChallenge-2021/blob/main/TurkcellExampleProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Example Project

Fetch train data from github and unzip them

# Import libraries


In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

# Read Data 

In [2]:
# Example read for RL-KPIS
rl_kpis = pd.read_csv("train/rl-kpis.tsv", sep="\t", index_col=0)
if "datetime" in rl_kpis:
        rl_kpis["datetime"] = pd.to_datetime(rl_kpis["datetime"])
print(f"rl_kpis.shape: {rl_kpis.shape}")
print(rl_kpis.datetime.min(), rl_kpis.datetime.max())
rl_kpis.head()

rl_kpis.shape: (1992986, 18)
2018-12-31 2020-12-25


Unnamed: 0,type,datetime,tip,mlid,mw_connection_no,site_id,card_type,adaptive_modulation,freq_band,severaly_error_second,error_second,unavail_second,avail_time,bbe,rxlevmax,capacity,modulation,rlf
0,ENK,2018-12-31,FAR,A0BE,1349988,RL_;ABDV,cardtype1,Enable,f3,0.0,0.0,0.0,86.4,0.0,-31.0,456.0,1024QAM,False
1,ENK,2018-12-31,FAR,A0BI,1349988,RL_;ABDV,cardtype1,Enable,f3,0.0,0.0,0.0,86.4,0.0,-30.7,456.0,1024QAM,False
2,ENK,2018-12-31,FAR,A5AB,1344018,RL_;ABDV,cardtype4,Enable,f3,0.0,0.0,0.0,86.4,0.0,-34.4,406.0,512QAM,False
3,NEC,2018-12-31,FAR,A8CQ,1351204,RL_;ABDV,cardtype5,Enable,f2,0.0,0.0,0.0,86.4,0.0,-35.3,247.0,2048QAM*,False
4,NEC,2018-12-31,FAR,A8DQ,1351204,RL_;ABDV,cardtype5,Enable,f2,0.0,0.0,0.0,86.4,0.0,-35.3,247.0,2048QAM*,False


# Prepare Labels


In [3]:
# Firstly, only get unique entry identifiers and RLF column.
# For rl-kpis, unique entry identifiers are 'datetime', 'site_id' and 'mlid'.
df_labels = rl_kpis[["datetime", "site_id", "mlid"]]
df_labels.head()

Unnamed: 0,datetime,site_id,mlid
0,2018-12-31,RL_;ABDV,A0BE
1,2018-12-31,RL_;ABDV,A0BI
2,2018-12-31,RL_;ABDV,A5AB
3,2018-12-31,RL_;ABDV,A8CQ
4,2018-12-31,RL_;ABDV,A8DQ


## Prepare target days (prediction days)

In [4]:
#  Prepare columns for the following days. We will join data with these columns to find RLF
prediction_interval = 5

for i in range(prediction_interval):
  df_labels[f"T+{i+1}"] = df_labels["datetime"] + pd.DateOffset(days=i+1)
df_labels.head()

TypeError: unsupported operand type(s) for +: 'DateOffset' and 'str'

## Join dataset to get RLF colunms for the target days

In [None]:
rl_kpis_view = rl_kpis[["datetime", "site_id", "mlid", "rlf"]]
for i in range(prediction_interval):
  target_day_column_name = f"T+{i+1}"

  df_labels = df_labels.merge(rl_kpis_view, 
                  how = "left", 
                  left_on = ("site_id", "mlid", target_day_column_name),
                  right_on = ("site_id", "mlid", "datetime"),
                  suffixes = ("", "_y")
  )
  df_labels.rename(columns={"rlf": f"{target_day_column_name}_rlf"}, inplace=True)
df_labels.drop(columns=["datetime_y"], inplace=True)
df_labels.head()

Unnamed: 0,datetime,site_id,mlid,T+1,T+2,T+3,T+4,T+5,T+1_rlf,T+2_rlf,T+3_rlf,T+4_rlf,T+5_rlf
0,2018-12-31,RL_;ABDV,A0BE,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False
1,2018-12-31,RL_;ABDV,A0BI,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False
2,2018-12-31,RL_;ABDV,A5AB,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False
3,2018-12-31,RL_;ABDV,A8CQ,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False
4,2018-12-31,RL_;ABDV,A8DQ,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False


## Finalize labels for 1-day and 5-day predictions
For each link (site_id, mlid), we found the RLF for the 5-days (T+1.. T+5) following the given day (T).
Now, we will finalize labelling

In [None]:
# 1 day predict is equal to T+1 rlf
df_labels["1-day-predict"] = df_labels["T+1_rlf"]

# Interval predict (5-day predict) is based on T+1, T+2, T+3, T+4 and T+5
following_days_rlf_columns = [f"T+{i+1}_rlf" for i in range(prediction_interval)]

df_labels["5-day-predict"] = df_labels[following_days_rlf_columns].any(axis=1)
df_labels = df_labels[["datetime", "site_id", "mlid", "1-day-predict", "5-day-predict"]]

print(f"df_labels.shape: {df_labels.shape}")
print(f"df_labels 1-day rlf sum: {df_labels['1-day-predict'].sum()}")
print(f"df_labels 5-day rlf sum: {df_labels['5-day-predict'].sum()}")
df_labels.head()

df_labels.shape: (1992986, 5)
df_labels 1-day rlf sum: 1204
df_labels 5-day rlf sum: 5159


Unnamed: 0,datetime,site_id,mlid,1-day-predict,5-day-predict
0,2018-12-31,RL_;ABDV,A0BE,False,False
1,2018-12-31,RL_;ABDV,A0BI,False,False
2,2018-12-31,RL_;ABDV,A5AB,False,False
3,2018-12-31,RL_;ABDV,A8CQ,False,False
4,2018-12-31,RL_;ABDV,A8DQ,False,False


In [None]:
# Now join labels with rl-kpis
rl_kpis_with_labels = rl_kpis.merge(df_labels, 
                                    how="left", 
                                    on=["datetime", "site_id", "mlid"])
rl_kpis_with_labels.head()

Unnamed: 0,type,datetime,tip,mlid,mw_connection_no,site_id,card_type,adaptive_modulation,freq_band,severaly_error_second,error_second,unavail_second,avail_time,bbe,rxlevmax,capacity,modulation,rlf,1-day-predict,5-day-predict
0,ENK,2018-12-31,FAR,A0BE,1349988,RL_;ABDV,cardtype1,Enable,f3,0.0,0.0,0.0,86.4,0.0,-31.0,456.0,1024QAM,False,False,False
1,ENK,2018-12-31,FAR,A0BI,1349988,RL_;ABDV,cardtype1,Enable,f3,0.0,0.0,0.0,86.4,0.0,-30.7,456.0,1024QAM,False,False,False
2,ENK,2018-12-31,FAR,A5AB,1344018,RL_;ABDV,cardtype4,Enable,f3,0.0,0.0,0.0,86.4,0.0,-34.4,406.0,512QAM,False,False,False
3,NEC,2018-12-31,FAR,A8CQ,1351204,RL_;ABDV,cardtype5,Enable,f2,0.0,0.0,0.0,86.4,0.0,-35.3,247.0,2048QAM*,False,False,False
4,NEC,2018-12-31,FAR,A8DQ,1351204,RL_;ABDV,cardtype5,Enable,f2,0.0,0.0,0.0,86.4,0.0,-35.3,247.0,2048QAM*,False,False,False


# Example Model Training
In this example, we will only use rl-kpis and train a simple decision tree model. This is a simplified example.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support

## Future Selection

In [None]:
categorical_features = ["card_type", "freq_band",]
numerical_features = ["severaly_error_second", "error_second", "unavail_second", "bbe", "rxlevmax"]

features = categorical_features + numerical_features

## Under Sampling

In [None]:
# Simple undersampling
np.random.seed(1234)

cond_rlf = rl_kpis_with_labels["5-day-predict"]
rlf_count = cond_rlf.sum()
print("rlf count: ", rlf_count)

# Get sample index from non rlf columns with 1:3 ratio
sampled_non_rlf_indicies = np.random.choice(rl_kpis_with_labels[~cond_rlf].index, size=rlf_count * 2)
rlf_indicies = np.array(rl_kpis_with_labels[cond_rlf].index)

sampled_data_indicies = list(sampled_non_rlf_indicies) + list(rlf_indicies)
sampled_data = rl_kpis_with_labels.loc[sampled_data_indicies]
sampled_data.shape


rlf count:  5159


(15477, 20)

## Test Train Split

In [None]:
df_train, df_test = train_test_split(sampled_data, test_size=0.2)
print(f"df_train.shape: {df_train.shape} | df_test.shape: {df_test.shape}")

df_train.shape: (12381, 20) | df_test.shape: (3096, 20)


## Preprocessing

In [None]:
# Convert categorical columns to one hot vector
# Merge them with numerical columns
# Return X data, column names, and encoder for future usage

def preprocessing(df, numerical_columns=[], categorical_columns=[], one_hot_encoder=None):

  # Handle NA
  # For this simple project, we just remove NA entities.
  df = df.dropna()

  if one_hot_encoder is None:
    print("Creating new one hot encoder")
    # For this project, handle one hot encoding here!
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    one_hot_encoder.fit(df[categorical_columns])
    
    print("one_hot_encoder: ", one_hot_encoder)
    print("*" * 50)
    print("Feature names: ", one_hot_encoder.get_feature_names())
    print("*" * 50)
    print("Categories: ", one_hot_encoder.categories_)

  arr_numerical = df[numerical_columns]
  arr_categorical = one_hot_encoder.transform(df[categorical_columns])
  feature_names = numerical_columns + list(one_hot_encoder.get_feature_names())
  arr_x = np.concatenate((arr_numerical, arr_categorical),axis=1)
  return df.copy(), arr_x, feature_names, one_hot_encoder

# Preprocess data, and get train data
df_train_dropped, train_x, feature_names, one_hot_encoder = preprocessing(df_train, 
                                                                         numerical_columns=numerical_features, 
                                                                         categorical_columns=categorical_features)

Creating new one hot encoder
one_hot_encoder:  OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=False)
**************************************************
Feature names:  ['x0_cardtype1' 'x0_cardtype10' 'x0_cardtype11' 'x0_cardtype2'
 'x0_cardtype4' 'x0_cardtype5' 'x0_cardtype6' 'x1_f1' 'x1_f2' 'x1_f3'
 'x1_f4' 'x1_f5']
**************************************************
Categories:  [array(['cardtype1', 'cardtype10', 'cardtype11', 'cardtype2', 'cardtype4',
       'cardtype5', 'cardtype6'], dtype=object), array(['f1', 'f2', 'f3', 'f4', 'f5'], dtype=object)]


## Build Model

In [None]:
clf_1_day_pred = tree.DecisionTreeClassifier(min_samples_leaf=5,max_depth=4)
clf_5_day_pred = tree.DecisionTreeClassifier(min_samples_leaf=5,max_depth=4)

## Train Models

In [None]:
# train_x is prepared at preprocessing step
train_y_1_day_pred = df_train_dropped["1-day-predict"].astype('int').to_numpy()
train_y_5_day_pred = df_train_dropped["5-day-predict"].astype('int').to_numpy()

In [None]:
_ = clf_1_day_pred.fit(train_x, train_y_1_day_pred)
_= clf_5_day_pred.fit(train_x, train_y_5_day_pred)

## Test Models

In [None]:
# Preprocess test data
df_test_dropepd, test_x, _, _ = preprocessing(df_test, numerical_columns=numerical_features, 
                             categorical_columns=categorical_features, 
                             one_hot_encoder=one_hot_encoder)
test_y_1_day_pred = df_test_dropepd["1-day-predict"].astype('int').to_numpy()
test_y_5_day_pred = df_test_dropepd["5-day-predict"].astype('int').to_numpy()

pred_1_day = clf_1_day_pred.predict(test_x)
pred_5_day = clf_5_day_pred.predict(test_x)
pred_1_day.sum(), pred_5_day.sum()

(13, 445)

## Score test results

Reminder: These are undersampled data results, the actual results are probably different from them.

In [None]:
precision, recall, fscore, _ = precision_recall_fscore_support(pred_1_day, 
                                                               test_y_1_day_pred, 
                                                               average="binary", # 
                                                               labels=[0, 1], # labels
                                                               beta=1) # f1 score

print("*********** SCORE for 1-DAY predict")
print(f"precision : {precision:.4f}")
print(f"recall    : {recall:.4f}")
print(f"f-score   : {fscore:.4f}")


precision, recall, fscore, _ = precision_recall_fscore_support(pred_5_day,        # y'
                                                               test_y_5_day_pred, # y
                                                               average="binary",  # focus only True class
                                                               labels=[0, 1],     # labels
                                                               beta=1)            # f1 score
print()
print("*********** SCORE for 5-DAY predict ***********")
print(f"precision : {precision:.4f}")
print(f"recall    : {recall:.4f}")
print(f"f-score   : {fscore:.4f}")

*********** SCORE for 1-DAY predict
precision : 0.0082
recall    : 0.1538
f-score   : 0.0155

*********** SCORE for 5-DAY predict ***********
precision : 0.2867
recall    : 0.6360
f-score   : 0.3953


# Validation Data Usage

## Read Validation Data

In [None]:
# Download test data from github
!wget -nc https://github.com/Turkcell/ITU-AIMLin5GChallenge-2021/raw/main/RLF_Prediction_ITU_AIML_Challenge_Data/RLF_Prediction_ITU_AIML_Challenge_Test_20210125.7z
!7z x RLF_Prediction_ITU_AIML_Challenge_Test_20210125.7z -aos RLF_Prediction_ITU_AIML_Challenge_Test_20210125/


--2021-09-15 11:28:37--  https://github.com/Turkcell/ITU-AIMLin5GChallenge-2021/raw/main/RLF_Prediction_ITU_AIML_Challenge_Data/RLF_Prediction_ITU_AIML_Challenge_Test_20210125.7z
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Turkcell/ITU-AIMLin5GChallenge-2021/main/RLF_Prediction_ITU_AIML_Challenge_Data/RLF_Prediction_ITU_AIML_Challenge_Test_20210125.7z [following]
--2021-09-15 11:28:38--  https://raw.githubusercontent.com/Turkcell/ITU-AIMLin5GChallenge-2021/main/RLF_Prediction_ITU_AIML_Challenge_Data/RLF_Prediction_ITU_AIML_Challenge_Test_20210125.7z
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 O

In [None]:
data_zip_path = "./RLF_Prediction_ITU_AIML_Challenge_Test_20210125/RegionA_test_20210125.zip"

# Example read for RL-KPIS
validation_rl_kpis = read_table_from_zip(data_zip_path, "rl-kpis.tsv")
print(f"validation_rl_kpis.shape: {validation_rl_kpis.shape}")
print(validation_rl_kpis.datetime.min(), validation_rl_kpis.datetime.max())
validation_rl_kpis.tail(5)

validation_rl_kpis.shape: (50978, 19)
2021-01-25 00:00:00 2021-02-13 00:00:00


Unnamed: 0,type,datetime,tip,mlid,mw_connection_no,site_id,card_type,adaptive_modulation,freq_band,severaly_error_second,error_second,unavail_second,avail_time,bbe,rxlevmax,scalibility_score,capacity,modulation,rlf
50973,NEC,2021-02-13,NEAR,A6FD,1371370,RL_bKZCQ,cardtype5,Enable,f3,0,0,0.0,86400,0.0,-37.5,0.0,495,2048QAM*,False
50974,NEC,2021-02-13,NEAR,A6NA,1371370,RL_bKZCQ,cardtype5,Enable,f3,0,0,0.0,86400,0.0,-37.5,0.0,495,2048QAM*,False
50975,NEC,2021-02-13,FAR,A8FJ,1371440,RL_bKZCQ,cardtype5,Enable,f3,0,0,0.0,86400,0.0,-28.9,0.0,495,2048QAM*,False
50976,NEC,2021-02-13,FAR,A8HV,1371440,RL_bKZCQ,cardtype5,Enable,f3,0,0,0.0,86400,0.0,-28.9,0.0,495,2048QAM*,False
50977,NEC,2021-02-13,NEAR,A4ZO,1348886,RL_bLTQH,cardtype5,Enable,f3,0,0,0.0,86400,0.0,-38.3,0.0,495,2048QAM*,False


## Prediction Phase

In [None]:
df_validation_dropped, validation_x, _, _ = preprocessing(validation_rl_kpis, 
                                                          numerical_columns=numerical_features, 
                                                          categorical_columns=categorical_features, 
                                                          one_hot_encoder=one_hot_encoder)

In [None]:
validation_pred_1_day = clf_1_day_pred.predict(validation_x)
validation_pred_5_day = clf_5_day_pred.predict(validation_x)
print(f"1-day RLF sum: {validation_pred_1_day.sum()} | 5-day RLF sum: {validation_pred_5_day.sum()}")

df_validation_dropped["1-day-predict"] = validation_pred_1_day > 0.5
df_validation_dropped["5-day-predict"] = validation_pred_5_day > 0.5

link_based_prediction_results = df_validation_dropped[["datetime", "site_id", "mlid", "rlf", "1-day-predict", "5-day-predict"]]
link_based_prediction_results.tail()

1-day RLF sum: 16 | 5-day RLF sum: 4250


Unnamed: 0,datetime,site_id,mlid,rlf,1-day-predict,5-day-predict
50973,2021-02-13,RL_bKZCQ,A6FD,False,False,False
50974,2021-02-13,RL_bKZCQ,A6NA,False,False,False
50975,2021-02-13,RL_bKZCQ,A8FJ,False,False,False
50976,2021-02-13,RL_bKZCQ,A8HV,False,False,False
50977,2021-02-13,RL_bLTQH,A4ZO,False,False,False


## Prepare the table for queried links


### Read link table
In this table, we will try to predict RLF (1-day single and 5-day interval) for each link (site_id, mlid) at given date (datetime)

In [None]:
df_queried_links = pd.read_csv("RLF_Prediction_ITU_AIML_Challenge_Test_20210125/20210125_predicts.tsv", sep="\t", index_col=0)
df_queried_links["datetime"] = pd.to_datetime(df_queried_links["datetime"])
df_queried_links.head()

Unnamed: 0,datetime,site_id,mlid,1-day-predict,5-day-predict
0,2021-02-13,RL_;ABDV,A0BE,,
1,2021-02-13,RL_;ABDV,A0BI,,
2,2021-02-13,RL_;ABDV,A5AB,,
3,2021-02-13,RL_;ABDV,A8CQ,,
4,2021-02-13,RL_;ABDV,A8DQ,,


### Join predictions with the table

In [None]:
df_queried_links = df_queried_links[["datetime", "site_id", "mlid"]].merge(
    link_based_prediction_results,
    how = "left", 
    on = ("site_id", "mlid", "datetime"),
    # right_on = ("site_id", "mlid", "datetime"),
    # suffixes = ("", "_y")
    )
df_queried_links.head()

Unnamed: 0,datetime,site_id,mlid,rlf,1-day-predict,5-day-predict
0,2021-02-13,RL_;ABDV,A0BE,False,False,False
1,2021-02-13,RL_;ABDV,A0BI,False,False,False
2,2021-02-13,RL_;ABDV,A5AB,False,False,False
3,2021-02-13,RL_;ABDV,A8CQ,False,False,False
4,2021-02-13,RL_;ABDV,A8DQ,False,False,False


In [None]:
df_queried_links[["1-day-predict", "5-day-predict"]].sum()

1-day-predict      1.0
5-day-predict    171.0
dtype: float64

In [None]:
# Save results to file
df_queried_links.to_csv("20210125_predicts.tsv", sep="\t")
!cat "20210125_predicts.tsv"

	datetime	site_id	mlid	rlf	1-day-predict	5-day-predict
0	2021-02-13	RL_;ABDV	A0BE	False	False	False
1	2021-02-13	RL_;ABDV	A0BI	False	False	False
2	2021-02-13	RL_;ABDV	A5AB	False	False	False
3	2021-02-13	RL_;ABDV	A8CQ	False	False	False
4	2021-02-13	RL_;ABDV	A8DQ	False	False	False
5	2021-02-13	RL_;ABDV	A9ZA	False	False	False
6	2021-02-13	RL_;ABL>	A0EB	False	False	False
7	2021-02-13	RL_;ABL>	A0IB	False	False	False
8	2021-02-13	RL_;ABL>	A9AM	False	False	False
9	2021-02-13	RL_;ABTP	A5ZA			
10	2021-02-13	RL_;AC?E	A8CO	False	False	False
11	2021-02-13	RL_;AC?E	A8DO	False	False	False
12	2021-02-13	RL_;AC?E	A9ML	False	False	False
13	2021-02-13	RL_;AC?E	A9MY	False	False	False
14	2021-02-13	RL_;AC?E	A9PM	False	False	False
15	2021-02-13	RL_;ADKP	A0DE	False	False	False
16	2021-02-13	RL_;ADKP	A5AN	False	False	False
17	2021-02-13	RL_;ADR@	A5OP	False	False	False
18	2021-02-13	RL_;ADR@	A6MF	False	False	False
19	2021-02-13	RL_;ADR@	A9LZ	False	False	False
20	2021-02-13	RL_;AICJ	A5NR	False	False	False
21	2

# Summary

In this project, we want to demonstrate how to read files, how to prepare labels for 1-day-prediction (single day) and 5-day-prediction (interval). Also, we want to show that how will we send you test data and queried links. In the test phase, we will share data that consist of validation data and a table for queried links. The table's columns are datetime, site_id, mlid, and predictions. You will fill the table with your predictions. 

You can freely develop your models, we will let you run your codes in your own environment. We will evaluate your results by using the prediction file.