In [19]:
import pandas as pd
import glob
import time

In [20]:
#Script settings

#Set input file directory
speedtest_input_directory = "data/Speedtest/"

#Set output file names
unifi_output = "output/speedtest_unifi_20220213.csv"
celcom_output = "output/speedtest_celcom_20220213.csv"
maxis_output = "output/speedtest_maxis_20220213.csv"
digi_output = "output/speedtest_digi_20220213.csv"
umobile_output = "output/speedtest_umobile_20220213.csv"
dnb_output = "output/speedtest_dnb_20220213.csv"

In [21]:
cols = [
    'download_kbps',
    'upload_kbps',
    'network_operator_name',
    'mcc',
    'pre_connection_type',
    'post_connection_type',
    'sim_network_operator_name_a',
    'alt_sim_network_operator_name_a',
    'pci_a',
    'tac_a',
    'cid_a',
    'lac_a',
    'psc_a',
    'uarfcn_a',
    'arfcn_a',
    'bsic_a',
    'earfcn_a',
    'rsrp_a',
    'rsrq_a',
    'rssnr_a',
    'cqi_a',
    'download_kb_a',
    'upload_kb_a',
    'client_latitude_start',
    'client_longitude_start',
    'cellbandwidth_a',
    'ss_rsrp_a',
    'ss_rsrq_a',
    'ss_sinr_a',
    'csi_rsrp_a',
    'csi_rsrq_a',
    'csi_sinr_a',
    'nr_level_a',
    'nr_asu_a',
    'nr_arfcn_a',
    'nci_a',
    'nr_pci_a',
    'nr_tac_a',
    'nr_mcc_a',
    'nr_mcc_a',
    'nr_state_a',
    'nr_frequency_range_a',
    'is_using_carrier_aggregation_a',
    'cell_bandwidths_a',
    'downstream_bandwidth_kbps_a',
    'gsm_additional_plmns_a',
    'wcdma_additional_plmns_a',
    'lte_additional_plmns_a',
    'lte_bands_a',
    'nr_additional_plmns_a',
    'nr_bands_a',
    'gsm_rssi_a',
    'wcdma_ecno_a'
]

dtypes = {
    'download_kbps': str,
    'upload_kbps': str,
    'network_operator_name' : str,
    'mcc' : 'Int64',
    'pre_connection_type' : 'Int64',
    'post_connection_type': 'Int64',
    'sim_network_operator_name_a' : str,
    'alt_sim_network_operator_name_a': str,
    'pci_a': str,
    'tac_a': str,
    'cid_a': str,
    'lac_a': str,
    'psc_a': str,
    'uarfcn_a': str,
    'arfcn_a': str,
    'bsic_a': str,
    'earfcn_a': str,
    'rsrp_a': str,
    'rsrq_a': str,
    'rssnr_a': str,
    'cqi_a': str,
    'download_kb_a': str,
    'upload_kb_a': str,
    'client_latitude_start': str,
    'client_longitude_start': str,
    'cellbandwidth_a': str,
    'ss_rsrp_a': str,
    'ss_rsrq_a': str,
    'ss_sinr_a': str,
    'csi_rsrp_a': str,
    'csi_rsrq_a': str,
    'csi_sinr_a': str,
    'nr_level_a': str,
    'nr_asu_a': str,
    'nr_arfcn_a': str,
    'nci_a': str,
    'nr_pci_a': str,
    'nr_tac_a': str,
    'nr_mcc_a': str,
    'nr_mcc_a': str,
    'nr_state_a': str,
    'nr_frequency_range_a': str,
    'is_using_carrier_aggregation_a': str,
    'cell_bandwidths_a': str,
    'downstream_bandwidth_kbps_a': str,
    'gsm_additional_plmns_a': str,
    'wcdma_additional_plmns_a': str,
    'lte_additional_plmns_a': str,
    'lte_bands_a': str,
    'nr_additional_plmns_a': str,
    'nr_bands_a': str,
    'gsm_rssi_a': str,
    'wcdma_ecno_a': str
}

In [22]:
#Function to extract Speedtest data
def extract_speedtest_data(speedtest_filename:str):
    df = pd.read_csv(speedtest_filename,usecols=cols, low_memory=False)
    #mcc_code = 502
    df = df[df['mcc'] == 502]
    df = df[df['network_operator_name'].notna()]
    #rsrp_a OR nr_ss_rsrp  is not null
#     df = df[(df['rsrp_a'].notna()) | (df['nr_ss_rsrp'].notna())]
    df = df[df['rsrp_a'].notna()]
    #pre_connection_type is either 15 ,21 or 24
    df = df[(df['pre_connection_type'] == 15) | (df['pre_connection_type'] == 21) | (df['pre_connection_type'] == 24)]
    #post_connection_type is either 15 ,21 or 24
    df = df[(df['post_connection_type'] == 15) | (df['post_connection_type'] == 21) | (df['post_connection_type'] == 24)]
    return df

#Define column name which script can read and exlude others (Faster processing time)
cols = ['mcc', 'network_operator_name', 'rsrp_a', 'pre_connection_type', 'post_connection_type']

#Get list of input BGS files from given directory
speedtest_input_files = glob.glob(speedtest_input_directory+"*.csv")

##Script execution

# starting time
start = time.time()
print(start)
df = pd.DataFrame()
frames = []
for file in speedtest_input_files:
    temp_df = extract_speedtest_data(file)
    frames.append(temp_df)
    print(file + ' completed')
    
result = pd.concat(frames)
# end time
end = time.time()
print(f"Runtime of the program is {end - start}")

1645020525.253087
data/Speedtest/android_2022-01-02.csv completed
data/Speedtest/android_2022-01-03.csv completed
data/Speedtest/android_2022-01-01.csv completed
Runtime of the program is 1.6972949504852295


In [23]:
#add new column 'grid_id'
result['grid_id'] = ''

#export processed data to output file per operator
unifi = result[result['network_operator_name'] == 'Unifi']
celcom = result[result['network_operator_name'] == 'Celcom']
maxis = result[result['network_operator_name'] == 'Maxis']
digi = result[result['network_operator_name'] == 'Digi']
umobile = result[result['network_operator_name'] == 'U Mobile']
dnb = result[result['network_operator_name'] == 'Digital Nasional']

In [24]:
#save output file in csv format
unifi.to_csv(unifi_output)
celcom.to_csv(celcom_output)
maxis.to_csv(maxis_output)
digi.to_csv(digi_output)
umobile.to_csv(umobile_output)
dnb.to_csv(dnb_output)