Read in each pickle file and find the five payloads that occur most frequently in it.
Return those payloads along with their 'text' representation.

In [2]:
import pandas as pd
import pickle

def load_data(file_path):
    """Deserialize and return the object stored in a pickle file.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only point this at
    # files that come from a trusted source.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def find_top_payloads(data, top_n=5):
    """Return one row per most-frequent payload, with its occurrence count.

    Args:
        data: DataFrame that has at least a 'payload' and a 'text' column.
        top_n: How many of the most frequent payloads to keep (default 5).

    Returns:
        A new DataFrame with the columns 'payload', 'text' and 'count'.
        Each selected payload appears once, represented by its first row in
        ``data`` (original index label kept). Rows stay in the order they
        first appear in ``data``; they are not sorted by count. 'count' is
        the number of rows in ``data`` carrying that payload (a raw count,
        not a percentage).
    """
    # value_counts() sorts by frequency, so head() keeps the most common ones.
    payload_counts = data['payload'].value_counts().head(top_n)

    # Keep only rows whose payload is among the most common, restricted to
    # the two columns that are reported.
    is_top = data['payload'].isin(payload_counts.index)
    top_payloads = data.loc[is_top, ['payload', 'text']]

    # One representative row per payload. The explicit copy makes the result
    # independent of ``data`` so the column assignment below neither warns
    # (SettingWithCopyWarning) nor risks touching the caller's frame.
    top_payloads = top_payloads.drop_duplicates(subset=['payload']).copy()

    # Attach how many times each payload occurred in the input.
    top_payloads['count'] = top_payloads['payload'].map(payload_counts)

    return top_payloads

# Pickled reports to summarise, one per file.
files = [
    'imap_report_with_tknscore_new.pkl',
    'pop_report_with_tknscore_new.pkl',
    'sip_report_with_tknscore_new.pkl',
    'smtp_report_with_tknscore_new.pkl',
]

# For every report: print its file name, then its most frequent payloads.
for file_path in files:
    print(file_path)
    data = load_data(file_path)
    top_payloads = find_top_payloads(data)
    print(top_payloads)

imap_report_with_tknscore_new.pkl
                                                 payload  \
0                         61303031205354415254544c530d0a   
4                     41303031204341504142494c4954590d0a   
8      413030322049442028226e616d65222022446f7665636f...   
481                           41303033204c4f474f55540d0a   
13368  0300002f2ae00000000000436f6f6b69653a206d737473...   

                                 text  percentage  
0                   a001 STARTTLS\r\n         954  
4                 A001 CAPABILITY\r\n        2007  
8      A002 ID ("name" "Dovecot")\r\n        1859  
481                   A003 LOGOUT\r\n        1474  
13368                             NaN        2169  
pop_report_with_tknscore_new.pkl
                                  payload                    text  percentage
5                          1503010002020a                             6152
8    474554202f20485454502f312e300d0a0d0a  GET / HTTP/1.0\r\n\r\n        4184
27                        