# DATA PREPROCESSING

In [None]:
!pip install joblib

In [38]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.preprocessing import LabelBinarizer
import re
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.feature_selection import SelectFromModel
import seaborn as sns

The path to the file should be adjusted

In [39]:
input_csv_file = 'csv 20 minutes/labeled2/features_final/together/all_data_final.csv'

In [41]:
# Shows all numerical, categorial and boolean features
def get_feature_types(df):
    numerical_features = []
    categorical_features = []
    boolean_features = []
    for column_name, column_type in df.dtypes.items():
        if column_type in ['int64', 'float64']:
            numerical_features.append(column_name)
        elif column_type == 'object':
            categorical_features.append(column_name)
        elif column_type == 'bool':
            boolean_features.append(column_name)

    return numerical_features, categorical_features, boolean_features

Missing Values, Empty Ones

In [42]:
# Lists all columns which are more than 99% empty
def list_empty_columns(csv_file):
    df = pd.read_csv(csv_file, delimiter=';', encoding='ISO-8859-1')
    missing_percentages = (df.isnull().sum() / len(df)) * 100
    empty_columns = missing_percentages[missing_percentages > 99].index.tolist()
    return empty_columns

In [43]:
empty_columns = list_empty_columns(input_csv_file)
print("Columns with more than 99% empty values:")
print(empty_columns)

Columns with more than 99% empty values:
['Target Address', 'Address Type', 'Simultaneous LE and BR/EDR to Same Device Capable (Host).1', 'LE Supported By Host', 'OOB Data Present', 'Hash C', 'Custom UUID', 'BIG_Offset', 'BIG_Offset_Units', 'ISO_Interval', 'Num_BIS', 'NSE', 'BN', 'Sub_Interval', 'PTO', 'BIS_Spacing', 'IRC', 'Max_PDU', 'Reserved.1', 'Seed Access Address', 'SSP OOB Length', 'Malformed Packet']


Columns with same values

In [44]:
# Lists all columns which have same values and are constants
def list_same_value_columns(csv_file):
    df = pd.read_csv(csv_file, delimiter=';', encoding='ISO-8859-1')
    same_value_columns = []
    for column in df.columns:
        if df[column].nunique() == 1:
            same_value_columns.append(column)
    return same_value_columns

In [45]:
same_value_columns = list_same_value_columns(input_csv_file)
print("Columns with always the same values:")
print(same_value_columns)

Columns with always the same values:
['Length of packet', 'PHY', 'Access Address']


Identical column

In [46]:
# Lists all columns which are identical to each other
def check_identical_columns(input_file):
    df = pd.read_csv(input_file, delimiter=';', error_bad_lines=False, encoding='ISO-8859-1')
    columns_to_remove = set()
    for i, col1 in enumerate(df.columns):
        for j, col2 in enumerate(df.columns):
            if i < j: 
                if df[col1].equals(df[col2]):
                    columns_to_remove.add(col2)

    columns_to_remove = list(columns_to_remove)

    if columns_to_remove:
        print("Columns to remove:")
        print(columns_to_remove)
    else:
        print("No identical columns found.")

In [47]:
check_identical_columns(input_csv_file)

Columns to remove:
['Variance Payload Length 1', 'Standard Deviation Payload Length 1']


Data Cleaning

In [79]:
data = pd.read_csv(input_csv_file, delimiter=';', encoding='ISO-8859-1')

# Drop columns with more than 99% empty values
empty_columns = ['Target Address', 'Address Type', 'Simultaneous LE and BR/EDR to Same Device Capable (Host).1', 
                 'LE Supported By Host', 'OOB Data Present', 'Hash C', 'Custom UUID', 'BIG_Offset', 'BIG_Offset_Units', 
                 'ISO_Interval', 'Num_BIS', 'NSE', 'BN', 'Sub_Interval', 'PTO', 'BIS_Spacing', 'IRC', 'Max_PDU', 
                 'Reserved.1', 'Seed Access Address', 'SSP OOB Length', 'Malformed Packet']
data = data.drop(empty_columns, axis=1)

# Drop columns with constant values
constant_columns = ['Length of packet', 'PHY', 'Access Address']
data = data.drop(constant_columns, axis=1)

# Drop unnecessary columns
unnecessary_columns = ['No.', 'Time', 'Protocol', 'RSSI', 'Packet Count']
data = data.drop(unnecessary_columns, axis=1)

# Drop unique identifier columns
unique_columns = ['Advertising Address', 'Company ID', 'UUID 16', 'Device Name']
data = data.drop(unique_columns, axis=1)

# Drop identical columns
identical_columns = ['Variance Payload Length 1']
data = data.drop(identical_columns, axis=1)

Print all categorial features which need to be transformer into numerical representation

In [80]:
numerical_features, categorical_features, boolean_features = get_feature_types(data)
print(categorical_features)

['Packet Header', 'Type', 'Length', 'PDU Type', 'Info', 'Channel Selection Algorithm', 'Tx Address', 'Rx Address', 'Scanning Address', 'Simultaneous LE and BR/EDR to Same Device Capable (Host)', 'Simultaneous LE and BR/EDR to Same Device Capable (Controller)', 'BR/EDR Not Supported', 'LE General Discoverable Mode', 'LE Limited Discoverable Mode', 'Power Level (dBm)', 'Data', 'Custom UUID.1', 'Service Data', 'BD_ADDR', 'CRC', 'label', 'subcategory', 'Packet Direction']


Transform Packet Header column into Integer

In [81]:
# Packet Header column is categorial which starts with '0x' and by removing '0x' then it is numerical
# So: check if all values in the Packet Header column start with '0x'
all_start_with_0x = all(header.startswith('0x') for header in data['Packet Header'].unique())
print("All values in Packet Header column start with '0x':", all_start_with_0x)

# Also: check if all values in Packet Header column have hexadecimal digits after '0x'
hexidecimal_digit_pattern = re.compile(r'^0x[0-9a-fA-F]+$')
all_match_pattern = all(hexidecimal_digit_pattern.match(header) for header in data['Packet Header'].unique())
print("All values in Packet Header column have hexadecimal digits after '0x':", all_match_pattern)

All values in 'packet header' column start with '0x': True
All values in 'packet header' column have hexadecimal digits after '0x': True


In [82]:
# Check if the transformed Packet Header column is in numerical representation
data['Packet Header'] = data['Packet Header'].apply(lambda x: int(x[2:], 16))
print("Data type of Packet Header column:", data['Packet Header'].dtype)

Data type of packet header column: int64


Transform Power Level column into Integer

In [83]:
# the Power Level column is categorial because of the ending 'dBm', so remove this ending to get the column in numerical representation
def nonnumeric_power_level(data):
    non_numeric_types = []
    for index, row in data.iterrows():
        power_level = row['Power Level (dBm)']
        if isinstance(power_level, str):
            try:
                power_level = float(power_level)
            except ValueError:
                pass
        elif isinstance(power_level, int):
            power_level = float(power_level)
        if not isinstance(power_level, float):
            non_numeric_type = type(power_level)
            if non_numeric_type not in non_numeric_types:
                non_numeric_types.append(non_numeric_type)
            print(f"Non-numeric value '{power_level}' with type '{non_numeric_type}' in row {index}, Power Level (dBm)")
            data.at[index, 'Power Level (dBm)'] = float(0)

    print("\nUnique non-numeric types in 'Power Level (dBm)' column:")
    print(non_numeric_types)
    return data

# some Power Level column values have not just one integer, so take the first integer if there are more than one integer
def preprocess_power_level(data):
    for index, row in data.iterrows():
        power_level = row['Power Level (dBm)']
        if isinstance(power_level, str):
            parts = power_level.split(',')
            try:
                power_level = float(parts[0])
            except ValueError:
                pass
            data.at[index, 'Power Level (dBm)'] = power_level
    return data

In [84]:
data = nonnumeric_power_level(data)
data = preprocess_power_level(data)
data['Power Level (dBm)'] = data['Power Level (dBm)'].astype(float)

Non-numeric value '26,-44' with type '<class 'str'>' in row 142702, Power Level (dBm)

Unique non-numeric types in 'Power Level (dBm)' column:
[<class 'str'>]


Handle Type and Length columns

In [85]:
data['Type'].fillna('0x', inplace=True)
unique_types_set = set()
type_length_dict = {}
# get unique values of all Type columns values
for type_value in data['Type'].unique():
    split_values = str(type_value).split(',')
    for value in split_values:
        value = value.strip()
        if not value.startswith('0x'):
            unique_types_set.add(value)

print("Length of unique_types_set:", len(unique_types_set))
sorted_types = sorted(unique_types_set)
print(sorted_types)

Length of unique_types_set: 49
['128-bit Service Class UUIDs', '128-bit Service Class UUIDs (incomplete)', '16-bit Service Class UUIDs', '16-bit Service Class UUIDs (incomplete)', '32-bit Service Class UUIDs', '32-bit Service Class UUIDs (incomplete)', '3D Information Data', 'Advertising Interval', 'Advertising Interval - long', 'Appearance', 'BD_ADDR', 'BIGInfo', 'Broadcast Code', 'Broadcast_Name', 'Channel Map Update Indication', 'Class Of Device', 'Device ID / Security Manager TK Value', 'Device Name', 'Device Name (shortened)', 'Flags', 'Indoor Positioning', 'LE Bluetooth Device Address', 'LE Role', 'LE Secure Connections Confirmation Value', 'LE Secure Connections Random Value', 'LE Supported Features', 'List of 128-bit Service Solicitation UUIDs', 'List of 16-bit Service Solicitation UUIDs', 'List of 32-bit Service Solicitation UUIDs', 'Manufacturer Specific', 'Mesh Beacon', 'Mesh Message', 'OOB Optional Data Length', 'PB-ADV', 'Peripheral Connection Interval Range', 'Public Targ

In [86]:
# Add new columns of Types and Lengths
for column_name in sorted_types:
    if column_name == 'BD_ADDR':
        column_name = 'BD_ADDR2'
    data[column_name] = 0
    data[column_name + '_Length'] = 0
    
for index, row in data.iterrows():
    types = row['Type'].split(',')
    lengths = str(row['Length']).split(',')
    
    for t, l in zip(types, lengths):
        # Update columns based on Types and lengths values
        if t == 'BD_ADDR':
            t = 'BD_ADDR2'
        if t in sorted_types:
            # Update the Type column with 1
            data.at[index, t] = 1
            # Update the length column with corresponding length value
            data.at[index, t + '_Length'] = float(l)

In [87]:
# remove Type and Length columns
handled_columns = ['Type', 'Length']
data = data.drop(handled_columns, axis=1)

Handle categorial columns

Save subcategory for later data splitting !

In [88]:
#Save subcategory for later data splitting !
subcategory_index = data.columns.get_loc('subcategory')
columns_to_convert = data.columns[subcategory_index + 1:]
columns_to_convert = columns_to_convert[columns_to_convert != 'Packet Direction']
for col in columns_to_convert:
    data[col] = pd.to_numeric(data[col], errors='coerce')

In [90]:
numerical_features, categorical_features, boolean_features = get_feature_types(data)
print(categorical_features)

['PDU Type', 'Info', 'Channel Selection Algorithm', 'Tx Address', 'Rx Address', 'Scanning Address', 'Simultaneous LE and BR/EDR to Same Device Capable (Host)', 'Simultaneous LE and BR/EDR to Same Device Capable (Controller)', 'BR/EDR Not Supported', 'LE General Discoverable Mode', 'LE Limited Discoverable Mode', 'Data', 'Custom UUID.1', 'Service Data', 'BD_ADDR', 'CRC', 'label', 'subcategory', 'Packet Direction']


In [92]:
# get for each categorial feature their unique value count
def get_categorial_feature_types_count(df):
    unique_counts = {}
    for column_name, column_type in df.dtypes.items():
        if column_type == 'object':
            unique_counts[column_name] = df[column_name].nunique()

    return unique_counts
unique_counts = get_categorial_feature_types_count(data)

In [94]:
# append features in the list which have more than 50 unique values
too_much_categorial = []
for feature, count in unique_counts.items():
    print(f"Feature '{feature}' has {count} unique values.")
    if count > 50:
        too_much_categorial.append(feature)

print('\n')
print(too_much_categorial)

Feature 'PDU Type' has 7 unique values.
Feature 'Info' has 24 unique values.
Feature 'Channel Selection Algorithm' has 2 unique values.
Feature 'Tx Address' has 2 unique values.
Feature 'Rx Address' has 2 unique values.
Feature 'Scanning Address' has 617 unique values.
Feature 'Simultaneous LE and BR/EDR to Same Device Capable (Host)' has 10 unique values.
Feature 'Simultaneous LE and BR/EDR to Same Device Capable (Controller)' has 10 unique values.
Feature 'BR/EDR Not Supported' has 10 unique values.
Feature 'LE General Discoverable Mode' has 9 unique values.
Feature 'LE Limited Discoverable Mode' has 10 unique values.
Feature 'Data' has 6819 unique values.
Feature 'Custom UUID.1' has 47 unique values.
Feature 'Service Data' has 1869 unique values.
Feature 'BD_ADDR' has 382 unique values.
Feature 'CRC' has 7487 unique values.
Feature 'label' has 10 unique values.
Feature 'subcategory' has 36 unique values.
Feature 'Packet Direction' has 2 unique values.


['Scanning Address', 'Data', 

In [95]:
# remove categorial columns which have more than 50 unique values
too_much_categorial_columns = ['Scanning Address', 'Data', 'Custom UUID.1', 'Service Data', 'BD_ADDR', 'CRC']
data = data.drop(too_much_categorial_columns, axis=1)

This are the remaining categorial features

In [96]:
numerical_features, categorical_features, boolean_features = get_feature_types(data)
print(categorical_features)

['PDU Type', 'Info', 'Channel Selection Algorithm', 'Tx Address', 'Rx Address', 'Simultaneous LE and BR/EDR to Same Device Capable (Host)', 'Simultaneous LE and BR/EDR to Same Device Capable (Controller)', 'BR/EDR Not Supported', 'LE General Discoverable Mode', 'LE Limited Discoverable Mode', 'label', 'subcategory', 'Packet Direction']


SAVE FILE

In [97]:
output_csv_file = 'csv 20 minutes/labeled2/features_final/together/all_data_final_before_one_hot_encoded.csv'
data.to_csv(output_csv_file, index=False)

LOAD FILE

In [None]:
before_one_hot_encoded_csv = 'csv 20 minutes/labeled2/features_final/together/all_data_final_before_one_hot_encoded.csv'
data = pd.read_csv(before_one_hot_encoded_csv, delimiter=';', encoding='ISO-8859-1')

Data Transformation using One-Hot Encoding

In [99]:
def one_hot_encode_features(data, categorical_features):
    # Remove 'label' and 'subcategory' columns from categorical features list
    if 'label' in categorical_features:
        categorical_features.remove('label')
    if 'subcategory' in categorical_features:
        categorical_features.remove('subcategory')
    # Do one-hot encoding on the remaining categorical features
    data = pd.get_dummies(data, columns=categorical_features, drop_first=True)
    return data

In [101]:
data = one_hot_encode_features(data, categorical_features)

Handle empty values: fill empty values with 0

In [102]:
data.fillna(0, inplace=True)

In [103]:
# Print total number of rows
total_rows = data.shape[0]
print("Total number of rows:", total_rows)

# Check if there are empty values
empty_rows = data[data.isna().any(axis=1)]
print("Rows with NaN values:")
for index, row in empty_rows.iterrows():
    empty_columns = row[row.isna()].index.tolist()
    print(f"Row index {index}, empty columns: {empty_columns}")

Total number of rows: 901623
Rows with NaN values:


SAVE FILE

In [104]:
output_csv_file = 'csv 20 minutes/labeled2/features_final/together/all_data_final.csv'
data.to_csv(output_csv_file, index=False)

LOAD FILE

In [None]:
one_hot_encoded_csv = 'csv 20 minutes/labeled2/features_final/together/all_data_final.csv'
data = pd.read_csv(one_hot_encoded_csv, delimiter=';', encoding='ISO-8859-1')

The remaining categorial columns are the label and subcategory columns

In [105]:
numerical_features, categorical_features, boolean_features = get_feature_types(data)
print(categorical_features)

['label', 'subcategory']


# Data Splitting

Split into X=features and y=labels

In [109]:
X = data.drop(['label', 'subcategory'], axis=1)
y = data['label']

Save X and y

In [110]:
joblib.dump(X, 'csv 20 minutes/labeled2/features_final/together/X.pkl')
joblib.dump(y, 'csv 20 minutes/labeled2/features_final/together/y.pkl')

['csv 20 minutes/labeled2/features_final/together/y.pkl']

Load X and y

In [None]:
X = joblib.load('csv 20 minutes/labeled2/features_final/together/X.pkl')
y = joblib.load('csv 20 minutes/labeled2/features_final/together/y.pkl')

## 80/20 Data Splitting Approach

In [111]:
# Split data into training and test datasets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=42)

SAVE TRAINING AND TEST DATASETS

In [113]:
joblib.dump(X_train1, 'csv 20 minutes/labeled2/features_final/together/X_train1.pkl')
joblib.dump(X_test1, 'csv 20 minutes/labeled2/features_final/together/X_test1.pkl')
joblib.dump(y_train1, 'csv 20 minutes/labeled2/features_final/together/y_train1.pkl')
joblib.dump(y_test1, 'csv 20 minutes/labeled2/features_final/together/y_test1.pkl')

['csv 20 minutes/labeled2/features_final/together/y_test1.pkl']

LOAD TRAINING AND TEST DATASETS

In [None]:
X_train1 = joblib.load('csv 20 minutes/labeled2/features_final/together/X_train1.pkl')
X_test1 = joblib.load('csv 20 minutes/labeled2/features_final/together/X_test1.pkl')
y_train1 = joblib.load('csv 20 minutes/labeled2/features_final/together/y_train1.pkl')
y_test1 = joblib.load('csv 20 minutes/labeled2/features_final/together/y_test1.pkl')

## Alternative Data Splitting Approach

In [114]:
# get for each device type its unique subcategories and the number of packets of them
def print_unique_subcategories_with_counts():
    all_data_final = 'csv 20 minutes/labeled2/features_final/together/all_data_final.csv'
    data2 = pd.read_csv(all_data_final, delimiter=';', encoding='ISO-8859-1', error_bad_lines=False)
    
    # Group by 'label' columns
    labels = data2['label'].unique()
    # For each label, get its unique subcategories and the number of packets of them
    for label in labels:
        subcategories = data2[data2['label'] == label]['subcategory'].value_counts()
        print(f"Label: {label}")
        print("Unique subcategories and their counts:")
        for subcategory, count in subcategories.items():
            print(f"  - {subcategory}: {count}")
        print()

print_unique_subcategories_with_counts()

Label: Laptop
Unique subcategories and their counts:
  - Dell Laptop: 66187
  - Hp Laptop: 54045
  - Apple Laptop: 7561

Label: Headphone
Unique subcategories and their counts:
  - Apple Headphone: 43456
  - Sony Linkbuds Headphone: 43111
  - Sony XM4 Headphone: 33335
  - Sony XM3 Headphone: 15088
  - Bose-2 Headphone: 4552
  - Bose-1 Headphone: 3319
  - Samsung Headphone: 1811
  - Beats Headphone: 16

Label: iPad
Unique subcategories and their counts:
  - Apple iPad: 74453

Label: Airtag
Unique subcategories and their counts:
  - Apple Airtag: 16281

Label: TV
Unique subcategories and their counts:
  - LG TV: 18657
  - Samsung TV: 9467

Label: Kitchen
Unique subcategories and their counts:
  - Kettle: 29756
  - Airfryer: 4805
  - Mixer: 3679

Label: Smartwatch
Unique subcategories and their counts:
  - Apple Smartwatch: 82528
  - Ericsson Smartwatch: 17371
  - Huawei Smartwatch: 6567
  - Fitbit Smartwatch: 2070
  - Galaxy Smartwatch: 46

Label: Camera
Unique subcategories and their co

In [115]:
# Split the data into training and test datasets
def split_train_test_approach2(data):
    train_data = pd.DataFrame(columns=data.columns)
    test_data = pd.DataFrame(columns=data.columns)
    
    labels = data['label'].unique()
    
    for label in labels:
        label_data = data[data['label'] == label]
        subcategories = label_data['subcategory'].value_counts()
        
        # If there is only one subcategory of a device type then split randomly by 80/20 data splitting method
        if len(subcategories) == 1:
            train, test = train_test_split(label_data, test_size=0.2, random_state=42)
        # Else: exclude the subcategory with the least number of packets from the training dataset
        else:
            subcategory_to_exclude = subcategories.idxmin()
            remaining_data = label_data[label_data['subcategory'] != subcategory_to_exclude]
            train, test_remain = train_test_split(remaining_data, test_size=0.2, random_state=42)
            test_exclude = label_data[label_data['subcategory'] == subcategory_to_exclude]
            test = pd.concat([test_remain, test_exclude])
        
        train_data = pd.concat([train_data, train])
        test_data = pd.concat([test_data, test])
    
    total_count = len(data)
    train_count = len(train_data)
    test_count = len(test_data)
    print(f"Total data count: {total_count}")
    print(f"Training data count: {train_count} ({(train_count / total_count) * 100:.2f}%)")
    print(f"Testing data count: {test_count} ({(test_count / total_count) * 100:.2f}%)")
    
    # For each label print their subcategories included in the training and test datasets
    for label in labels:
        train_subcategories = train_data[train_data['label'] == label]['subcategory'].unique()
        test_subcategories = test_data[test_data['label'] == label]['subcategory'].unique()
        print(f"Label: {label}")
        print(f"  Training subcategories: {', '.join(train_subcategories)}")
        print(f"  Testing subcategories: {', '.join(test_subcategories)}")
        print()
        
    return train_data, test_data

In [116]:
train_data_approach2, test_data_approach2 = split_train_test_approach2(data)

Total data count: 901623
Training data count: 687463 (76.25%)
Testing data count: 214160 (23.75%)
Label: Laptop
  Training subcategories: Dell Laptop, Hp Laptop
  Testing subcategories: Dell Laptop, Hp Laptop, Apple Laptop

Label: Headphone
  Training subcategories: Apple Headphone, Sony Linkbuds Headphone, Bose-1 Headphone, Sony XM4 Headphone, Sony XM3 Headphone, Samsung Headphone, Bose-2 Headphone
  Testing subcategories: Apple Headphone, Sony XM3 Headphone, Sony XM4 Headphone, Bose-2 Headphone, Sony Linkbuds Headphone, Bose-1 Headphone, Samsung Headphone, Beats Headphone

Label: iPad
  Training subcategories: Apple iPad
  Testing subcategories: Apple iPad

Label: Airtag
  Training subcategories: Apple Airtag
  Testing subcategories: Apple Airtag

Label: TV
  Training subcategories: LG TV
  Testing subcategories: LG TV, Samsung TV

Label: Kitchen
  Training subcategories: Kettle, Airfryer
  Testing subcategories: Kettle, Airfryer, Mixer

Label: Smartwatch
  Training subcategories: Ap

SAVE

In [117]:
train_data_approach2.to_csv('csv 20 minutes/labeled2/features_final/together/train_data_approach2.csv', index=False)
test_data_approach2.to_csv('csv 20 minutes/labeled2/features_final/together/test_data_approach2.csv', index=False)

LOAD

In [None]:
train_data_approach2 = pd.read_csv('csv 20 minutes/labeled2/features_final/together/train_data_approach2.csv', delimiter=';', encoding='ISO-8859-1')
test_data_approach2 = pd.read_csv('csv 20 minutes/labeled2/features_final/together/test_data_approach2.csv', delimiter=';', encoding='ISO-8859-1')

In [118]:
## Split data into training and test datasets
X_train2 = train_data_approach2.drop(['label', 'subcategory'], axis=1)
y_train2 = train_data_approach2['label']
X_test2 = test_data_approach2.drop(['label', 'subcategory'], axis=1)
y_test2 = test_data_approach2['label']

SAVE

In [119]:
joblib.dump(X_train2, 'csv 20 minutes/labeled2/features_final/together/X_train2.pkl')
joblib.dump(y_train2, 'csv 20 minutes/labeled2/features_final/together/y_train2.pkl')
joblib.dump(X_test2, 'csv 20 minutes/labeled2/features_final/together/X_test2.pkl')
joblib.dump(y_test2, 'csv 20 minutes/labeled2/features_final/together/y_test2.pkl')

['csv 20 minutes/labeled2/features_final/together/y_test2.pkl']

LOAD

In [None]:
X_train2 = joblib.load('csv 20 minutes/labeled2/features_final/together/X_train2.pkl')
X_test2 = joblib.load('csv 20 minutes/labeled2/features_final/together/X_test2.pkl')
y_train2 = joblib.load('csv 20 minutes/labeled2/features_final/together/y_train2.pkl')
y_test2 = joblib.load('csv 20 minutes/labeled2/features_final/together/y_test2.pkl')