In [8]:
import os
import pandas as pd

# Specify the directory containing the CSV files
directory = r'C:\Users\ss6365\Desktop\location_privacy_final\tracebased\perturbed_averaged_10\laplace\0.1'

# List all CSV files in the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# Initialize an identifier starting from 1
identifier = 1

# Iterate over each CSV file
for file_name in csv_files:
    # Construct the full file path
    file_path = os.path.join(directory, file_name)
    
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Add a new column 'identifier' with all rows having the same identifier
    df['identifier'] = identifier
    
    # Save the modified DataFrame back to the same CSV file
    df.to_csv(file_path, index=False)
    
    # Increment the identifier for the next file
    identifier += 1

print('All files have been processed and updated with an identifier.')


All files have been processed and updated with an identifier.


In [9]:
import os
import pandas as pd

# Input directory containing CSV files
input_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\tracebased\perturbed_averaged_10\laplace\0.1'

# Output directory where the merged CSV file will be saved
output_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\tracebased\perturbed_averaged_10\laplace'



# # List of important columns to keep
important_columns = ['latitude', 'longitude', 'perturbed_latitude','perturbed_longitude', 'identifier']  # Replace with your column names

# # Create a list to store dataframes from individual CSV files
dataframes = []

# # Iterate through CSV files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(input_directory, filename)
        df = pd.read_csv(file_path, usecols=important_columns)
        dataframes.append(df)

# # Concatenate dataframes vertically (along rows)
merged_df = pd.concat(dataframes, ignore_index=True)

# # Output file path for the merged CSV file
output_file_path = os.path.join(output_directory, 'merged_laplace_0.1.csv')

# # Save the merged dataframe to a CSV file
merged_df.to_csv(output_file_path, index=False)

print(f"Merged CSV file saved to {output_file_path}")

Merged CSV file saved to C:\Users\ss6365\Desktop\location_privacy_final\tracebased\perturbed_averaged_10\laplace\merged_laplace_0.1.csv


In [14]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def process_directory(file_directory, use_alternate_columns=False, num_runs=1):
    all_files = os.listdir(file_directory)
    csv_files = [file for file in all_files if file.endswith('.csv')]
    csv_files.sort()

    trajectory_accuracy_totals = [0] * len(csv_files)

    for _ in range(num_runs):
        for idx, file_name in enumerate(csv_files):
            file_path = os.path.join(file_directory, file_name)
            data = pd.read_csv(file_path)

            # Assume each trajectory is identified by a 'trajectory_id' column
            if use_alternate_columns:
                observations = data[['reported_lat', 'reported_lon']]
            else:
                observations = data[['perturbed_latitude', 'perturbed_longitude']]
            
            secrets = data['location_id']
            trajectories = data['identifier']

            # Split data by trajectory
            trajectory_groups = data.groupby('identifier')
            correct_trajectories = 0
            total_trajectories = len(trajectory_groups)

            for trajectory_id, group in trajectory_groups:
                X = group[['perturbed_latitude', 'perturbed_longitude']]
                y = group['location_id']

                # Splitting the data from each trajectory for cross-validation or simple train-test
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

                knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
                knn.fit(X_train, y_train)
                predicted_locations = knn.predict(X_test)

                # Evaluate accuracy per trajectory
                if accuracy_score(y_test, predicted_locations) > 0.66:  # 2 out of 3 points correctly predicted
                    correct_trajectories += 1

            trajectory_accuracy = (correct_trajectories / total_trajectories) * 100
            trajectory_accuracy_totals[idx] += trajectory_accuracy

    # Calculate the average accuracy across runs
    trajectory_accuracy_avg = [x / num_runs for x in trajectory_accuracy_totals]
    
    return trajectory_accuracy_avg

# Example usage:
file_directory = r"C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\attack1\laplace\200"
trajectory_accuracies = process_directory(file_directory)
print("Trajectory Accuracies:", trajectory_accuracies)


KeyboardInterrupt: 

In [21]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def process_directory(file_directory, num_runs=1):
    all_files = os.listdir(file_directory)
    csv_files = [os.path.join(file_directory, file) for file in all_files if file.endswith('.csv')]
    csv_files.sort()

    # Split the list of files into training and testing subsets
    train_files, test_files = train_test_split(csv_files, test_size=0.001, random_state=42)

    # Train the model on the training set
    knn = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)
    for file_path in train_files:
        data = pd.read_csv(file_path)
        X_train = data[['perturbed_latitude', 'perturbed_longitude']]
        y_train = data['location_id']
        knn.fit(X_train, y_train)

    # Evaluate on the testing set
    correctly_predicted_trajectories = 0

    for file_path in test_files:
        data = pd.read_csv(file_path)
        X_test = data[['perturbed_latitude', 'perturbed_longitude']]
        
        y_test = data['location_id']
        #print(y_test)
        predicted_locations = knn.predict(X_test)
        print(predicted_locations)
        # Calculate accuracy and check if more than 50% of the predictions are correct
        accuracy = accuracy_score(y_test, predicted_locations)
        print(accuracy)
        if accuracy >= 0.1:
            correctly_predicted_trajectories += 1

    # Calculate the percentage of correctly predicted trajectories
    total_test_files = len(test_files)
    prediction_accuracy = (correctly_predicted_trajectories / total_test_files) * 100  # Convert to percentage
    return prediction_accuracy

# Example usage
file_directory = r"C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split"
average_trajectory_accuracy = process_directory(file_directory)
print("Average Trajectory Accuracy:", average_trajectory_accuracy)


[187 187 187 187 187 187 187 187 187 187]
0.0
[187 187 187 187 187 187 187 187 187 187]
1.0
[187 187 187 187 187 187 187 187 187 187]
0.0
[187 187 187 187 187 187 187 187 187 187]
0.0
Average Trajectory Accuracy: 25.0


In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [7]:
file_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split'

all_files = os.listdir(file_directory)
csv_files = [os.path.join(file_directory, file) for file in all_files if file.endswith('.csv')]
csv_files.sort()

    # Split the list of files into training and testing subsets
train_files, test_files = train_test_split(csv_files, test_size=0.01, random_state=42)

In [8]:
test_files

['C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\split\\3204.csv',
 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\split\\2277.csv',
 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\split\\465.csv',
 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\split\\2754.csv',
 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\split\\1856.csv',
 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\split\\260.csv',
 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\split\\2149.csv',
 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\split\\1590.csv',
 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\split\\1288.csv',
 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tracebased\\machine_learning\\

In [29]:
import pandas as pd
import os

def split_csv_by_identifier(input_directory, input_filename, output_directory):
    # Construct the full path to the input file
    input_file_path = os.path.join(input_directory, input_filename)

    # Read the CSV file
    df = pd.read_csv(input_file_path)

    # Ensure the output directory exists, if not, create it
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Split the dataframe by 'identifier' and write to separate files
    for identifier, group in df.groupby('identifier'):
        output_file_path = os.path.join(output_directory, f"{identifier}.csv")
        group.to_csv(output_file_path, index=False)
        print(f"File saved: {output_file_path}")

# Example usage:
split_csv_by_identifier(r'C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\attack1\laplace\200', r'merged_laplace_0.1_encoded_200.csv', r'C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split')


File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\1.csv
File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\2.csv
File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\3.csv
File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\4.csv
File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\5.csv
File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\6.csv
File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\7.csv
File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\8.csv
File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\9.csv
File saved: C:\Users\ss6365\Desktop\location_privacy_final\tracebased\machine_learning\split\10.csv
File save

In [48]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

def load_and_preprocess_data(file_directory):
    # Load all CSV files and concatenate into a single DataFrame
    files = [os.path.join(file_directory, f) for f in os.listdir(file_directory) if f.endswith('.csv')]
    data_frames = [pd.read_csv(f) for f in files]
    data = pd.concat(data_frames, ignore_index=True)
    
    # Assuming 'perturbed_latitude' and 'perturbed_longitude' are the inputs and 'location_id' is the target
    X = data[['perturbed_latitude', 'perturbed_longitude']].values
    y = data['location_id'].values
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Reshape data into sequences (example: sequences of 10 points)
    seq_length = 20
    X_seq = []
    y_seq = []
    for i in range(len(data) - seq_length + 1):
        X_seq.append(X_scaled[i:i + seq_length])
        y_seq.append(y[i + seq_length - 1])  # Label for the last item in the sequence
    
    X_seq = np.array(X_seq)
    y_seq = np.array(y_seq)
    
    # One-hot encode the labels
    y_seq_encoded = to_categorical(y_seq)
    
    return X_seq, y_seq_encoded

def build_lstm_model(input_shape, num_classes):
    model = Sequential([
        LSTM(100, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(100),
        Dropout(0.2),
        Dense(100, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def main():
    file_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\collected\machine_learning\attack20\laplace'  # Update the path to your dataset
    X_seq, y_seq_encoded = load_and_preprocess_data(file_directory)
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq_encoded, test_size=0.2, random_state=42)
    
    # Build the LSTM model
    input_shape = (X_train.shape[1], X_train.shape[2])
    num_classes = y_train.shape[1]
    model = build_lstm_model(input_shape, num_classes)
    
    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
    
    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print(f'Test Accuracy: {test_acc*100:.2f}%')

if __name__ == '__main__':
    main()


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 44.23%


In [43]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

def load_and_preprocess_data(file_directory):
    # Load all CSV files and concatenate into a single DataFrame
    files = [os.path.join(file_directory, f) for f in os.listdir(file_directory) if f.endswith('.csv')]
    data_frames = [pd.read_csv(f) for f in files]
    data = pd.concat(data_frames, ignore_index=True)
    
    # Assuming 'perturbed_latitude' and 'perturbed_longitude' are the inputs and 'location_id' is the target
    X = data[['perturbed_latitude', 'perturbed_longitude']].values
    y = data['location_id'].values
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Reshape data into sequences (example: sequences of 10 points)
    seq_length = 30
    X_seq = []
    y_seq = []
    for i in range(len(data) - seq_length + 1):
        X_seq.append(X_scaled[i:i + seq_length])
        y_seq.append(y[i + seq_length - 1])  # Label for the last item in the sequence
    
    X_seq = np.array(X_seq)
    y_seq = np.array(y_seq)
    
    # One-hot encode the labels
    y_seq_encoded = to_categorical(y_seq)
    
    return X_seq, y_seq_encoded

def build_lstm_model(input_shape, num_classes):
    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(50),
        Dropout(0.2),
        Dense(100, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def main():
    file_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\collected\machine_learning\attack30\laplace'  # Update the path to your dataset
    X_seq, y_seq_encoded = load_and_preprocess_data(file_directory)
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq_encoded, test_size=0.2, random_state=42)
    
    # Build the LSTM model
    input_shape = (X_train.shape[1], X_train.shape[2])
    num_classes = y_train.shape[1]
    model = build_lstm_model(input_shape, num_classes)
    
    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
    
    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print(f'Test Accuracy: {test_acc*100:.2f}%')

if __name__ == '__main__':
    main()


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 47.39%


In [49]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

def load_and_preprocess_data(file_directory):
    # Load all CSV files and concatenate into a single DataFrame
    files = [os.path.join(file_directory, f) for f in os.listdir(file_directory) if f.endswith('.csv')]
    data_frames = [pd.read_csv(f) for f in files]
    data = pd.concat(data_frames, ignore_index=True)
    
    # Assuming 'perturbed_latitude' and 'perturbed_longitude' are the inputs and 'location_id' is the target
    X = data[['perturbed_latitude', 'perturbed_longitude']].values
    y = data['location_id'].values
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Reshape data into sequences (example: sequences of 10 points)
    seq_length = 10
    X_seq = []
    y_seq = []
    for i in range(len(data) - seq_length + 1):
        X_seq.append(X_scaled[i:i + seq_length])
        y_seq.append(y[i + seq_length - 1])  # Label for the last item in the sequence
    
    X_seq = np.array(X_seq)
    y_seq = np.array(y_seq)
    
    # One-hot encode the labels
    y_seq_encoded = to_categorical(y_seq)
    
    return X_seq, y_seq_encoded

def build_lstm_model(input_shape, num_classes):
    model = Sequential([
        LSTM(100, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(50),
        Dropout(0.2),
        Dense(100, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def main():
    file_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\collected\machine_learning\attack10\laplace'  # Update the path to your dataset
    X_seq, y_seq_encoded = load_and_preprocess_data(file_directory)
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq_encoded, test_size=0.2, random_state=42)
    
    # Build the LSTM model
    input_shape = (X_train.shape[1], X_train.shape[2])
    num_classes = y_train.shape[1]
    model = build_lstm_model(input_shape, num_classes)
    
    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
    
    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print(f'Test Accuracy: {test_acc*100:.2f}%')

if __name__ == '__main__':
    main()


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 41.49%
