In [1]:
import pandas as pd
import os

def split_csv_by_track_id(input_csv_path, output_folder_path):
    """Split a CSV file into one CSV file per distinct ``track_id`` value.

    Parameters
    ----------
    input_csv_path : str
        Path of the CSV file to read. It must contain a ``track_id`` column,
        otherwise ``groupby`` raises ``KeyError``.
    output_folder_path : str
        Folder that receives one ``<track_id>.csv`` per group. It is created,
        including missing parent folders, when it does not exist yet.

    Notes
    -----
    * Files already present under the same name are overwritten.
    * With pandas' default ``groupby`` settings, rows whose ``track_id`` is
      missing belong to no group and are therefore written to no file.
    * The path of every written file is printed.
    """
    # Read the CSV file
    df = pd.read_csv(input_csv_path)

    # Ensure the output folder exists. exist_ok=True avoids the
    # check-then-create race of os.path.exists() followed by os.makedirs().
    os.makedirs(output_folder_path, exist_ok=True)

    # Group by 'track_id' and write separate CSVs
    for track_id, group_df in df.groupby('track_id'):
        output_path = os.path.join(output_folder_path, f"{track_id}.csv")
        group_df.to_csv(output_path, index=False)
        print(f"File saved: {output_path}")

# Example usage: split the raw track-points CSV into one CSV per track_id.
# Both values are absolute Windows paths specific to one machine.
input_csv_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\UCI\\gps\\GPSTrajectory\\go_track_trackspoints.csv' # Update this path
output_folder_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\UCI\\gps\\dataset_processed' # Update this path

# Writes the per-track files into output_folder_path and prints each saved path.
split_csv_by_track_id(input_csv_path, output_folder_path)


File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\1.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\2.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\3.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\4.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\8.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\10.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\11.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\12.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\13.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\14.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\16.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processed\17.csv
File saved: C:\Users\ss6365\Desktop\Datasets\UCI\gps\dataset_processe

In [2]:
import pandas as pd


# Location of the raw track-points CSV
file_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\UCI\\gps\\GPSTrajectory\\go_track_trackspoints.csv'  # Update this path to your file's location

# Read the file and parse 'time' in one chain; no explicit format is given and
# values that cannot be parsed become NaT instead of raising
df = pd.read_csv(file_path).assign(
    time=lambda frame: pd.to_datetime(frame['time'], errors='coerce')
)

# First and last timestamp found in the 'time' column
min_date, max_date = df['time'].min(), df['time'].max()

# Report the logging period
print(f"The earliest date of data logging is: {min_date}")
print(f"The latest date of data logging is: {max_date}")



The earliest date of data logging is: 2014-09-13 07:24:32
The latest date of data logging is: 2016-01-19 13:01:47


In [3]:
# Re-read the CSV at file_path (set in an earlier cell), without the datetime
# conversion, and display the resulting frame.
df = pd.read_csv(file_path)
df

Unnamed: 0,id,latitude,longitude,track_id,time
0,1,-10.939341,-37.062742,1,2014-09-13 07:24:32
1,2,-10.939341,-37.062742,1,2014-09-13 07:24:37
2,3,-10.939324,-37.062765,1,2014-09-13 07:24:42
3,4,-10.939211,-37.062843,1,2014-09-13 07:24:47
4,5,-10.938939,-37.062879,1,2014-09-13 07:24:53
...,...,...,...,...,...
18102,19565,-10.923722,-37.106579,38092,2016-01-19 13:01:01
18103,19566,-10.923704,-37.106693,38092,2016-01-19 13:01:12
18104,19567,-10.923715,-37.106688,38092,2016-01-19 13:01:24
18105,19568,-10.923715,-37.106688,38092,2016-01-19 13:01:36


In [4]:
# Median of the 'latitude' column over all rows of df
df['latitude'].median()

-10.9219998

In [5]:
# Median of the 'longitude' column over all rows of df
df['longitude'].median()

-37.05778433

In [6]:
import pandas as pd
import folium
from folium.plugins import HeatMap

# Load the track points
csv_file_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\UCI\\gps\\GPSTrajectory\\go_track_trackspoints.csv'
df = pd.read_csv(csv_file_path)

# Names of the coordinate columns (lower-case, as used throughout this notebook)
latitude = 'latitude'
longitude = 'longitude'

# Create a map centered on the mean coordinate of all points
map_center = [df[latitude].mean(), df[longitude].mean()]
m = folium.Map(location=map_center, zoom_start=10)

# Build the [lat, lon] pairs in one vectorised step; iterating with
# iterrows() creates a Series per row and is needlessly slow.
heat_data = df[[latitude, longitude]].to_numpy().tolist()
HeatMap(heat_data).add_to(m)

# Save the map to an HTML file
output_html = 'C:\\Users\\ss6365\\Desktop\\AR_GPS_Sensor_Data\\map_uci.html'
m.save(output_html)

print(f"Heatmap saved to {output_html}")

Heatmap saved to C:\Users\ss6365\Desktop\AR_GPS_Sensor_Data\map_uci.html


In [16]:
import os
import pandas as pd
import numpy as np

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees).

    Argument order is longitude first, then latitude, for each point.
    Scalars and NumPy arrays are both accepted because only NumPy ufuncs
    are applied; array inputs yield an element-wise array of distances.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Rounding can push `a` marginally outside [0, 1] (e.g. for antipodal
    # points); arcsin(sqrt(a)) would then return NaN, so clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def calculate_distance_for_file(csv_file):
    """Return the summed point-to-point distance of the track in ``csv_file``.

    The file must contain ``latitude`` and ``longitude`` columns. The distance
    between every pair of consecutive rows (in file order) is computed with
    ``haversine`` and the results are added up, so the unit is whatever
    ``haversine`` returns. Files with fewer than two rows yield ``0.0``.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    lat = df['latitude'].to_numpy()
    lon = df['longitude'].to_numpy()

    # A track needs at least two points to contain a segment.
    if len(df) < 2:
        return 0.0

    # One vectorised call covers all consecutive pairs. Keyword arguments are
    # used on purpose: this notebook defines haversine() with more than one
    # positional order, and keywords keep lat/lon from being swapped silently.
    segment_distances = haversine(lon1=lon[:-1], lat1=lat[:-1], lon2=lon[1:], lat2=lat[1:])

    # Sum the distances
    return np.sum(segment_distances)

def calculate_cumulative_distance(directory):
    """Add up ``calculate_distance_for_file`` over every ``.csv`` in ``directory``.

    Only entries directly inside ``directory`` whose name ends with ``.csv``
    are considered; the result is ``0`` when there are none.
    """
    csv_paths = (
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(".csv")
    )
    return sum(calculate_distance_for_file(path) for path in csv_paths)

# Example usage: sum the track lengths of all CSV files in one folder.
# The value is an absolute Windows path specific to one machine.
directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\UCI\\gps\\dataset_processed'  # Replace this with the path to your directory
cumulative_distance = calculate_cumulative_distance(directory)
print(f"Cumulative Distance Traversed: {cumulative_distance} km")

Cumulative Distance Traversed: 868.7213830764748 km


In [19]:
import os
import pandas as pd
import numpy as np

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees).

    Argument order is longitude first, then latitude, for each point.
    Scalars and NumPy arrays are both accepted because only NumPy ufuncs
    are applied; array inputs yield an element-wise array of distances.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Rounding can push `a` marginally outside [0, 1] (e.g. for antipodal
    # points); arcsin(sqrt(a)) would then return NaN, so clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def calculate_distance_for_file(csv_file):
    """Return ``(distance, row_count)`` for the track stored in ``csv_file``.

    The file must contain ``latitude`` and ``longitude`` columns. ``distance``
    is the sum of the ``haversine`` distances between consecutive rows (in
    file order), so its unit is whatever ``haversine`` returns; it is ``0.0``
    for files with fewer than two rows. ``row_count`` is the number of data
    rows in the file.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    lat = df['latitude'].to_numpy()
    lon = df['longitude'].to_numpy()
    row_count = len(df)

    # A track needs at least two points to contain a segment.
    if row_count < 2:
        return 0.0, row_count

    # One vectorised call covers all consecutive pairs. Keyword arguments are
    # used on purpose: this notebook defines haversine() with more than one
    # positional order, and keywords keep lat/lon from being swapped silently.
    segment_distances = haversine(lon1=lon[:-1], lat1=lat[:-1], lon2=lon[1:], lat2=lat[1:])

    # Sum the distances
    return np.sum(segment_distances), row_count

def calculate_cumulative_distance_and_row_count(directory):
    """Aggregate track length over every ``.csv`` file directly in ``directory``.

    Returns
    -------
    tuple
        ``(total_distance, average_distance)``. ``total_distance`` is the sum
        of the distances reported by ``calculate_distance_for_file``.
        ``average_distance`` is ``total_distance`` divided by the total number
        of rows (not the number of segments) and multiplied by 1000. It is
        ``nan`` when no rows were found, instead of raising
        ``ZeroDivisionError``.
    """
    total_distance = 0
    total_row_count = 0

    # sorted() keeps the floating-point summation order independent of the
    # order in which the file system happens to list the files.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(directory, filename)
        distance, row_count = calculate_distance_for_file(file_path)
        total_distance += distance
        total_row_count += row_count

    # An average over zero rows is undefined (empty folder or only empty files).
    if total_row_count == 0:
        return total_distance, float("nan")

    average_distance = (total_distance / total_row_count) * 1000
    return total_distance, average_distance

# Example usage: total track length plus the per-row average for one folder.
# The value is an absolute Windows path specific to one machine.
directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\uci\\data\\utility'  # Replace this with the path to your directory
cumulative_distance, average_distance = calculate_cumulative_distance_and_row_count(directory)
print(f"Cumulative Distance Traversed: {cumulative_distance} km")
# average_distance is the total divided by the row count, scaled by 1000
print(f"Average Distance: {average_distance} m")


Cumulative Distance Traversed: 868.7213830764748 km
Average Distance: 47.97710184329126 m


In [9]:
import pandas as pd

# Load the CSV file
file_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\UCI\\gps\\dataset_processed\\1.csv'  # Update this path to your file's location
df = pd.read_csv(file_path)

# Work on an independent copy so `df` keeps the timestamps exactly as read.
# Copying replaces a second, redundant read of the same file.
df_new = df.copy()

# Convert 'time' column to datetime
df_new['time'] = pd.to_datetime(df_new['time'])

# Calculate the time differences (intervals) between consecutive rows;
# the first row has no predecessor, so its value is NaT
df_new['time_diff'] = df_new['time'].diff()

# Display the first few rows to see the 'time_diff' column
df_new.head()



Unnamed: 0,id,latitude,longitude,track_id,time,time_diff
0,1,-10.939341,-37.062742,1,2014-09-13 07:24:32,NaT
1,2,-10.939341,-37.062742,1,2014-09-13 07:24:37,0 days 00:00:05
2,3,-10.939324,-37.062765,1,2014-09-13 07:24:42,0 days 00:00:05
3,4,-10.939211,-37.062843,1,2014-09-13 07:24:47,0 days 00:00:05
4,5,-10.938939,-37.062879,1,2014-09-13 07:24:53,0 days 00:00:06


In [10]:


# Location of the single-track CSV to analyse
file_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\UCI\\gps\\dataset_processed\\1.csv'  # Update this path to your file's location

# Read the file, parse the timestamps, order the rows chronologically and
# derive the gap to the previous row, both as a timedelta and in seconds
df = (
    pd.read_csv(file_path)
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .sort_values(by='time')
    .assign(
        sampling_interval=lambda frame: frame['time'].diff(),
        sampling_interval_seconds=lambda frame: frame['sampling_interval'].dt.total_seconds(),
    )
)

# Mean gap in seconds
average_sampling_interval = df['sampling_interval_seconds'].mean()

print(f"The average sampling interval is: {average_sampling_interval} seconds")

The average sampling interval is: 5.573033707865169 seconds


In [11]:
import pandas as pd
import os
import numpy as np

# Specify the directory containing your CSV files
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\UCI\\gps\\dataset_processed'  # Update this path

# One (file name, result) pair per CSV. The result is the mean sampling
# interval in seconds, or a text placeholder when the file yields no interval.
average_intervals = []

# Sum of the per-file averages and the number of files contributing to it
total_average_interval = 0
file_count = 0

# sorted() gives the same processing and printing order on every platform
for file_name in sorted(os.listdir(directory_path)):
    # Only CSV files are analysed
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(directory_path, file_name)
    df = pd.read_csv(file_path)

    # Parse the timestamps and order the rows chronologically
    df['time'] = pd.to_datetime(df['time'])
    df = df.sort_values(by='time')

    # Gap to the previous row in seconds; the first row has none (NaN)
    df['sampling_interval_seconds'] = df['time'].diff().dt.total_seconds()
    valid_intervals = df['sampling_interval_seconds'].dropna()

    if valid_intervals.empty:
        # Handle files with insufficient data for interval calculation
        average_intervals.append((file_name, 'Insufficient data for interval calculation'))
        continue

    average_sampling_interval = valid_intervals.mean()
    average_intervals.append((file_name, average_sampling_interval))
    total_average_interval += average_sampling_interval
    file_count += 1

# Overall figure: the unweighted mean of the per-file averages (every file
# counts equally, regardless of how many rows it has)
overall_average_interval = total_average_interval / file_count if file_count > 0 else 'No valid data found in any file'

# Print the per-file results. Only numeric results carry the unit; appending
# " seconds" to the placeholder text produced a misleading message.
for file_name, interval in average_intervals:
    if isinstance(interval, str):
        print(f"{file_name}: {interval}")
    else:
        print(f"{file_name}: {interval} seconds")

# Print the overall average sampling interval (unit only when it is a number)
if file_count > 0:
    print(f"\nOverall average sampling interval across all files: {overall_average_interval} seconds")
else:
    print(f"\nOverall average sampling interval across all files: {overall_average_interval}")


1.csv: 5.573033707865169 seconds
10.csv: 5.0 seconds
11.csv: 7.5514018691588785 seconds
12.csv: 10.615384615384615 seconds
128.csv: 10.422222222222222 seconds
13.csv: 10.564245810055866 seconds
131.csv: 5.791139240506329 seconds
132.csv: 5.701408450704226 seconds
133.csv: 5.720394736842105 seconds
134.csv: 5.32013201320132 seconds
135.csv: 5.32013201320132 seconds
136.csv: 5.518518518518518 seconds
137.csv: 5.518518518518518 seconds
138.csv: 5.544117647058823 seconds
139.csv: 5.544117647058823 seconds
14.csv: Insufficient data for interval calculation seconds
140.csv: 5.530612244897959 seconds
141.csv: 5.530612244897959 seconds
142.csv: 5.522875816993464 seconds
143.csv: 5.522875816993464 seconds
145.csv: 5.328947368421052 seconds
146.csv: 5.5 seconds
147.csv: 5.990430622009569 seconds
148.csv: Insufficient data for interval calculation seconds
149.csv: 6.186813186813187 seconds
150.csv: 5.738938053097345 seconds
151.csv: 5.547038327526132 seconds
153.csv: 5.582089552238806 seconds
155

In [12]:
import pandas as pd
import os

# Folder whose CSV files are rewritten in place
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\UCI\\gps\\dataset_processed'  # Update this path to your directory

# Header to replace and its replacement
old_column_name = 'track_id'  # The original name of the column you want to rename
new_column_name = 'identifier'  # The new name for the column

for file_name in os.listdir(directory_path):
    # Anything that is not a CSV is left untouched
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(directory_path, file_name)
    df = pd.read_csv(file_path)

    # Report and move on when the header is absent from this file
    if old_column_name not in df.columns:
        print(f"Column '{old_column_name}' not found in {file_name}")
        continue

    # rename() returns a new frame, which then overwrites the original file
    df = df.rename(columns={old_column_name: new_column_name})
    df.to_csv(file_path, index=False)
    print(f"Column renamed in {file_name}")

print("Done processing all files.")


Column renamed in 1.csv
Column renamed in 10.csv
Column renamed in 11.csv
Column renamed in 12.csv
Column renamed in 128.csv
Column renamed in 13.csv
Column renamed in 131.csv
Column renamed in 132.csv
Column renamed in 133.csv
Column renamed in 134.csv
Column renamed in 135.csv
Column renamed in 136.csv
Column renamed in 137.csv
Column renamed in 138.csv
Column renamed in 139.csv
Column renamed in 14.csv
Column renamed in 140.csv
Column renamed in 141.csv
Column renamed in 142.csv
Column renamed in 143.csv
Column renamed in 145.csv
Column renamed in 146.csv
Column renamed in 147.csv
Column renamed in 148.csv
Column renamed in 149.csv
Column renamed in 150.csv
Column renamed in 151.csv
Column renamed in 153.csv
Column renamed in 155.csv
Column renamed in 156.csv
Column renamed in 157.csv
Column renamed in 158.csv
Column renamed in 159.csv
Column renamed in 16.csv
Column renamed in 17.csv
Column renamed in 171.csv
Column renamed in 173.csv
Column renamed in 177.csv
Column renamed in 179

In [None]:

############# Data Security ################

In [20]:
import os
import pandas as pd


# Folder holding the CSV files to merge
input_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\uci\\data\\utility'

# Folder that receives the merged CSV file
output_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\uci\\data'

# Only these columns are read from each input file
important_columns = ['identifier', 'longitude', 'latitude']  # Replace with your column names

# One DataFrame per input file, in directory-listing order
dataframes = []

for filename in os.listdir(input_directory):
    # Skip everything that is not a CSV
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(input_directory, filename)
    df = pd.read_csv(file_path, usecols=important_columns)
    dataframes.append(df)

# Stack all frames row-wise and renumber the index from zero
merged_df = pd.concat(dataframes, ignore_index=True)

# Write the merged frame next to the input folder
output_file_path = os.path.join(output_directory, 'merged_all_utility_subset.csv')
merged_df.to_csv(output_file_path, index=False)

print(f"Merged CSV file saved to {output_file_path}")

Merged CSV file saved to C:\Users\ss6365\Desktop\location_privacy_final\uci\data\merged_all_utility_subset.csv


In [21]:
import numpy as np


# Load merged_all_utility_subset.csv into df (absolute, machine-specific path)
df = pd.read_csv(r'C:\Users\ss6365\Desktop\location_privacy_final\uci\data\merged_all_utility_subset.csv')

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in km between two points given in decimal degrees.

    Argument order is latitude first, then longitude, for each point. Scalars
    and NumPy arrays are both accepted because only NumPy ufuncs are applied.
    """
    # Radius of the Earth in km
    R = 6371.0
    # Convert coordinates from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Rounding can push `a` marginally above 1 (e.g. for antipodal points),
    # which would make sqrt(1 - a) NaN; clamp it into [0, 1] first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

def find_square_boundaries(lat, lon, distance_km):
    """Return ``(lat_min, lat_max, lon_min, lon_max)`` around a centre point.

    The box reaches ``distance_km`` from ``(lat, lon)`` in each direction, so
    each side spans twice that distance. Degrees are derived with the
    approximation of 111 km per degree of latitude; the longitude offset is
    additionally divided by the cosine of the centre latitude.
    """
    km_per_degree = 111
    lat_offset = distance_km / km_per_degree
    lon_offset = distance_km / (km_per_degree * np.cos(np.radians(lat)))

    lat_low, lat_high = lat - lat_offset, lat + lat_offset
    lon_low, lon_high = lon - lon_offset, lon + lon_offset
    return lat_low, lat_high, lon_low, lon_high


# Centre of the region of interest: the median coordinate of all points
central_lat = df['latitude'].median()
central_lon = df['longitude'].median()


# Box reaching 2 km from the centre in each direction
lat_min, lat_max, lon_min, lon_max = find_square_boundaries(central_lat, central_lon, 2)

# Keep only the points inside the box; between() includes both bounds
df_limit = df[
    df['latitude'].between(lat_min, lat_max)
    & df['longitude'].between(lon_min, lon_max)
]

df_limit

Unnamed: 0,latitude,longitude,identifier
0,-10.939341,-37.062742,1
1,-10.939341,-37.062742,1
2,-10.939324,-37.062765,1
3,-10.939211,-37.062843,1
4,-10.938939,-37.062879,1
...,...,...,...
17999,-10.925716,-37.075154,58
18000,-10.925418,-37.076027,58
18001,-10.925417,-37.076027,58
18002,-10.925418,-37.076027,58


In [22]:
import glob
import os
import pandas as pd

# Input and output directories
input_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\uci\data\utility'
output_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\uci\data\security'

# Distance parameter (can be adjusted as needed). In this cell it only labels
# the output file names; it is the same for every file, so it is set once
# here instead of on every loop iteration.
# NOTE(review): keep it in sync with the distance used to build `df_limit`.
distance_km = 2

# Make sure the destination exists before the first file is written
os.makedirs(output_directory, exist_ok=True)

# Bounding box of the points in `df_limit`, which must already exist in the
# kernel (it is not created in this cell). The same box is applied to every file.
lat_min = df_limit['latitude'].min()
lat_max = df_limit['latitude'].max()
lon_min = df_limit['longitude'].min()
lon_max = df_limit['longitude'].max()


# Iterate through CSV files in the input directory
for csv_file in glob.glob(os.path.join(input_directory, '*.csv')):
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Keep only the rows inside the bounding box (bounds included)
    df_square = df[(df['latitude'] >= lat_min) & (df['latitude'] <= lat_max) &
                   (df['longitude'] >= lon_min) & (df['longitude'] <= lon_max)]

    # Check if the filtered DataFrame is empty (no data within boundaries)
    if df_square.empty:
        continue  # Skip saving if no data matches the criteria

    # Extract the base filename without extension
    base_filename = os.path.splitext(os.path.basename(csv_file))[0]

    # Create the new filename with distance_km
    new_filename = f"{base_filename}_{distance_km}km.csv"

    # Save the filtered DataFrame to the output directory with the new filename
    output_path = os.path.join(output_directory, new_filename)
    df_square.to_csv(output_path, index=False)

print("Processing complete.")

Processing complete.
