In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

class GeoLifeDataLoader:
    """Load and process the Microsoft GeoLife GPS trajectory dataset.

    ``data_dir`` is expected to contain one folder per user, named with a
    zero-padded three-digit id (``000``, ``001``, ...).  Each user folder holds
    a ``Trajectory`` sub-folder with ``.plt`` files and, optionally, a
    ``labels.txt`` file with transportation-mode annotations.
    """

    def __init__(self, data_dir):
        """Remember the dataset root; nothing is read from disk here."""
        self.data_dir = Path(data_dir)
        # Replaced by the combined DataFrame once load_all_trajectories() runs.
        self.all_trajectories = []

    def load_plt_file(self, filepath):
        """
        Load a single PLT file.

        PLT format:
        - First 6 lines are headers
        - Columns: Latitude, Longitude, 0, Altitude, Days, Date, Time

        Returns a DataFrame with columns ``latitude``, ``longitude``,
        ``altitude`` and ``datetime``, or ``None`` if the file cannot be
        read or parsed (the error is printed, not raised).
        """
        try:
            # Read PLT file, skip first 6 header lines
            df = pd.read_csv(
                filepath,
                skiprows=6,
                header=None,
                names=['latitude', 'longitude', 'zero', 'altitude', 'days', 'date', 'time']
            )

            # Combine date and time into datetime
            df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])

            # Keep only the useful columns.  The explicit copy makes the result
            # an independent frame, so callers can add columns to it without
            # triggering pandas' chained-assignment warning.
            return df[['latitude', 'longitude', 'altitude', 'datetime']].copy()

        except Exception as e:
            # Best-effort loading: report the bad file and let the caller skip it.
            print(f"Error loading {filepath}: {str(e)}")
            return None

    def load_labels(self, user_folder):
        """
        Load labels.txt if it exists for a user.

        Format (tab-separated, first line is a header that is skipped):
        Start Time, End Time, Transportation Mode

        Returns a DataFrame with columns ``start_time``, ``end_time`` (both
        converted to datetime) and ``transportation_mode``, or ``None`` when
        the file is missing or cannot be parsed.
        """
        labels_path = user_folder / 'labels.txt'

        if not labels_path.exists():
            return None

        try:
            # Read labels file (tab-separated)
            labels = pd.read_csv(
                labels_path,
                sep='\t',
                skiprows=1,  # Skip header
                names=['start_time', 'end_time', 'transportation_mode']
            )

            # Convert to datetime
            labels['start_time'] = pd.to_datetime(labels['start_time'])
            labels['end_time'] = pd.to_datetime(labels['end_time'])

            return labels

        except Exception as e:
            # Best-effort loading: a broken labels file means "no labels".
            print(f"Error loading labels from {labels_path}: {str(e)}")
            return None

    def match_labels_to_trajectory(self, trajectory_df, labels_df):
        """Match transportation mode labels to trajectory points.

        Adds a ``transportation_mode`` column to ``trajectory_df`` (the frame
        is modified in place and also returned).  Every point starts as
        ``'unknown'``; a point whose ``datetime`` lies inside a label's
        ``[start_time, end_time]`` interval (both ends inclusive) receives that
        label's mode.  When several labels cover the same point, the label
        that appears last in ``labels_df`` wins.
        """
        # Initialize with 'unknown'
        trajectory_df['transportation_mode'] = 'unknown'

        if labels_df is None or labels_df.empty:
            return trajectory_df

        timestamps = trajectory_df['datetime']
        first_seen = timestamps.min()
        last_seen = timestamps.max()

        # Only labels whose interval overlaps this trajectory's time span can
        # match any point.  Skipping the rest avoids a full-frame comparison
        # per irrelevant label while keeping the original row order, so the
        # "last label wins" behaviour is unchanged.
        overlapping = labels_df[
            (labels_df['end_time'] >= first_seen)
            & (labels_df['start_time'] <= last_seen)
        ]

        for label in overlapping.itertuples(index=False):
            in_window = timestamps.between(label.start_time, label.end_time)
            trajectory_df.loc[in_window, 'transportation_mode'] = label.transportation_mode

        return trajectory_df

    def load_user_trajectories(self, user_id):
        """Load all trajectories for a single user.

        ``user_id`` is an integer; the user folder name is its zero-padded
        three-digit form.  Returns one DataFrame with every readable, non-empty
        ``.plt`` file of that user concatenated (plus ``user_id``,
        ``trajectory_file`` and ``transportation_mode`` columns), or ``None``
        when the user has no ``Trajectory`` folder or no usable files.
        """
        user_folder = self.data_dir / f'{user_id:03d}'
        trajectory_folder = user_folder / 'Trajectory'

        if not trajectory_folder.exists():
            return None

        # Load labels if available
        labels = self.load_labels(user_folder)

        # Sort the PLT files so the row order of the result is deterministic;
        # glob() alone yields files in filesystem-dependent order.
        plt_files = sorted(trajectory_folder.glob('*.plt'))

        if not plt_files:
            return None

        user_trajectories = []

        for plt_file in plt_files:
            traj_df = self.load_plt_file(plt_file)

            # Unreadable or empty files are skipped.
            if traj_df is None or len(traj_df) == 0:
                continue

            # Add user and trajectory IDs
            traj_df['user_id'] = user_id
            traj_df['trajectory_file'] = plt_file.stem

            # Match labels if available
            traj_df = self.match_labels_to_trajectory(traj_df, labels)

            user_trajectories.append(traj_df)

        if not user_trajectories:
            return None

        return pd.concat(user_trajectories, ignore_index=True)

    def load_all_trajectories(self, start_user=0, end_user=181):
        """Load trajectories from all users in ``start_user..end_user`` (inclusive).

        Prints progress and a summary, stores the combined DataFrame on
        ``self.all_trajectories`` and returns it.  An empty DataFrame is
        stored and returned when no user yields any data.
        """
        print("\n" + "="*60)
        print(f"LOADING GEOLIFE DATASET: Users {start_user:03d} to {end_user:03d}")
        print("="*60)

        all_data = []
        successful_users = 0
        failed_users = 0
        total_points = 0

        for user_id in tqdm(range(start_user, end_user + 1), desc="Loading users"):

            user_data = self.load_user_trajectories(user_id)

            if user_data is not None and len(user_data) > 0:
                all_data.append(user_data)
                successful_users += 1
                total_points += len(user_data)

                # Progress update every 20 users
                if (user_id + 1) % 20 == 0:
                    print(f"\n  User {user_id:03d}: {len(user_data):,} points | Total so far: {total_points:,}")
            else:
                failed_users += 1

        print("\n" + "="*60)
        print("LOADING COMPLETE")
        print("="*60)
        print(f"Successful users: {successful_users}")
        print(f"Failed/Empty users: {failed_users}")
        print(f"Total trajectory points: {total_points:,}")

        if all_data:
            combined_df = pd.concat(all_data, ignore_index=True)
        else:
            combined_df = pd.DataFrame()

        # Always refresh the cached result so a failed reload cannot leave a
        # stale frame from an earlier call behind.
        self.all_trajectories = combined_df
        return combined_df

    def get_dataset_statistics(self, df):
        """Print and return summary statistics of a combined trajectory frame.

        ``df`` must have the columns produced by ``load_all_trajectories``.
        Returns a dict with keys ``total_points``, ``num_users``,
        ``num_trajectories``, ``date_range`` (min, max) and
        ``transportation_modes`` (mode -> point count).  An empty ``df``
        yields zeroed statistics instead of raising.
        """
        print("\n" + "="*60)
        print("DATASET STATISTICS")
        print("="*60)

        if df.empty:
            # load_all_trajectories() returns a column-less frame when nothing
            # was loaded; indexing its columns would raise KeyError.
            print(f"\nTotal GPS points: {0:,}")
            return {
                'total_points': 0,
                'num_users': 0,
                'num_trajectories': 0,
                'date_range': (pd.NaT, pd.NaT),
                'transportation_modes': {}
            }

        total_points = len(df)
        num_users = df['user_id'].nunique()
        # File stems are only unique within one user's folder, so a trajectory
        # is identified by the (user, file) pair rather than the stem alone.
        num_trajectories = df[['user_id', 'trajectory_file']].drop_duplicates().shape[0]
        first_time = df['datetime'].min()
        last_time = df['datetime'].max()

        print(f"\nTotal GPS points: {total_points:,}")
        print(f"Number of users: {num_users}")
        print(f"Number of trajectory files: {num_trajectories}")

        print(f"\nDate range:")
        print(f"  Start: {first_time}")
        print(f"  End: {last_time}")
        print(f"  Duration: {last_time - first_time}")

        print(f"\nGeographic extent:")
        print(f"  Latitude: {df['latitude'].min():.6f} to {df['latitude'].max():.6f}")
        print(f"  Longitude: {df['longitude'].min():.6f} to {df['longitude'].max():.6f}")
        print(f"  Altitude: {df['altitude'].min():.2f} to {df['altitude'].max():.2f} meters")

        print(f"\nTransportation modes:")
        mode_counts = df['transportation_mode'].value_counts()
        for mode, count in mode_counts.items():
            print(f"  {mode}: {count:,} ({count/total_points*100:.2f}%)")

        print(f"\nPoints per user:")
        user_stats = df.groupby('user_id').size()
        print(f"  Mean: {user_stats.mean():.0f}")
        print(f"  Median: {user_stats.median():.0f}")
        print(f"  Min: {user_stats.min()}")
        print(f"  Max: {user_stats.max()}")

        return {
            'total_points': total_points,
            'num_users': num_users,
            'num_trajectories': num_trajectories,
            'date_range': (first_time, last_time),
            'transportation_modes': mode_counts.to_dict()
        }


# Driver script: load every GeoLife user, show a preview, and write the
# combined dataset, a random sample and a statistics summary next to each other.

# Initialize loader
data_dir = r"c:\Users\subha\Desktop\Projects\dementia_detection\data\microsoft_geolife\Data"
loader = GeoLifeDataLoader(data_dir)

print("\n" + "="*60)
print("MICROSOFT GEOLIFE GPS TRAJECTORY DATASET LOADER")
print("="*60)
print(f"\nData directory: {data_dir}")
print(f"\nThis will load ALL .plt files from 182 user folders (000-181)")
print(f"Each user has multiple trajectory files in their Trajectory/ subfolder")

# Load all trajectories
geolife_data = loader.load_all_trajectories(start_user=0, end_user=181)

# Display results
if not geolife_data.empty:
    print("\n" + "="*60)
    print("SAMPLE DATA")
    print("="*60)
    print("\nFirst 10 rows:")
    print(geolife_data.head(10))

    print("\nLast 10 rows:")
    print(geolife_data.tail(10))

    print("\nDataFrame Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    geolife_data.info()

    # Get statistics
    stats = loader.get_dataset_statistics(geolife_data)

    # Save to CSV
    output_path = r"c:\Users\subha\Desktop\Projects\dementia_detection\data\geolife_combined_dataset.csv"
    print("\n" + "="*60)
    print("SAVING DATASET")
    print("="*60)
    print(f"Saving to: {output_path}")

    geolife_data.to_csv(output_path, index=False)
    print(f"✓ Saved {len(geolife_data):,} GPS points to CSV")

    # Derive sibling file names from the path without its extension.  A plain
    # str.replace('.csv', ...) would rewrite every '.csv' occurrence in the
    # path, not only the trailing extension.
    output_base, _ = os.path.splitext(output_path)

    # Save a sample for quick testing; the fixed seed keeps the sample
    # identical between runs.
    sample_path = f"{output_base}_sample_10k.csv"
    sample_size = min(10000, len(geolife_data))
    geolife_data.sample(sample_size, random_state=42).to_csv(sample_path, index=False)
    print(f"✓ Saved sample (10k points) to: {sample_path}")

    # Save statistics
    stats_path = f"{output_base}_statistics.txt"
    with open(stats_path, 'w', encoding='utf-8') as f:
        f.write("GEOLIFE DATASET STATISTICS\n")
        f.write("="*60 + "\n\n")
        f.write(f"Total GPS points: {stats['total_points']:,}\n")
        f.write(f"Number of users: {stats['num_users']}\n")
        f.write(f"Number of trajectories: {stats['num_trajectories']}\n")
        f.write(f"\nDate range: {stats['date_range'][0]} to {stats['date_range'][1]}\n")
        f.write(f"\nTransportation modes:\n")
        for mode, count in stats['transportation_modes'].items():
            f.write(f"  {mode}: {count:,}\n")

    print(f"✓ Saved statistics to: {stats_path}")

    print("\n" + "="*60)
    print("✓ GEOLIFE DATASET LOADING COMPLETE!")
    print("="*60)
    print(f"\nFiles created:")
    print(f"  1. Full dataset: {output_path}")
    print(f"  2. Sample (10k): {sample_path}")
    print(f"  3. Statistics: {stats_path}")

else:
    print("\n❌ No data loaded. Please check the data directory path.")


MICROSOFT GEOLIFE GPS TRAJECTORY DATASET LOADER

Data directory: c:\Users\subha\Desktop\Projects\dementia_detection\data\microsoft_geolife\Data

This will load ALL .plt files from 182 user folders (000-181)
Each user has multiple trajectory files in their Trajectory/ subfolder

LOADING GEOLIFE DATASET: Users 000 to 181


Loading users:  11%|█         | 20/182 [01:31<09:03,  3.35s/it]


  User 019: 47,824 points | Total so far: 4,997,092


Loading users:  22%|██▏       | 40/182 [02:42<08:01,  3.39s/it]


  User 039: 267,737 points | Total so far: 9,206,855


Loading users:  33%|███▎      | 60/182 [03:29<02:11,  1.08s/it]


  User 059: 23,606 points | Total so far: 11,601,490


Loading users:  45%|████▍     | 81/182 [10:51<04:09,  2.47s/it]  


  User 079: 11,243 points | Total so far: 14,348,129


Loading users:  55%|█████▌    | 101/182 [14:46<01:40,  1.24s/it] 


  User 099: 1,267 points | Total so far: 16,021,938


Loading users:  67%|██████▋   | 122/182 [15:26<01:15,  1.25s/it]


  User 119: 103,734 points | Total so far: 16,533,633


Loading users:  77%|███████▋  | 140/182 [30:34<06:00,  8.59s/it]   


  User 139: 1,353 points | Total so far: 18,754,926


Loading users:  88%|████████▊ | 161/182 [47:10<10:01, 28.64s/it]   


  User 159: 38,744 points | Total so far: 22,548,990


Loading users:  99%|█████████▉| 181/182 [1:07:47<00:06,  6.23s/it]   


  User 179: 169,396 points | Total so far: 24,829,125


Loading users: 100%|██████████| 182/182 [1:07:47<00:00, 22.35s/it]



LOADING COMPLETE
Successful users: 182
Failed/Empty users: 0
Total trajectory points: 24,876,978

SAMPLE DATA

First 10 rows:
    latitude   longitude  altitude            datetime  user_id  \
0  39.984702  116.318417     492.0 2008-10-23 02:53:04        0   
1  39.984683  116.318450     492.0 2008-10-23 02:53:10        0   
2  39.984686  116.318417     492.0 2008-10-23 02:53:15        0   
3  39.984688  116.318385     492.0 2008-10-23 02:53:20        0   
4  39.984655  116.318263     492.0 2008-10-23 02:53:25        0   
5  39.984611  116.318026     493.0 2008-10-23 02:53:30        0   
6  39.984608  116.317761     493.0 2008-10-23 02:53:35        0   
7  39.984563  116.317517     496.0 2008-10-23 02:53:40        0   
8  39.984539  116.317294     500.0 2008-10-23 02:53:45        0   
9  39.984606  116.317065     505.0 2008-10-23 02:53:50        0   

  trajectory_file transportation_mode  
0  20081023025304             unknown  
1  20081023025304             unknown  
2  200810230253