In [None]:
import os
import csv
import time

def preprocessing():
    params = {
        'path': '/content/dataset_TSMC2014_NYC.csv',
        'dataname': 'NYC',
        'filetype': 'csv',
        'user_po': 'userId',
        'loc_po': 'venueId',
        'tim_po': 'utcTimestamp',
        'user_record_min': 10,
        'loc_record_min': 10
    }

    # Loading and Filtering sparse data
    print('='*20, 'Loading and preprocessing sparse data')

    # Specify the path of the dataset
    filepath = f'/content/dataset_TSMC2014_NYC.csv'
    print(f'Path is {filepath}')

    loc_count = {}  # store location info with loc-num
    user_count = {}  # store user info with user-num
    user_id = {}
    loc_id = {}

    # Load file and count numbers
    print('='*20, 'Loading and Counting')
    if params['filetype'] == 'csv':
        with open(filepath, 'r', encoding='latin-1') as f:
            reader = csv.DictReader(f)
            for record in reader:
                user = record[params['user_po']]
                loc = record[params['loc_po']]

                # Count the occurrences of each user
                if user not in user_count:
                    user_count[user] = 1
                else:
                    user_count[user] += 1

                # Count the occurrences of each location
                if loc not in loc_count:
                    loc_count[loc] = 1
                else:
                    loc_count[loc] += 1

    record_num = sum(user_count.values())
    print(f'Finished, records: {record_num}, unique users: {len(user_count)}, unique locations: {len(loc_count)}')

    # Filter and encode user and location
    print('='*20, 'Filtering and encoding')

    # Filter users based on the minimum number of records
    for user in user_count:
        if user_count[user] > params['user_record_min']:
            user_id[user] = len(user_id)

    # Filter locations based on the minimum number of records
    for loc in loc_count:
        if loc_count[loc] > params['loc_record_min']:
            loc_id[loc] = len(loc_id)

    filter_path = f'/content/dataset_TSMC2014_NYC_filtered.csv'
    print(f'Filter path is {filter_path}')

    # Write the filtered records to a new file
    with open(filter_path, 'w', encoding='latin-1', newline='') as f_out:
        fieldnames = [params['user_po'], params['loc_po']] + [col for col in reader.fieldnames if col not in [params['user_po'], params['loc_po']]]
        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()
        with open(filepath, 'r', encoding='latin-1') as f_in:
            reader = csv.DictReader(f_in)
            for record in reader:
                user = record[params['user_po']]
                loc = record[params['loc_po']]

                # If the user and location are in the filtered list, encode them with unique IDs
                if user in user_id and loc in loc_id:
                    record[params['user_po']] = user_id[user]
                    record[params['loc_po']] = loc_id[loc]
                    writer.writerow(record)

    record_num = sum(1 for _ in open(filter_path, 'r'))
    print(f'Finished, records: {record_num}, unique users: {len(user_id)}, unique locations: {len(loc_id)}')

    # Merge data
    print('='*20, 'Merging')

    merge_path = f'/content/dataset_TSMC2014_NYC_merged.csv'
    print(f'Merge path is {merge_path}')

    # Write the merged records to a new file
    with open(merge_path, 'w', encoding='latin-1', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()
        with open(filter_path, 'r', encoding='latin-1') as f_in:
            reader = csv.DictReader(f_in)
            pre_record = next(reader)
            writer.writerow(pre_record)
            for record in reader:

                # Skip the record if it has the same user, location, and date as the previous record
                if record[params['user_po']] == pre_record[params['user_po']] and \
                   record[params['loc_po']] == pre_record[params['loc_po']] and \
                   record[params['tim_po']].split('T')[0] == pre_record[params['tim_po']].split('T')[0]:
                    continue

                writer.writerow(record)
                pre_record = record

    record_num = sum(1 for _ in open(merge_path, 'r'))
    print(f'Finished, records: {record_num}')


# Run the preprocessing once and report how long it took, in whole seconds.
start_time = time.time()
preprocessing()
elapsed_seconds = time.time() - start_time
print('Time cost:', f'{elapsed_seconds:.0f}')


Time cost: 0


Initialize dictionaries to store location and user information.
Load the file and count the occurrences of each user and location. Filter users and locations based on the minimum number of records specified in the parameters. Write the filtered records to a new file, encoding the user and location with unique IDs. Write the merged records to a new file, skipping records that have the same user, location, and date as the previous record.

In [None]:
import datetime
import argparse
import csv
import numpy as np
import pickle
from math import pi
from collections import Counter

class DataGeneration(object):
    """Build per-user train/test samples from the merged check-in CSV.

    Typical use: ``load_trajectory()`` -> ``filter_and_divide_sessions()`` ->
    ``preprocess_and_split()`` -> ``save_preprocessed_data()``.

    The settings object must provide ``cat_contained``, ``user_record_min``,
    ``poi_record_min``, ``time_format``, ``session_threshold`` (minutes),
    ``session_minlen`` and ``train_ratio``; each becomes an attribute.
    """

    def __init__(self, params):
        # Accept either a plain dict or a namespace-like object.
        self.__dict__.update(params if isinstance(params, dict) else vars(params))

        self.raw_data = {}     # raw user's trajectory. {uid: [[pid, tim(, cat)], ...]}
        self.poi_count = {}   # raw location counts. {pid: count}
        self.data_filtered = {}  # {uid: [[encoded pid, tim(, cat)], ...]} records of valid sessions
        self.uid_list = []   # filtered user id
        self.pid_dict = {}   # filtered location id map {raw pid: encoded pid (1-based)}
        self.train_data = {} # train samples, {uid: {'loc': [], 'tim': [], 'target': []}}
        self.train_id = {}   # {uid: [encoded pid of every train record]}
        self.test_data = {}
        self.test_id = {}
        self.tim_w = set()
        self.tim_h = set()

        self.raw_lat_lon = {} # {raw pid: [latitude, longitude]} in degrees
        self.new_lat_lon = {}      # {raw pid: [lat, lon]} min-max normalised
        self.lat_lon_radians = {}  # {raw pid: [lat, lon]} in radians

        self.raw_cat_dict = {}  #  cid-cat
        self.new_cat_dict = {}
        self.pid_cat_dict = {}    # filled from raw_cat_dict (see filter_and_divide_sessions)

    def _parse_time(self, value):
        """Parse one record timestamp with the configured ``time_format``."""
        return datetime.datetime.strptime(value, self.time_format)

    # 1. read trajectory data
    def load_trajectory(self, path='/content/dataset_TSMC2014_NYC_merged.csv'):
        """Read the merged check-in CSV into the raw dictionaries.

        Every row must have eight columns: user id, venue id, category id,
        category name, latitude, longitude, one ignored column, timestamp.
        A leading header row (first row with non-numeric coordinates) is
        skipped, as are blank lines.

        Args:
            path: CSV file to read; defaults to the merged NYC file.

        Raises:
            ValueError: if a row does not have eight columns, or a row other
                than the first has non-numeric coordinates.
        """
        # latin-1 matches the encoding the merged file is written with.
        with open(path, 'r', encoding='latin-1', newline='') as csv_file:
            reader = csv.reader(csv_file)
            for i, row in enumerate(reader):
                if not row:
                    continue
                # The last column is later parsed with time_format, so it is
                # the check-in timestamp; the seventh column is not used.
                (user_id, venue_id, category_id, category,
                 latitude, longitude, _, timestamp) = row

                # float() instead of eval(): never execute file content.
                try:
                    coordinates = [float(latitude), float(longitude)]
                except ValueError:
                    if i == 0:
                        continue  # header row
                    raise

                record = [venue_id, timestamp]
                if self.cat_contained:
                    record.append(category)
                    # remember the first name seen for every category id
                    self.raw_cat_dict.setdefault(category_id, category)
                self.raw_data.setdefault(user_id, []).append(record)

                self.poi_count[venue_id] = self.poi_count.get(venue_id, 0) + 1

                # keep the first coordinates seen for every venue
                self.raw_lat_lon.setdefault(venue_id, coordinates)

    # 2. filter users and locations, and then split trajectory into sessions
    def filter_and_divide_sessions(self):
        """Keep records that belong to sufficiently long sessions and encode POIs.

        Per user with more than ``user_record_min`` records, the records are
        ordered chronologically and cut into sessions wherever two consecutive
        records are more than ``session_threshold`` minutes apart.  Sessions
        shorter than ``session_minlen`` are discarded; the records of the
        remaining sessions are stored, flattened, in ``data_filtered``.
        Records at POIs with at most ``poi_record_min`` visits are then
        dropped and the remaining POI ids replaced by 1-based integers.
        ``poi_count`` is finally recomputed over the encoded records.
        """
        uid_list = [uid for uid, records in self.raw_data.items()
                    if len(records) > self.user_record_min]
        pid_list = [pid for pid, count in self.poi_count.items()
                    if count > self.poi_record_min]

        for uid in uid_list:
            user_records = self.raw_data[uid]

            # Order by the parsed time: the raw strings do not sort
            # chronologically for formats that start with a weekday name.
            stamped = sorted(
                ((self._parse_time(record[1]), record) for record in user_records),
                key=lambda pair: pair[0],
            )
            user_records[:] = [record for _, record in stamped]

            valid_records = []
            session = []
            previous_time = None
            for moment, record in stamped:
                if previous_time is not None:
                    gap_minutes = (moment - previous_time).total_seconds() / 60
                    if gap_minutes > self.session_threshold:
                        # close the running session; this record opens the next
                        if len(session) >= self.session_minlen:
                            valid_records.extend(session)
                        session = []
                session.append(record)
                previous_time = moment

            # the trailing session is validated like any other
            if len(session) >= self.session_minlen:
                valid_records.extend(session)

            if valid_records:
                self.data_filtered[uid] = valid_records

        # generate location id map (0 is left unused)
        for i, pid in enumerate(pid_list):
            self.pid_dict[pid] = i + 1

        # Re-encode location ids on copies so raw_data keeps the raw ids;
        # users left without any known POI are removed altogether.
        encoded = {}
        for uid, records in self.data_filtered.items():
            kept = [[self.pid_dict[record[0]]] + list(record[1:])
                    for record in records if record[0] in self.pid_dict]
            if kept:
                encoded[uid] = kept
        self.data_filtered = encoded

        # generate filtered user list
        self.uid_list = list(self.data_filtered)

        # NOTE(review): this copies raw_cat_dict, whose keys are category ids,
        # not POI ids, despite the attribute name - confirm the intended mapping.
        self.pid_cat_dict.update(self.raw_cat_dict)

        # update poi_count based on filtered data
        self.poi_count = Counter(
            record[0] for records in self.data_filtered.values() for record in records
        )

    # 3. preprocess data and split into train/test sets
    def preprocess_and_split(self):
        """Normalise POI coordinates and split every user's records.

        Coordinates that cannot be converted to float are skipped.  The rest
        are converted to radians (``lat_lon_radians``) and min-max scaled to
        [0, 1] (``new_lat_lon``); a zero-width range maps to 0.0.

        The first ``int(len(records) * train_ratio)`` records of a user form
        the train part and the rest the test part.  A part is only stored
        when it has at least ``session_minlen`` records.
        """
        in_radians = {}
        for pid, (lat, lon) in self.raw_lat_lon.items():
            try:
                in_radians[pid] = (float(lat) * (pi / 180), float(lon) * (pi / 180))
            except (TypeError, ValueError):
                continue

        if in_radians:
            lat_values = [lat for lat, _ in in_radians.values()]
            lon_values = [lon for _, lon in in_radians.values()]
            lat_min = min(lat_values)
            lon_min = min(lon_values)
            lat_span = max(lat_values) - lat_min
            lon_span = max(lon_values) - lon_min

            for pid, (lat_rad, lon_rad) in in_radians.items():
                # guard against a single distinct latitude/longitude
                self.new_lat_lon[pid] = [
                    (lat_rad - lat_min) / lat_span if lat_span else 0.0,
                    (lon_rad - lon_min) / lon_span if lon_span else 0.0,
                ]
                self.lat_lon_radians[pid] = [lat_rad, lon_rad]

        for uid, records in self.data_filtered.items():
            split_at = int(len(records) * self.train_ratio)
            train_records = records[:split_at]
            test_records = records[split_at:]

            if len(train_records) >= self.session_minlen:
                self.train_data[uid] = self.generate_samples(train_records)
                self.train_id[uid] = [record[0] for record in train_records]

            if len(test_records) >= self.session_minlen:
                self.test_data[uid] = self.generate_samples(test_records)
                self.test_id[uid] = [record[0] for record in test_records]

    def generate_samples(self, sessions):
        """Turn records into parallel ``loc`` / ``tim`` / ``target`` lists.

        One sample is produced per record, in order, so the lists line up
        index by index with the id lists built in ``preprocess_and_split``.
        ``loc`` and ``target`` both hold the record's category when it has
        one, otherwise its location id.

        Args:
            sessions: list of ``[pid, tim]`` or ``[pid, tim, cat]`` records.

        Returns:
            dict with keys ``'loc'``, ``'tim'`` and ``'target'``.
        """
        samples = {'loc': [], 'tim': [], 'target': []}
        for record in sessions:
            label = record[2] if len(record) > 2 else record[0]
            samples['loc'].append(label)
            samples['tim'].append(record[1])
            samples['target'].append(label)
        return samples

    def save_preprocessed_data(self, path='/content/dataset_TSMC2014_NYC_preprocessed.csv'):
        """Write all train samples, then all test samples, to one CSV file.

        Columns are ``uid, sid, loc, tim, target``; ``sid`` is taken from
        ``train_id`` / ``test_id``.

        Args:
            path: destination file; defaults to the preprocessed NYC file.
        """
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['uid', 'sid', 'loc', 'tim', 'target'])  # Write header

            for data, ids in ((self.train_data, self.train_id),
                              (self.test_data, self.test_id)):
                for uid, samples in data.items():
                    # zip stops at the shortest list, so ragged input cannot
                    # raise an IndexError
                    writer.writerows(
                        [uid, sid, loc, tim, target]
                        for sid, loc, tim, target in zip(
                            ids[uid], samples['loc'], samples['tim'], samples['target']
                        )
                    )


def main():
    """Run the whole sample-generation pipeline and print a small preview.

    Builds the NYC settings, runs the four DataGeneration steps in order
    (load, filter/sessionise, split, save) and prints the first user ids,
    the location id map and the first user's train/test samples.
    """
    params = argparse.Namespace(
        data_name='NYC',
        train_ratio=0.8,
        session_threshold=30,
        session_minlen=2,
        user_record_min=1,
        poi_record_min=1,
        cat_contained=True,
        time_format="%a %b %d %H:%M:%S +0000 %Y"
    )
    # Create an instance of DataGeneration class with the settings namespace
    data_generation = DataGeneration(params)
    data_generation.load_trajectory()
    data_generation.filter_and_divide_sessions()
    data_generation.preprocess_and_split()
    data_generation.save_preprocessed_data()

    print("Sample Data:")
    print("User IDs:", data_generation.uid_list[:5])
    print("Location ID Map:", data_generation.pid_dict)

    # Nothing to preview when filtering removed every user.
    if not data_generation.uid_list:
        return

    # train_data / test_data only hold users whose split is long enough, so
    # the first user may be missing from either; .get() prints None instead
    # of raising KeyError.
    first_uid = data_generation.uid_list[0]
    print("Train Data:", data_generation.train_data.get(first_uid))
    print("Test Data:", data_generation.test_data.get(first_uid))

# Entry point: run the CSV filtering/merging step, then generate the samples.
if __name__ == '__main__':
    preprocessing()  # writes the merged CSV that main() loads
    main()


Step 1: Load Trajectory Data: The load_trajectory method reads the trajectory data from a CSV file (dataset_TSMC2014_NYC_merged.csv). It iterates over the records and extracts information such as user ID, venue ID, venue category ID, latitude, longitude, and timezone offset. The data is stored in various dictionaries to keep track of raw data, location counts, category mappings, etc.

Step 2: Filter and Divide Sessions: The filter_and_divide_sessions method filters the users and locations based on certain criteria (minimum user records and minimum location records). It then divides the filtered user trajectories into sessions based on the session threshold (maximum time gap between consecutive records) and session minimum length. Valid sessions are stored in the data_filtered dictionary.

Step 3: Preprocess and Split Data: The preprocess_and_split method preprocesses the data and splits it into train and test sets. It calculates latitude and longitude ranges, preprocesses latitude and longitude values by normalizing them, and stores the new latitude-longitude mappings. Then, it iterates over the filtered user trajectories and splits them into train and test sessions based on the train ratio provided in the params. For each user, samples are generated by converting location IDs to category IDs and creating target sequences. The train and test data are stored in the train_data and test_data dictionaries, respectively.

Saving Preprocessed Data: The save_preprocessed_data method saves the preprocessed data into a CSV file (dataset_TSMC2014_NYC_preprocessed.csv). It writes the header and iterates over the train and test data, writing the user ID, session ID, location, timestamp, and target into the file.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
dataset = pd.read_csv('/content/dataset_TSMC2014_NYC_preprocessed.csv')

# Convert tim column to datetime with error handling.
# This happens BEFORE sorting: the raw strings do not sort chronologically.
dataset['tim'] = pd.to_datetime(dataset['tim'], errors='coerce')

# Remove rows with invalid tims so they cannot become anyone's "next" location
dataset = dataset.dropna(subset=['tim'])

# Sort the dataset by user and (parsed) timestamp
dataset = dataset.sort_values(['uid', 'tim'])

# Create 'Current Location' and 'Next Location' columns
dataset['Current Location'] = dataset['loc']
dataset['Next Location'] = dataset.groupby('uid')['loc'].shift(-1)

# Drop the rows with missing next locations (the last record of every user)
dataset = dataset.dropna(subset=['Next Location'])

# Extract features from tim
dataset['Hour'] = dataset['tim'].dt.hour
dataset['Minute'] = dataset['tim'].dt.minute
dataset['DayOfWeek'] = dataset['tim'].dt.dayofweek

# Encode categorical features.
# The encoder is fitted on the union of both columns: a label that only ever
# occurs as a "next" location would otherwise make transform() fail.
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([dataset['Current Location'], dataset['Next Location']]))
dataset['Current Location'] = label_encoder.transform(dataset['Current Location'])
dataset['Next Location'] = label_encoder.transform(dataset['Next Location'])

# Split the dataset into features (X) and target (y)
X = dataset[['Current Location', 'Hour', 'Minute', 'DayOfWeek']]
y = dataset['Next Location']

# Split the encoded features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using Min-Max scaler (fitted on the train part only)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Reshape the input data for LSTM: (samples, 1 time step, features)
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

# Build the LSTM model
# NOTE(review): a single linear output trained with MSE treats the encoded
# label id as a number; later cells turn the prediction back into a label.
model = Sequential()
model.add(LSTM(64, input_shape=(1, X_train.shape[2])))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model
loss = model.evaluate(X_test, y_test)

# Make predictions
predictions = model.predict(X_test)

# Print the predictions
print(predictions)


In [None]:
# Inverse transform the numerical predictions to categorical labels.
# Predictions are rounded to the nearest label id (astype(int) alone would
# truncate toward zero) and clipped to the known id range, because
# inverse_transform raises on ids the encoder has never seen.
predicted_indices = np.clip(
    np.rint(predictions.flatten()), 0, len(label_encoder.classes_) - 1
).astype(int)
predicted_labels = label_encoder.inverse_transform(predicted_indices)

# Print the predicted labels (locations)
print(predicted_labels)

['Miscellaneous Shop' 'Miscellaneous Shop' 'Hotel' ... 'Market'
 'General Travel' 'German Restaurant']


In [None]:
# Import the module rather than the class: `from datetime import datetime`
# would rebind the name `datetime`, which DataGeneration uses as a module
# (datetime.datetime.strptime).
import datetime

# Get the current time information
now = datetime.datetime.now()
hour = now.hour
minute = now.minute
day_of_week = now.weekday()

# Preprocess the input location
input_location = 'Food & Drink Shop'
preprocessed_location = label_encoder.transform([input_location])[0]

# Create a sample input with relevant features
sample_input = np.array([[preprocessed_location, hour, minute, day_of_week]])

# Scale the features
sample_input = scaler.transform(sample_input)

# Reshape the sample input
sample_input = np.reshape(sample_input, (sample_input.shape[0], 1, sample_input.shape[1]))

# Use the trained model to predict the next location
predicted_next_location = model.predict(sample_input)

# Round the prediction to the nearest label id and clip it to the known id
# range; plain astype(int) truncates and can yield ids that
# inverse_transform rejects.
predicted_index = int(np.clip(
    np.rint(predicted_next_location.flatten()[0]), 0, len(label_encoder.classes_) - 1
))

# Inverse transform the predicted numerical value to the original categorical label
predicted_next_location_label = label_encoder.inverse_transform([predicted_index])

# NOTE(review): the model has a single output unit, so this holds one value
# per test row rather than per-class probabilities.  The indices below are
# therefore positions in X_test, not label ids - the resulting list is not a
# real ranking and is not printed.
predicted_probabilities = model.predict(X_test)

# Take the six largest values so that five remain if one is excluded
top_5_indices = np.argsort(-predicted_probabilities.flatten())[:6]

# Filter out unseen labels and the predicted next location, and get the corresponding labels
top_5_next_locations = []
for index in top_5_indices:
    # compare against the predicted label id, not the raw prediction array
    if index < len(label_encoder.classes_) and index != predicted_index:
        next_location = label_encoder.inverse_transform([index])[0]
        top_5_next_locations.append(next_location)

# Print the predicted next location
print('Predicted Next Location:', predicted_next_location_label)


Load the preprocessed dataset from the CSV file.
Sort the dataset by user and timestamp to ensure chronological order of the records.
Create 'Current Location' and 'Next Location' columns based on the 'loc' column. These columns represent the current and subsequent locations for each user.
Drop the rows with missing next locations (last location for each user).
Convert the 'tim' column to datetime format with error handling.
Remove rows with invalid timestamps.
Extract additional features from the 'tim' column, such as hour, minute, and day of the week.
Encode the categorical features ('Current Location' and 'Next Location') using LabelEncoder to convert them into numeric values.
Split the dataset into features (X) and target (y).
Split the encoded features and target into train and test sets using train_test_split.
Scale the features using Min-Max scaler to normalize the values.
Reshape the input data for the LSTM model by adding an additional dimension.
Build the LSTM model using the Sequential model from Keras. The model consists of an LSTM layer with 64 units and a Dense layer with 1 unit.
Compile the model with mean squared error (MSE) as the loss function and Adam optimizer.
Train the model on the training data for 10 epochs with a batch size of 32.
Evaluate the trained model on the test set and calculate the loss.
Make predictions using the trained model on the test set.

In [None]:
# Get the predicted next location for every test row
predicted_next_location = model.predict(X_test)

# Convert predictions to label ids: round to the nearest id (astype(int)
# alone truncates toward zero) and clip to the known range so that
# inverse_transform cannot raise on unseen ids.
predicted_indices = np.clip(
    np.rint(predicted_next_location.flatten()), 0, len(label_encoder.classes_) - 1
).astype(int)

# Convert predicted labels to categorical values
predicted_labels = label_encoder.inverse_transform(predicted_indices)

# Convert actual labels to categorical values
actual_labels = label_encoder.inverse_transform(y_test)

# Calculate accuracy as a percentage of exact label matches
accuracy = (predicted_labels == actual_labels).mean() * 100

# Print accuracy
print('Accuracy:', accuracy)


Accuracy: 0.8605234851201148
