<a href="https://colab.research.google.com/github/sankar-010897/Python_projects/blob/main/IISC_Internship_Test_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import time
import dask.dataframe as dd
from dask import delayed

In [None]:
#Function to remove 0 duration rows
def trip_statistics(data):
  data = data[data['started_at'] != data['ended_at']]
  data = data.reset_index(drop = True)

  #maximum duration
  max_dur = max_duration(data)

  #minimum duration & number of trips
  min_dur, trips_with_min_duration = min_duration(data)

  #circular trips percentage for non 0 duration rows
  circ_trips_percent = circular_trips_percentage(data)

  print("Maximum duration of the trip: ",max_dur," mins")
  print("Minimum duration of the trip: ", min_dur," mins")
  print("Total number of trips corresponding to minimum duration: ", trips_with_min_duration)
  print("Percentage of total circular trips: ", round(circ_trips_percent,2),"%")

In [None]:
#Function to return maximum duration
def max_duration(data):
  duration= []
  rows = len(data)
  max = (data['ended_at'][0] - data['started_at'][0]).seconds/60

  for i in range(rows):
    try:
      dur = data['ended_at'][i] - data['started_at'][i]
      duration.append(dur.seconds/60)
    except:
      print("Key Error",i,rows)
  
  for time in duration:
    if time > max:
      max = time
  
  return max

In [None]:
#Function to return minimum duration
def min_duration(data):
  duration= []
  rows = len(data)
  min = (data['ended_at'][0] - data['started_at'][0]).seconds/60
  trips = 0

  for i in range(rows):
    dur = data['ended_at'][i] - data['started_at'][i]
    duration.append(dur.seconds/60)
  
  for time in duration:
    if time < min:
      min = time
  for time in duration:
    if time == min:
      trips += 1
  
  return min, trips

In [None]:
#Percentage of circular trips
def circular_trips_percentage(data):
  r = data[data['start_lat'] == data['end_lat']]
  r = r[r['start_lng'] == r['end_lng']]
  data_rows = len(data)
  circ_rows = len(r)

  return (circ_rows / data_rows) *100

In [None]:
#Function to find the number of feasible pairs of trips
def feasible_pairs(data):
  # creating two seperate dataframes trip_A and trip_B
  trip_A = data[['trip_id', 'ended_at', 'end_lat', 'end_lng']]
  trip_A.rename(columns = {'end_lat':'lat', 'end_lng':'lng'}, inplace = True)

  trip_B = data[['trip_id', 'started_at', 'start_lat', 'start_lng']]
  trip_B.rename(columns = {'start_lat':'lat', 'start_lng':'lng'}, inplace = True)

  #using merge to get the rows in which destination of trip-A is same as starting point of trip-B
  join = pd.merge(trip_A, trip_B, how = 'inner', on=['lat','lng'])

  #filtering the join such that trip-B starting time is greater than trip-A ending time
  filter_join = join.loc[(join['started_at'] >= join['ended_at'])]
  print('The total number of feasible pairs of trips are: ', len(filter_join)) 

In [None]:
# Reading the Bike Dataset and data pre-processing
data = pd.read_csv('bike_data_new.csv')
data['started_at'] = pd.to_datetime(data['started_at'])
data['ended_at'] = pd.to_datetime(data['ended_at'])

In [None]:
#Q1.1
start_time = time.time()
trip_statistics(data)
print("--- %s seconds ---" % (time.time() - start_time))

Maximum duration of the trip:  518.0  mins
Minimum duration of the trip:  1.0  mins
Total number of trips corresponding to minimum duration:  89
Percentage of total circular trips:  2.48 %
--- 0.5475168228149414 seconds ---


In [None]:
# Q1.2
#Filtering dataset to include only the trips between 06:00 AM and 06:00 PM
filtered_data = data.loc[(data['started_at'] >= '2023-01-02 06:00:00	') & (data['started_at'] <= '2023-01-02 18:00:00	')]

In [None]:
start_time = time.time()
feasible_pairs(filtered_data)
print("--- %s seconds ---" % (time.time() - start_time))

The total number of feasible pairs of trips are:  45540
--- 0.0658421516418457 seconds ---


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip_A.rename(columns = {'end_lat':'lat', 'end_lng':'lng'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip_B.rename(columns = {'start_lat':'lat', 'start_lng':'lng'}, inplace = True)


Q.2

In [None]:
# Functions
def individual_distance_travelled(gps_individual):
    # list of trajectory_id
    traj_list = gps_individual['trajectory_id'].unique()
    # total distance of each trajectory
    traj_distance = []

    for traj_id in traj_list:
        gps_individual_traj = gps_individual[gps_individual['trajectory_id'] == traj_id]
        polar_coord = gps_individual_traj[[
            'latitude', 'longitude', 'altitude']]
        d = get_traj_length(polar_coord).compute()
        traj_distance.append(d)

    traj_distance = np.array(traj_distance)
    # sum of total distances
    return np.sum(traj_distance)


@delayed
def get_traj_length(polar_coord):
    polar_coord = polar_coord.reset_index(drop=True)
    traj_length = []
    n = len(polar_coord)

    for i in range(0, n-1):
        p1 = polar_coord.iloc[i].values
        p2 = polar_coord.iloc[i+1].values
        dist = dist_bw_points(p1, p2)
        traj_length.append(dist)
    traj_length = np.array(traj_length)
    return np.sum(traj_length)


def dist_bw_points(p1, p2):
    p1 = polar_to_cartesian(p1)
    p2 = polar_to_cartesian(p2)

    dist = np.linalg.norm(p1-p2)
    return dist


def polar_to_cartesian(p):
    lat, lng, alt = p[0], p[1], p[2]
    # compute xyz
    x = alt * np.cos(lat) * np.sin(lng)
    y = alt * np.sin(lat)
    z = alt * np.cos(lat) * np.cos(lng)
    return np.array([x, y, z])


# Pre-Processing
start_time = time.time()
gps_data = dd.read_csv('combined_trajectories.csv')
user_distance_list = []
print("--- %s seconds for pre processing---" % (time.time() - start_time))
# for part-1
for i in range(1, 182):
    gps_individual = gps_data[gps_data['individual_id'] == i]
    gps_individual = gps_individual.reset_index(drop=True)
    start_time = time.time()
    d = delayed(individual_distance_travelled)(gps_individual).compute()
    # d = individual_distance_travelled(gps_individual) with parallelization using dask
    user_distance_list.append([i, d])
    print("--- %s seconds ---" % (time.time() - start_time))
    print('The distance travelled by user', i, ': ', d)
