# Assignment 3
* Please use an example dataset of travel diary data (linked trip) which are uploaded on LearnUS 
    * Class Files -> Travel diary -> Travel_data_Linked.csv
* For detail information of each column, check “Travel_data_Linked_description.xlsx”)

# Set up

## Import

In [39]:
import os
import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import tree

from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz

## Load Dataset

In [40]:
data_dir = 'D:/OneDrive - 연세대학교 (Yonsei University)/Lectures/2022-2_모빌리티데이터분석및활용/dataset'

In [41]:
data_file = 'Travel_data_Linked.csv'

In [42]:
data_path = os.path.join(data_dir, data_file)

In [43]:
df = pd.read_csv(data_path)

# Q1
* Develop a decision tree model of forecasting trip purpose by considering the following conditions:
    - Use the variables included in the dataset only (Do not use external data)
    - Create new variables (i.e. features) using the variables if necessary (e.g., travel time, etc.)
    - Apply F1-score as the measurement of the performance
*  Submit Jupyter Notebook file of your model and results
    - (You don’t need to submit the tree structure, but F1-score should be stated in the results.)

## Create Variables

### Trip
* `Trip_order` : The order of the number of daily trips of a specific person (`P_ID`)
* `Trip_final` : A dummy variable representing 1 if it is the last trip of a specific person (`P_ID`) and 0 otherwise.
* `Trip_start` : A dummy variable indicating 1 if it is the first trip among daily passes of a specific person (`P_ID`), and 0 otherwise.
* `Trip_time` : Travel time (minutes)

In [44]:
df.reset_index(inplace = True)

df['Trip_order'] = None
df['Trip_final'] = None

House_list = df['HH_ID'].unique()

for house in tqdm(House_list):
    
    house_trip = df[df['HH_ID'] == house]
    person_list = house_trip['P_ID'].unique()
    
    for person in person_list:
        person_trip = house_trip[house_trip['P_ID'] == person]
        
        i = 0
        
        for trip in range(len(person_trip)):
            
            trip_time = person_trip.iloc[trip]
            idx = person_trip.iloc[i]['index']
            
            df.at[idx, 'Trip_order'] = i
            df.at[idx, 'Trip_final'] = i
            
            i += 1
        
        df.at[idx, 'Trip_final'] = 1
        
df['Trip_final'] = df['Trip_final'].fillna(0)

100%|█████████████████████████████████████████████████████████████████████████████| 7390/7390 [00:28<00:00, 260.32it/s]


In [45]:
def Trip_start(Trip_order):
    if Trip_order == 0:
        return 1
    else:
        return 0
    
df['Trip_start'] = df['Trip_order'].apply(Trip_start)

In [46]:
def Trip_time_calculate(Trip_a_P, Trip_a_hh, Trip_a_mm, Trip_d_P, Trip_d_hh, Trip_d_mm):
    Trip_d = Trip_d_hh * 60 + Trip_d_mm
    Trip_a = Trip_a_hh * 60 + Trip_a_mm
    
    if Trip_a_P == Trip_d_P: # 둘의 오전, 오후 포함 시간대가 같은 경우
        Trip_time = Trip_a - Trip_d
        
    elif Trip_a_P == 1 and Trip_d_P == 2: #오전 출발, 오후 도착
        Trip_time = (Trip_a + 12 * 60) - Trip_d
        
    elif Trip_a_P == 2 and Trip_d_P == 3: #오후 출발, 다음날 도착
        Trip_time = (Trip_a + 24 * 60) - (Trip_d + 12 * 60)
        
    elif Trip_a_P == 1 and Trip_d_P == 3: #오전 출발, 다음날 도착
        Trip_time = (Trip_a + 24 * 60) - Trip_d
        
    else:
        Trip_time = None
        
    return Trip_time

df['Trip_time'] = df.apply(lambda x : Trip_time_calculate(x['Trip_a_P'], x['Trip_a_hh'], x['Trip_a_mm'], x['Trip_d_P'], x['Trip_d_hh'], x['Trip_d_mm']) , axis = 1 )

In [47]:
def Time_d_hour(Trip_d_hh, Trip_d_P):
    if Trip_d_P == 1:
        return Trip_d_hh
    
    elif Trip_d_P == 2:
        return (Trip_d_hh + 12)
    
    elif Trip_d_P == 3:
        return (Trip_d_hh + 24)
    
    else:
        pass
    
def Time_a_hour(Trip_a_hh, Trip_a_P):
    if Trip_a_P == 1:
        return Trip_a_hh
    
    elif Trip_a_P == 2:
        return (Trip_a_hh + 12)
    
    elif Trip_a_P == 3:
        return (Trip_a_hh + 24)
    
    else:
        pass

In [48]:
df['Trip_d_hour_24'] = df.apply(lambda x : Time_d_hour(x['Trip_d_hh'], x['Trip_d_P']), axis = 1)
df['Trip_a_hour_24'] = df.apply(lambda x : Time_a_hour(x['Trip_a_hh'], x['Trip_a_P']), axis = 1)

In [49]:
df.head(10)

Unnamed: 0,index,No,HH_ID,HHsize,HHsize5,HHinc,Ncars,P_ID,HHrel,Age,...,Trip_a_P,Trip_a_hh,Trip_a_mm,Trip_a_D,Trip_order,Trip_final,Trip_start,Trip_time,Trip_d_hour_24,Trip_a_hour_24
0,0,1,4889,2,2,4,1,1,1,36,...,1.0,7.0,30.0,1168064000.0,0,0,1,60.0,6.0,7.0
1,1,2,4889,2,2,4,1,1,1,36,...,2.0,9.0,40.0,1138060000.0,1,1,0,40.0,21.0,21.0
2,2,3,4889,2,2,4,1,2,2,36,...,1.0,8.0,30.0,1156055000.0,0,0,1,30.0,8.0,8.0
3,3,4,4889,2,2,4,1,2,2,36,...,2.0,8.0,0.0,1138060000.0,1,1,0,30.0,19.0,20.0
4,4,5,4918,4,3,4,1,1,1,44,...,1.0,7.0,35.0,1162070000.0,0,0,1,45.0,6.0,7.0
5,5,6,4918,4,3,4,1,1,1,44,...,2.0,8.0,10.0,1138060000.0,1,1,0,40.0,19.0,20.0
6,6,7,4918,4,3,4,1,2,2,38,...,2.0,2.0,15.0,1144073000.0,0,0,1,15.0,14.0,14.0
7,7,8,4918,4,3,4,1,2,2,38,...,2.0,4.0,20.0,1138060000.0,1,1,0,20.0,16.0,16.0
8,8,9,4918,4,3,4,1,3,3,7,...,1.0,8.0,15.0,1141070000.0,0,0,1,15.0,8.0,8.0
9,9,10,4918,4,3,4,1,3,3,7,...,2.0,1.0,15.0,1138060000.0,1,1,0,15.0,13.0,13.0


### Other Dummy Variables

In [50]:
def Old(Age):
    if Age >= 65:
        return 1
    else:
        return 0

In [51]:
def Dummy_car_ownership(Ncars):
    if Ncars == 1: # Car own
        return 1
    elif Ncars == 2: # Do not own
        return 0
    else:
        pass

In [52]:
def Dummy_HHrel_1(HHrel):
    if HHrel == 1: # 'Householder'
        return 1
    else:
        return 0
    
def Dummy_HHrel_2(HHrel):
    if HHrel == 2: # 'Spouse'
        return 1
    else:
        return 0
    
def Dummy_HHrel_3(HHrel):
    if HHrel == 3: # 'Children'
        return 1
    else:
        return 0
    
def Dummy_HHrel_4(HHrel):
    if HHrel == 4: #'Parents'
        return 1
    else:
        return 0
    
def Dummy_HHrel_5(HHrel):
    if HHrel == 5: #'etc.'
        return 1
    else:
        return 0

In [53]:
def Dummy_Gender(Gender):
    if Gender == 1: # Male
        return 1
    elif Gender == 2: # Female
        return 0
    else:
        pass

In [54]:
def Dummy_License(License):
    if License == 1: # Own
        return 1
    elif License == 2: # Do not own
        return 0
    else:
        pass

In [55]:
def Dummy_Student(School):
    if School == 1 or School == 2 or School == 3 or School == 4: # Student
        return 1
    else:
        return 0

In [56]:
def Dummy_Job_specialized(Job):
    if Job == 1: # Specialized Job
        return 1
    else:
        return 0

def Dummy_Job_normal(Job):
    if Job == 2 or Job == 3 or Job == 4 or Job == 6: # Retail, Service, Management, Simple Labor
        return 1
    else:
        return 0
    
def Dummy_Job_home(Job):
    if Job == 7: # Homeworker
        return 1
    else:
        return 0

In [57]:
def Dummy_Workdays_full(Workdays):
    if Workdays == 1 or Workdays == 2: # 5~7 days per week
        return 1
    else:
        return 0

In [58]:
def Dummy_Workhours_1(Workhours):
    if Workhours == 1: # Full time
        return 1
    else:
        return 0

def Dummy_Workhours_2(Workhours):
    if Workhours == 2: # part-time(less than 8 hours)
        return 1
    else:
        return 0

In [59]:
def Dummy_Trip_made(Trip_made):
    if Trip_made == 1: # Traveled
        return 1
    elif Trip_made == 2: # Not Traveled
        return 0
    else:
        pass

In [60]:
def Dummy_Trip_d_P_1(Trip_d_P):
    if Trip_d_P == 1: # AM
        return 1
    else:
        return 0
    
def Dummy_Trip_d_P_2(Trip_d_P):
    if Trip_d_P == 2: # PM
        return 1
    else:
        return 0
    
def Dummy_Trip_d_P_3(Trip_d_P):
    if Trip_d_P == 3: # AM(following day)
        return 1
    else:
        return 0

In [61]:
df['Old'] = df['Age'].apply(Old)

df['Car_ownership'] = df['Ncars'].apply(Dummy_car_ownership)

df['HHrel_1'] = df['HHrel'].apply(Dummy_HHrel_1)
df['HHrel_2'] = df['HHrel'].apply(Dummy_HHrel_2)
df['HHrel_3'] = df['HHrel'].apply(Dummy_HHrel_3)
df['HHrel_4'] = df['HHrel'].apply(Dummy_HHrel_4)
df['HHrel_5'] = df['HHrel'].apply(Dummy_HHrel_5)

df['Gender_DM'] = df['Gender'].apply(Dummy_Gender)
df['License_DM'] = df['Licence'].apply(Dummy_License)

df['Student'] = df['School'].apply(Dummy_Student)

df['Job_Specialized'] = df['Job'].apply(Dummy_Job_specialized)
df['Job_Normal'] = df['Job'].apply(Dummy_Job_normal)
df['Job_Home'] = df['Job'].apply(Dummy_Job_home)

df['Workdays_Full'] = df['Workdays'].apply(Dummy_Workdays_full)

df['Workhours_1'] = df['Workhours'].apply(Dummy_Workhours_1)

In [62]:
df.head(3)

Unnamed: 0,index,No,HH_ID,HHsize,HHsize5,HHinc,Ncars,P_ID,HHrel,Age,...,HHrel_4,HHrel_5,Gender_DM,License_DM,Student,Job_Specialized,Job_Normal,Job_Home,Workdays_Full,Workhours_1
0,0,1,4889,2,2,4,1,1,1,36,...,0,0,1,1,0,0,1,0,1,1
1,1,2,4889,2,2,4,1,1,1,36,...,0,0,1,1,0,0,1,0,1,1
2,2,3,4889,2,2,4,1,2,2,36,...,0,0,0,1,0,0,1,0,1,1


In [63]:
df.head(3)

Unnamed: 0,index,No,HH_ID,HHsize,HHsize5,HHinc,Ncars,P_ID,HHrel,Age,...,HHrel_4,HHrel_5,Gender_DM,License_DM,Student,Job_Specialized,Job_Normal,Job_Home,Workdays_Full,Workhours_1
0,0,1,4889,2,2,4,1,1,1,36,...,0,0,1,1,0,0,1,0,1,1
1,1,2,4889,2,2,4,1,1,1,36,...,0,0,1,1,0,0,1,0,1,1
2,2,3,4889,2,2,4,1,2,2,36,...,0,0,0,1,0,0,1,0,1,1


### Trip Purpose

In [64]:
def Trip_purpose(Trip_purpG):
    if Trip_purpG == 1:
        return 'Work'
    
    elif Trip_purpG == 2:
        return 'Education'
    
    elif Trip_purpG == 3:
        return 'Business'
    
    elif Trip_purpG == 4:
        return 'Shopping/social/leisure'
    
    elif Trip_purpG == 5:
        return 'Back-home'
    
    elif Trip_purpG == 6:
        return 'Other'
    
    else:
        pass

In [65]:
df['Trip_Purpose'] = df['Trip_purpG'].apply(Trip_purpose)

### Data Filtering

In [66]:
full = df[df['Trip_made'] == 1]

In [67]:
full.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full.dropna(inplace = True)


## Decision Tree
* forecasting **trip purpose**
* Variables:
    * Household Information : 
    * Household Member's Information :
    * Trip Information : 
* Apply F1-Score as the measurement of the performance

### Partitioning dataset into training set / test set

In [108]:
feature_names = [
    'Old', # Age >= 65
    'Trip_time', # Trip Time
    'HHrel_3', # HHrel == Children
    'Student', # Student
    'Job_Specialized', # Specialized Job
    'Job_Normal', # Retail, Service, Simple Labor, Management 
    'Job_Home', # Home
    'Workdays_Full', # Work More than 5 days 
    'Workhours_1', # Full time
    'Trip_d_hour_24', # Departure Time
    'Trip_a_hour_24', # Arrival Time
    'Trip_start', # First Trip of a day
    'Trip_final' # Last Trip of a day
]

In [109]:
X = full[feature_names]

In [110]:
y = full['Trip_Purpose']

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size = 0.3,
                                                   random_state = 19891124,
                                                   stratify = y)

### Constructing Tree

In [112]:
DT = tree.DecisionTreeClassifier(criterion = 'gini', max_depth = 7, random_state = 19891124)

### Learning

In [113]:
DT.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=7, random_state=19891124)

## Result

### Plotting Tree

In [114]:
dot_data = export_graphviz(
    DT, out_file = None, feature_names = feature_names,
    class_names = ['Work', 'Education', 'Business', 'Shopping/social/leisure', 'Back-home', 'Other'],
    filled = True, rounded = True,
    special_characters = True)

graph = graph_from_dot_data(dot_data)
graph.write_png('DT.png')

True

### Evaluation

In [115]:
y_pred = DT.predict(X_test)

In [116]:
y_pred

array(['Work', 'Work', 'Back-home', ..., 'Work', 'Work', 'Back-home'],
      dtype=object)

### Confusion Matrix & Measurements

In [117]:
# Confusion matrix & measurements
print('==================== Confusion Matrix ====================')
print(confusion_matrix(y_test, y_pred))
print('')
print('==================== Classification Report ====================')
print(classification_report(y_test, y_pred))

[[2747   15    0    1   19   10]
 [  16  177    0    0   13   19]
 [   3    0    1    1    1    1]
 [  28   25    0    6   18   31]
 [ 120   21    0    6   87   22]
 [   7   16    0    6    1 2672]]

                         precision    recall  f1-score   support

              Back-home       0.94      0.98      0.96      2792
               Business       0.70      0.79      0.74       225
              Education       1.00      0.14      0.25         7
                  Other       0.30      0.06      0.09       108
Shopping/social/leisure       0.63      0.34      0.44       256
                   Work       0.97      0.99      0.98      2702

               accuracy                           0.93      6090
              macro avg       0.76      0.55      0.58      6090
           weighted avg       0.92      0.93      0.92      6090

