# Preparation

### Data for exercise

- "Pdata.csv" (Used in "Fundamental of data analysis")

### Data import & set-up

In [None]:
import pandas as pd
# Read "Pdata.csv" (path is relative to the current working directory) into a DataFrame and display it
Pdata = pd.read_csv("Pdata.csv")
Pdata

In [None]:
# Create "Age" as 2016 minus the birth-year column "Byear"
# NOTE(review): 2016 is hard-coded — presumably the survey year; confirm
Pdata["Age"] = 2016-Pdata["Byear"]
Pdata

In [None]:
# Categorize job: codes 7 and 8 => 'Unemployed' = 0, any other value => 'Employed' = 1
def Job_category(row):
    """Return 0 if the row's 'Job' code is 7 or 8 (unemployed), otherwise 1 (employed)."""
    unemployed_codes = (7, 8)
    return 0 if row['Job'] in unemployed_codes else 1

# Add the binary 'Employed' column by evaluating Job_category on every row
Pdata['Employed'] = Pdata.apply(Job_category, axis=1)

Pdata

In [None]:
# Transform nominal variable to binary variable: Ncars => Own=1, Non-own=0
def Ncars_modi(row):
    """Return 1 if the row's 'Ncars' value equals 1 (own), otherwise 0 (non-own).

    NOTE(review): every value other than 1 maps to 0 — presumably 'Ncars' is a
    coded variable rather than a count; confirm against the codebook.
    """
    owns_car = row['Ncars'] == 1
    return 1 if owns_car else 0

# Add the binary 'Car_own' column by evaluating Ncars_modi on every row
Pdata['Car_own'] = Pdata.apply(Ncars_modi, axis=1)

Pdata

In [None]:
# Transform nominal variable to binary variable: Gender => Male=1, Female=0
def Gender_modi(row):
    """Return 1 if the row's 'Gender' code equals 1 (male), otherwise 0 (female)."""
    return int(row['Gender'] == 1)

# Add the binary 'Male' column by evaluating Gender_modi on every row
Pdata['Male'] = Pdata.apply(Gender_modi, axis=1)

Pdata

In [None]:
# Transform nominal variable to binary variable: Licence => Own=1, Non-own=0
def Licence_modi(row):
    """Return 1 if the row's 'Licence' code equals 1 (own), otherwise 0 (non-own)."""
    if row['Licence'] != 1:
        return 0  # Non-own
    return 1  # Own

# Add the binary 'Licence_own' column by evaluating Licence_modi on every row
Pdata['Licence_own'] = Pdata.apply(Licence_modi, axis=1)

Pdata

In [None]:
# Select variables: create a dataframe "Pdata_sel" holding only the columns used in the analyses below
Pdata_sel = Pdata[[
    'HHinc', 'NofTrips', 'TripTime',
    'Work_TT', 'Work_AD', 'Shop_TT', 'Shop_AD',
    'Age', 'Employed', 'Car_own', 'Male', 'Licence_own',
]]
Pdata_sel

In [None]:
# Create a pair plot (pairwise scatter plots) of the non-binary variables using Seaborn
import seaborn as sns
sns.pairplot(
    data=Pdata_sel[[
        'HHinc', 'NofTrips', 'TripTime',
        'Work_TT', 'Work_AD', 'Shop_TT', 'Shop_AD',
        'Age',
    ]],
)

# About "Scikit-learn" Package

- https://scikit-learn.org/stable/

# Linear Regression

### Problem definition

- What type of person spends a lot of time on trips?
- Which socio-demographic characteristics affect travel time?
- Can we predict an individual's travel time for a day if we know his/her socio-demographic characteristics?

- Dependent variable = 'TripTime'
- Independent variables = 'HHinc', 'Age', 'Employed', 'Car_own', 'Male', 'Licence_own'

### Data cleaning

In [None]:
# Preview the first 50 rows before cleaning
Pdata_sel.head(n=50)

In [None]:
# Drop every row that has a NaN in any of the selected columns.
# .copy() makes Pdata_sel an independent DataFrame, so the new columns assigned
# to it later in the notebook are not written to a frame derived from a
# selection of Pdata (which can raise pandas' SettingWithCopyWarning).
Pdata_sel = Pdata_sel.dropna().copy()

In [None]:
# Check the result: preview the first 50 rows remaining after dropping NaN
Pdata_sel.head(n=50)

### Parameter estimation (Learning)

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [None]:
# Using scikit-learn
from sklearn.linear_model import LinearRegression

# Explanatory variables (X) and the dependent variable (y = daily trip time)
X = Pdata_sel[['HHinc', 'Age', 'Employed', 'Car_own', 'Male', 'Licence_own']]
y = Pdata_sel['TripTime']

# Ordinary least-squares fit with default settings
LR = LinearRegression()
LR.fit(X, y)

In [None]:
# Reporting results: Estimates (one coefficient per column of X, in X's column order)
LR.coef_

In [None]:
# Reporting results: Intercept (constant term of the fitted model)
LR.intercept_

In [None]:
# Reporting results: Score (R^2, evaluated on the same X and y that were used for fitting)
LR.score(X,y)

### Prediction (Application)

Predict the travel time for a day of the following 5 persons using the estimated model

In [None]:
# Data of 5 persons
import numpy as np
# Build the inputs as a DataFrame whose columns match, in name and order, the
# features LR was fitted on. Passing a bare NumPy array to a model fitted on a
# DataFrame makes scikit-learn warn that the input has no feature names, and
# leaves the column order unchecked.
Data = pd.DataFrame(
    [
        [2, 35, 1, 1, 1, 1],
        [5, 48, 0, 0, 0, 1],
        [3, 55, 1, 0, 1, 1],
        [2, 35, 1, 1, 1, 0],
        [1, 21, 0, 1, 0, 0],
    ],
    columns=['HHinc', 'Age', 'Employed', 'Car_own', 'Male', 'Licence_own'],
)
Data

In [None]:
# Prediction: predicted TripTime for each of the 5 rows in Data
LR.predict(Data)

### Another method for estimating parameters and reporting the results

In [None]:
import statsmodels.api as sm

In [None]:
# Add constant: statsmodels does not add an intercept automatically, so add a constant column to X
X2=sm.add_constant(X)
X2

In [None]:
# Estimation
# Note: this only constructs the OLS model (y = dependent variable, X2 = regressors incl. constant);
# the actual fit is performed by Result.fit() in the reporting step.
Result = sm.OLS(y,X2)

In [None]:
# Reporting: fit the OLS model and print its regression summary table
print(Result.fit().summary())

# Logistic Regression

### Problem definition 1

- Which type of person tends to go shopping?
- What are the factors affecting shopping trip generation of each individual?

- Dependent variable = Shop
- Independent variables = 'Age', 'Employed', 'Car_own', 'Male', 'HHinc_1','HHinc_2','HHinc_3','HHinc_4','HHinc_5', 'Work_TT', 'Work_AD'

### Create "Shop" variable = 1 if shopping trip exists, otherwise 0

In [None]:
def Shop_create(row):
    """Return 1 if the row's 'Shop_TT' value is greater than 0, otherwise 0."""
    has_shopping_trip = row['Shop_TT'] > 0
    return 1 if has_shopping_trip else 0

# Add the binary 'Shop' column by evaluating Shop_create on every row
Pdata_sel['Shop'] = Pdata_sel.apply(Shop_create, axis=1)

Pdata_sel

### Dummy coding for household income

In [None]:
# One indicator function per household-income category (1 to 5):
# each returns 1 when the row's 'HHinc' equals that category, otherwise 0.
def HHinc_1(row):
    """Return 1 if the row's 'HHinc' equals 1, otherwise 0."""
    return 1 if row['HHinc'] == 1 else 0


def HHinc_2(row):
    """Return 1 if the row's 'HHinc' equals 2, otherwise 0."""
    return 1 if row['HHinc'] == 2 else 0


def HHinc_3(row):
    """Return 1 if the row's 'HHinc' equals 3, otherwise 0."""
    return 1 if row['HHinc'] == 3 else 0


def HHinc_4(row):
    """Return 1 if the row's 'HHinc' equals 4, otherwise 0."""
    return 1 if row['HHinc'] == 4 else 0


def HHinc_5(row):
    """Return 1 if the row's 'HHinc' equals 5, otherwise 0."""
    return 1 if row['HHinc'] == 5 else 0

# Add one 0/1 dummy column per household-income category by evaluating
# the matching indicator function on every row
Pdata_sel['HHinc_1'] = Pdata_sel.apply(HHinc_1, axis=1)
Pdata_sel['HHinc_2'] = Pdata_sel.apply(HHinc_2, axis=1)
Pdata_sel['HHinc_3'] = Pdata_sel.apply(HHinc_3, axis=1)
Pdata_sel['HHinc_4'] = Pdata_sel.apply(HHinc_4, axis=1)
Pdata_sel['HHinc_5'] = Pdata_sel.apply(HHinc_5, axis=1)

Pdata_sel

### Parameter estimation (Learning)

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
# Using scikit-learn
from sklearn.linear_model import LogisticRegression

# Explanatory variables (X) and the binary dependent variable (y = 'Shop')
# NOTE(review): the problem definition also lists 'Employed' and
# 'HHinc_1'..'HHinc_5' as independent variables, but only these five columns
# are used here — confirm which set is intended.
X = Pdata_sel[['Age', 'Car_own', 'Male', 'Work_TT', 'Work_AD']]
y = Pdata_sel['Shop']

# Logistic regression fit with scikit-learn's default settings
LGR = LogisticRegression()
LGR.fit(X, y)

In [None]:
# Reporting: coefficients, intercept, and mean accuracy on the same X, y used for fitting
print(LGR.coef_)
print(LGR.intercept_)
print(LGR.score(X,y))

### Another method for estimating parameters and reporting the results

In [None]:
# Using statsmodels
# add_constant adds the intercept column; Logit(...).fit() estimates the model,
# and summary() gives the full estimation table.
X2=sm.add_constant(X)
Result=sm.Logit(y,X2).fit()
print(Result.summary())

### Problem definition 2

- Can we determine whether an individual owns a private car based on his/her mobility pattern?
- Develop a machine learning model for car-ownership judgement using the mobility data of individuals

- Dependent variable = Car_own
- Independent variables = 'Work_TT', 'Work_AD', 'Shop_TT', 'Shop_AD'

### Partitioning dataset into separate training and test sets

In [None]:
# Using scikit-learn
from sklearn.model_selection import train_test_split

# Mobility features (X) and the car-ownership label (y)
X = Pdata_sel[['Work_TT', 'Work_AD', 'Shop_TT', 'Shop_AD']]
y = Pdata_sel['Car_own']

# Hold out 30% of the rows as the test set; random_state fixes the shuffle so
# the split is reproducible, and stratify=y keeps the Car_own class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [None]:
# Summary statistics of the training features
X_train.describe()

In [None]:
# Summary statistics of the test features
X_test.describe()

In [None]:
# Summary statistics of the training labels (the mean is the share of Car_own = 1)
y_train.describe()

In [None]:
# Summary statistics of the test labels (the mean is the share of Car_own = 1)
y_test.describe()

### Parameter estimation (Learning)

In [None]:
# Fit a fresh logistic regression (replacing the earlier LGR) on the training split only
LGR=LogisticRegression()
LGR.fit(X_train,y_train)
print(LGR.coef_)
print(LGR.intercept_)
# Mean accuracy on the training split (not on the held-out test split)
print(LGR.score(X_train,y_train))

### Evaluation

In [None]:
# Prediction of class probabilities using the test data set (one row per test sample, one column per class)
LGR.predict_proba(X_test)

In [None]:
# Prediction of class label using the test data set
y_pred = LGR.predict(X_test)
y_pred

In [None]:
# Confusion matrix (rows: actual class, columns: predicted class)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Visualization of confusion matrix (rows: actual class, columns: predicted class)
# annot=True writes the count inside each cell and fmt='d' formats it as an
# integer; without them the plot shows colours only and the counts cannot be read.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', linewidths=0.5)

### Application

Who owns a private car, and how likely is it?
- A: Work_TT=60, Work_AD=800, Shop_TT=10, Shop_AD=15
- B: Work_TT=30, Work_AD=500, Shop_TT=50, Shop_AD=100

In [None]:
# Individuals A and B as a DataFrame whose columns match, in name and order,
# the features LGR was trained on. Passing a bare NumPy array to a model fitted
# on a DataFrame makes scikit-learn warn that the input has no feature names.
Individual = pd.DataFrame(
    [
        [60, 800, 10, 15],   # A
        [30, 500, 50, 100],  # B
    ],
    columns=['Work_TT', 'Work_AD', 'Shop_TT', 'Shop_AD'],
    index=['A', 'B'],
)
Individual

In [None]:
# Predicted class label (Car_own: 1 = own, 0 = non-own) for each individual
LGR.predict(Individual)

In [None]:
# Predicted class probabilities for each individual (one column per class)
LGR.predict_proba(Individual)