In [1]:
##  Import the data and transform it into the train and test for future convenience

##  Import the required packages here.

import pandas as pd
import numpy as np
import missingno as msno
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
## Make the dataset for direction 2

# load the data
data = pd.read_csv('processed.csv')
data.head()

Unnamed: 0,age,IMD2019_decile,household_size,mor_travel,aft_travel,#car,unwill_walk,ethnicity_Other,ethnicity_Pakistani,ethnicity_White British,...,mode_now_Walk,mode_before_Bicycle,mode_before_Car/Van,mode_before_Other bus,mode_before_School Bus,mode_before_Taxi,mode_before_Train,mode_before_Walk,Y1,Y2
0,12,1,5.0,1.0,1.0,3.0,2.0,0,0,1,...,0,0,1,0,0,0,0,0,0.0,car-car
1,12,2,4.0,4.0,3.0,,,1,0,0,...,1,0,0,0,0,0,0,1,0.0,walk-walk
2,12,4,4.0,3.0,3.0,3.0,,0,0,1,...,1,0,0,0,0,0,0,1,0.0,walk-walk
3,12,1,3.0,2.0,3.0,3.0,,0,0,1,...,1,0,1,0,0,0,0,0,1.0,car-walk
4,12,8,4.0,5.0,4.0,3.0,,0,0,1,...,1,0,0,0,0,0,0,1,0.0,walk-walk


In [3]:
data.shape

(633, 28)

In [4]:
data['Y2'].value_counts()

car-car      255
walk-walk    188
walk-car      44
car-walk      37
bus-bus       32
car-bus       15
walk-bus       8
bus-car        3
bus-walk       1
Name: Y2, dtype: int64

In [5]:
# select the rows that we need
df=data[(data['mode_now_Car/Van']>0)|(data['mode_now_School Bus']>0)|data['mode_now_Walk']>0]
df=df[(df['mode_before_Car/Van']>0)|(df['mode_before_School Bus']>0)|(df['mode_before_Walk']>0)]

## select the columns by deleting high-missing-value rows and delete the all NA rows.
df = df.drop(['unwill_walk'],axis=1)
df = df.dropna(axis = 0, how = 'any')

## feature selection: save the basic contributes without co-linear variables.

df = df[['age', 'IMD2019_decile', 'household_size', 'mor_travel', 'aft_travel',
       '#car',  'ethnicity_Pakistani','ethnicity_White British', 'gender_Female','Y1','Y2']]
df['Y2'].value_counts()

car-car      210
walk-walk    163
car-walk      34
walk-car      34
bus-bus       26
car-bus       14
walk-bus       7
bus-car        2
bus-walk       1
Name: Y2, dtype: int64

In [6]:
## have to delete the bus-walk, bus-car, walk-bus
df=df[(df['Y2']!='bus-walk')]
df=df[(df['Y2']!='bus-car')]
df=df[(df['Y2']!='walk-bus')]
df['Y2'].value_counts()


car-car      210
walk-walk    163
car-walk      34
walk-car      34
bus-bus       26
car-bus       14
Name: Y2, dtype: int64

In [7]:
df.to_csv('data_for_direction2.csv')

In [8]:
X = df[['age', 'IMD2019_decile', 'household_size', 'mor_travel', 'aft_travel',
       '#car',  'ethnicity_Pakistani','ethnicity_White British', 'gender_Female']]
Y = df['Y2']
X_train, X_test, y_train, y_test = train_test_split(X,Y,stratify = Y)

In [9]:
## try svm
# SVM with tuned parameters: kernels, C, gamma, 

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

tuned_parameters = {'C': [0.1,0.01,0.001,0.0001],  
              'gamma': [0.1,0.01,0.001,0.0001], 
              'kernel': ['rbf','poly','sigmoid','linear']} 

election_model_svm = GridSearchCV(SVC(),tuned_parameters,cv=5)
ori_model_svm = election_model_svm.fit(X,Y)

In [10]:
pred = ori_model_svm.predict(X_test)
confusion_matrix(y_test, pred)

array([[ 0,  0,  6,  0,  0,  0],
       [ 0,  0,  3,  0,  0,  0],
       [ 0,  0, 49,  0,  0,  4],
       [ 0,  0,  1,  0,  0,  8],
       [ 0,  0,  8,  0,  0,  1],
       [ 0,  0,  4,  0,  0, 37]], dtype=int64)

In [11]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

     bus-bus       0.00      0.00      0.00         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.69      0.92      0.79        53
    car-walk       0.00      0.00      0.00         9
    walk-car       0.00      0.00      0.00         9
   walk-walk       0.74      0.90      0.81        41

    accuracy                           0.71       121
   macro avg       0.24      0.30      0.27       121
weighted avg       0.55      0.71      0.62       121



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV
keys = []
scores = []
models = {'K-近邻': KNeighborsClassifier(),
          '逻辑回归 l1': LogisticRegressionCV(cv=5, penalty='l1',solver='liblinear'),
          '逻辑回归 l2': LogisticRegressionCV(cv=5, penalty='l2'),
          }

for k,v in models.items():
    mod = v
    mod.fit(X_train, y_train)
    pred = mod.predict(X_test)
    print(str(k) + '建模效果：' + '\n')
    print(classification_report(y_test, pred))
    acc = accuracy_score(y_test, pred)
    print('分类正确率：'+ str(acc)) 
    print('\n' + '\n')
    keys.append(k)
    scores.append(acc)
    table = pd.DataFrame({'model':keys, 'accuracy score':scores})
    print(confusion_matrix(y_test, pred))

#table

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


K-近邻建模效果：

              precision    recall  f1-score   support

     bus-bus       0.50      0.33      0.40         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.72      0.87      0.79        53
    car-walk       0.17      0.11      0.13         9
    walk-car       0.00      0.00      0.00         9
   walk-walk       0.70      0.78      0.74        41

    accuracy                           0.67       121
   macro avg       0.35      0.35      0.34       121
weighted avg       0.59      0.67      0.62       121

分类正确率：0.6694214876033058



[[ 2  1  3  0  0  0]
 [ 0  0  3  0  0  0]
 [ 2  0 46  1  0  4]
 [ 0  0  0  1  0  8]
 [ 0  0  7  0  0  2]
 [ 0  0  5  4  0 32]]
逻辑回归 l1建模效果：

              precision    recall  f1-score   support

     bus-bus       0.07      0.33      0.12         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.74      0.74      0.74        53
    car-walk       0.00      0.00      0.00         9
    wa

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_

逻辑回归 l2建模效果：

              precision    recall  f1-score   support

     bus-bus       0.00      0.00      0.00         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.72      0.91      0.80        53
    car-walk       0.00      0.00      0.00         9
    walk-car       0.00      0.00      0.00         9
   walk-walk       0.72      0.95      0.82        41

    accuracy                           0.72       121
   macro avg       0.24      0.31      0.27       121
weighted avg       0.56      0.72      0.63       121

分类正确率：0.71900826446281



[[ 0  0  6  0  0  0]
 [ 0  0  2  0  0  1]
 [ 0  0 48  0  0  5]
 [ 0  0  1  0  0  8]
 [ 0  0  8  0  0  1]
 [ 0  0  2  0  0 39]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


keys = []
scores = []
models = {'决策树': DecisionTreeClassifier(),
          '高斯贝叶斯': GaussianNB(),
          '伯努利贝叶斯': BernoulliNB(),
          '多项式贝叶斯': MultinomialNB()}

for k,v in models.items():
    mod = v
    mod.fit(X_train, y_train)
    pred = mod.predict(X_test)
    print(str(k) + '建模效果：' + '\n')
    print(classification_report(y_test, pred))
    acc = accuracy_score(y_test, pred)
    print('分类正确率：'+ str(acc)) 
    print('\n' + '\n')
    keys.append(k)
    scores.append(acc)
    table = pd.DataFrame({'model':keys, 'accuracy score':scores})
    print(confusion_matrix(y_test, pred))


决策树建模效果：

              precision    recall  f1-score   support

     bus-bus       0.20      0.33      0.25         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.70      0.66      0.68        53
    car-walk       0.00      0.00      0.00         9
    walk-car       0.00      0.00      0.00         9
   walk-walk       0.67      0.68      0.67        41

    accuracy                           0.54       121
   macro avg       0.26      0.28      0.27       121
weighted avg       0.54      0.54      0.54       121

分类正确率：0.5371900826446281



[[ 2  0  4  0  0  0]
 [ 0  0  2  0  1  0]
 [ 5  3 35  1  3  6]
 [ 1  0  1  0  1  6]
 [ 2  0  5  0  0  2]
 [ 0  2  3  5  3 28]]
高斯贝叶斯建模效果：

              precision    recall  f1-score   support

     bus-bus       0.38      0.50      0.43         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.72      0.72      0.72        53
    car-walk       0.29      0.22      0.25         9
    walk-

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


from the above model results,  we can see, direction 2 has better model prediction results and can be learned better.

In [14]:
from sklearn.preprocessing import MinMaxScaler
# minmax: [0,1]
X_minmax_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X),columns=X.columns)
X_minmax_scaled

Unnamed: 0,age,IMD2019_decile,household_size,mor_travel,aft_travel,#car,ethnicity_Pakistani,ethnicity_White British,gender_Female
0,1.000000,0.000000,0.250000,0.00,0.00,0.50,0.0,1.0,1.0
1,1.000000,0.333333,0.166667,0.50,0.50,0.50,0.0,1.0,1.0
2,1.000000,0.000000,0.083333,0.25,0.50,0.50,0.0,1.0,1.0
3,1.000000,0.777778,0.166667,1.00,0.75,0.50,0.0,1.0,0.0
4,0.333333,0.777778,0.166667,0.00,0.25,0.50,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
476,0.666667,0.000000,0.083333,1.00,0.75,0.25,0.0,0.0,0.0
477,0.666667,0.333333,0.083333,0.50,0.50,0.25,0.0,0.0,1.0
478,0.666667,0.777778,0.166667,1.00,1.00,0.25,0.0,1.0,0.0
479,0.000000,0.777778,0.166667,1.00,1.00,0.25,0.0,1.0,0.0


In [15]:
## z-score normalization : most of the data will fall in [-3.3]
from sklearn.preprocessing import StandardScaler
X_std_scaled = pd.DataFrame(StandardScaler().fit_transform(X),columns=X.columns)
X_std_scaled.head()

Unnamed: 0,age,IMD2019_decile,household_size,mor_travel,aft_travel,#car,ethnicity_Pakistani,ethnicity_White British,gender_Female
0,1.367124,-0.974967,0.32241,-1.084443,-1.195527,0.499211,-0.689536,0.989658,1.027402
1,1.367124,0.280053,-0.351847,0.031315,-0.017149,0.499211,-0.689536,0.989658,1.027402
2,1.367124,-0.974967,-1.026104,-0.526564,-0.017149,0.499211,-0.689536,0.989658,1.027402
3,1.367124,1.953413,-0.351847,1.147074,0.57204,0.499211,-0.689536,0.989658,-0.973329
4,-0.429561,1.953413,-0.351847,-1.084443,-0.606338,0.499211,-0.689536,0.989658,-0.973329


In [16]:
X_minmax_train, X_minmax_test, y_minmax_train, y_minmax_test = train_test_split(X_minmax_scaled,Y,stratify = Y)
X_zscore_train, X_zscore_test, y_zscore_train, y_zscore_test = train_test_split(X_std_scaled,Y,stratify = Y)

In [17]:
# SVM with tuned parameters: kernels, C, gamma, 

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

tuned_parameters = {'C': [0.1,0.01,0.001,0.0001],  
              'gamma': [0.1,0.01,0.001,0.0001], 
              'kernel': ['rbf','poly','sigmoid','linear']} 

election_model_svm = GridSearchCV(SVC(),tuned_parameters,cv=5)
minmax_model_svm = election_model_svm.fit(X_minmax_scaled,Y)
zscore_model_svm = election_model_svm.fit(X_std_scaled,Y)
ori_model_svm = election_model_svm.fit(X,Y)

In [18]:
minmax_predictions = minmax_model_svm.predict(X_minmax_test) 
zscore_pred = zscore_model_svm.predict(X_zscore_test)
ori_pred = ori_model_svm.predict(X_test)
print(classification_report(y_minmax_test, minmax_predictions))
print(classification_report(y_zscore_test, zscore_pred))
print(classification_report(y_test, ori_pred))

              precision    recall  f1-score   support

     bus-bus       0.00      0.00      0.00         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.44      1.00      0.61        53
    car-walk       0.00      0.00      0.00         9
    walk-car       0.00      0.00      0.00         9
   walk-walk       0.00      0.00      0.00        41

    accuracy                           0.44       121
   macro avg       0.07      0.17      0.10       121
weighted avg       0.19      0.44      0.27       121

              precision    recall  f1-score   support

     bus-bus       0.00      0.00      0.00         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.44      1.00      0.61        53
    car-walk       0.00      0.00      0.00         9
    walk-car       0.00      0.00      0.00         9
   walk-walk       0.00      0.00      0.00        41

    accuracy                           0.44       121
   macro avg       0.07

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
ori_pred

array(['car-car', 'car-car', 'walk-walk', 'car-car', 'car-car', 'car-car',
       'car-car', 'walk-walk', 'walk-walk', 'car-car', 'walk-walk',
       'walk-walk', 'car-car', 'walk-walk', 'car-car', 'car-car',
       'car-car', 'car-car', 'car-car', 'car-car', 'car-car', 'walk-walk',
       'walk-walk', 'car-car', 'walk-walk', 'car-car', 'car-car',
       'walk-walk', 'car-car', 'walk-walk', 'car-car', 'car-car',
       'walk-walk', 'walk-walk', 'car-car', 'walk-walk', 'car-car',
       'walk-walk', 'walk-walk', 'car-car', 'car-car', 'car-car',
       'car-car', 'walk-walk', 'car-car', 'car-car', 'walk-walk',
       'walk-walk', 'walk-walk', 'car-car', 'car-car', 'car-car',
       'car-car', 'car-car', 'walk-walk', 'walk-walk', 'walk-walk',
       'walk-walk', 'car-car', 'walk-walk', 'walk-walk', 'walk-walk',
       'car-car', 'car-car', 'walk-walk', 'walk-walk', 'walk-walk',
       'car-car', 'car-car', 'walk-walk', 'car-car', 'car-car', 'car-car',
       'car-car', 'car-car', 'car-car

In [19]:
from sklearn.linear_model import LogisticRegressionCV
clf_l1 = LogisticRegressionCV(cv=5, penalty='l1',solver='liblinear').fit(X, Y)
clf_l2 = LogisticRegressionCV(cv=5, penalty='l2').fit(X, Y)
clf_l1_std = LogisticRegressionCV(cv=5, penalty='l1',solver='liblinear').fit(X_std_scaled,Y)
clf_l2_std = LogisticRegressionCV(cv=5, penalty='l2').fit(X_std_scaled,Y)
clf_l1_mm = LogisticRegressionCV(cv=5, penalty='l1',solver='liblinear').fit(X_minmax_scaled,Y)
clf_l2_mm = LogisticRegressionCV(cv=5, penalty='l2').fit(X_minmax_scaled,Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
for i in [clf_l1,clf_l2]:
    print('model is', i)
    pred = i.predict(X_test)
    print(classification_report(y_test,pred))
    print(confusion_matrix(y_test, pred))
    

for i in [clf_l1_std,clf_l2_std]:
    print('model is', i)
    pred = i.predict(X_zscore_test)
    print(classification_report(y_zscore_test,pred))
    print(confusion_matrix(y_zscore_test, pred))

for i in [clf_l1_mm,clf_l2_mm]:
    print('model is', i)
    pred = i.predict(X_minmax_test)
    print(classification_report(y_minmax_test,pred))
    print(confusion_matrix(y_minmax_test, pred))

model is LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear')
              precision    recall  f1-score   support

     bus-bus       0.05      0.17      0.07         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.74      0.74      0.74        53
    car-walk       0.00      0.00      0.00         9
    walk-car       0.00      0.00      0.00         9
   walk-walk       0.74      0.83      0.78        41

    accuracy                           0.61       121
   macro avg       0.25      0.29      0.26       121
weighted avg       0.58      0.61      0.59       121

[[ 1  0  5  0  0  0]
 [ 2  0  1  0  0  0]
 [11  0 39  0  0  3]
 [ 1  0  0  0  0  8]
 [ 2  0  6  0  0  1]
 [ 5  0  2  0  0 34]]
model is LogisticRegressionCV(cv=5)
              precision    recall  f1-score   support

     bus-bus       0.00      0.00      0.00         6
     car-bus       0.00      0.00      0.00         3
     car-car       0.71      0.91      0.79        53
    car

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
A

NameError: name 'A' is not defined