# Transportation Classification Model

In [None]:
# Import the Required Libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Library to suppress warnings or deprecation notes

import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image
from sklearn import tree

## Getting the Data Set

In [None]:
# Read the CSV data file into a data frame

df = pd.read_csv('../input/transportation/transportation - Sheet1.csv')

In [None]:
# Inspecting the data frame

df.head(10)

In [None]:
# Using the shape function to understand the shape of the data
df.shape

In [None]:
# Which data types do we have?

df.info()

In [None]:
# Summarize the data

df.describe(include = 'all').T

In [None]:
# Check for any missing values

df.isnull().sum()

## Observations
* The sample data consists of ten samples with four features and a dependent variable - transport mode
* Gender is binary
* Car ownership is numeric
* Travel cost is categorical
* Income level is categorical
* Transport Mode is categorical

## Preparing the Data for the Statistical Model

In [None]:
# Turn every string-typed ('object') column into a pandas categorical column
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    df[name] = df[name].astype('category')
df.head(10)

In [None]:
# Check the data types again

df.info()

In [None]:
# Describe the label -> integer encodings we want to apply, column by column.
# Each column lists its labels in order; a label's code is its position in the list.

_ordered_levels = {
    "Gender": ['Male', 'Female'],
    "TravelCost": ['cheap', 'standard', 'expensive'],
    "IncomeLevel": ['low', 'medium', 'high'],
    "TransportMode": ['bus', 'train', 'car'],
}

replace_struct = {
    column: {label: code for code, label in enumerate(levels)}
    for column, levels in _ordered_levels.items()
}

In [None]:
# Perform the data transformation
#
# The mapped columns are categorical, and DataFrame.replace on categorical
# columns is deprecated in recent pandas (the warning is hidden by the global
# warnings filter). Renaming the categories is the supported way to recode
# them and gives the same result: a categorical column whose categories are
# the integer codes. Labels missing from a mapping are passed through
# unchanged, so re-running this cell is a no-op.

df = df.assign(**{
    column: df[column].astype('category').cat.rename_categories(mapping)
    for column, mapping in replace_struct.items()
})

In [None]:
# Inspect the resulting data frame

df.head(10)

In [None]:
df.info()

In [None]:
# Separate the feature matrix X from the target vector y (the TransportMode column)

target_column = 'TransportMode'
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# Split the data into training data and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [None]:
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in test data =", X_test.shape[0])

In [None]:
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))

In [None]:
# Build Decision Model

dTree = DecisionTreeClassifier(criterion = "gini", random_state = 1)
dTree.fit(X_train, y_train)

In [None]:
# Scoring the Decision Model
print(dTree.score(X_train, y_train))
print(dTree.score(X_test, y_test))

In [None]:
## Function to create confusion matrix
def make_confusion_matrix(model, y_actual, labels=[2, 1, 0], X=None):
    '''
    Draw a 3x3 confusion-matrix heatmap annotated with counts and percentages.

    model : classifier whose ``predict`` method is called on the features
    y_actual : ground truth for the rows being predicted
    labels : unused; kept only so existing calls that pass it keep working.
             The matrix is always built for class codes 0, 1, 2
             (shown as Bus, Train, Car).
    X : features to predict on; must be the rows that ``y_actual`` belongs to.
        Defaults to the notebook-level ``X_test`` when omitted.
    '''
    features = X_test if X is None else X
    y_predict = model.predict(features)

    # Fixed class order so the rows/columns always line up with the names below
    class_codes = [0, 1, 2]
    class_names = ["Bus", "Train", "Car"]

    cm = metrics.confusion_matrix(y_actual, y_predict, labels=class_codes)
    df_cm = pd.DataFrame(
        cm,
        index=[f"Actual - {name}" for name in class_names],
        columns=[f"Predicted - {name}" for name in class_names],
    )

    # Avoid a divide-by-zero (NaN annotations) when there is nothing to count
    total = cm.sum()
    shares = cm / total if total else np.zeros(cm.shape)

    group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in shares.flatten()]
    annotations = np.asarray(
        [f"{count}\n{share}" for count, share in zip(group_counts, group_percentages)]
    ).reshape(cm.shape)

    plt.figure(figsize = (10,7))
    sns.heatmap(df_cm, annot=annotations, fmt='')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Function to calculate the Recall score
def get_recall_score(model):
    '''
    Print the per-class recall of ``model`` on the training and the test split.

    model : classifier whose ``predict`` method is applied to the
            notebook-level X_train and X_test
    '''
    # Predict on both splits first, then report them in train -> test order
    reports = [
        ("Recall on training set : ", model.predict(X_train), y_train),
        ("Recall on test set : ", model.predict(X_test), y_test),
    ]
    for caption, predicted, truth in reports:
        # average=None yields one recall value per class
        print(caption, metrics.recall_score(truth, predicted, average = None))

In [None]:
make_confusion_matrix(dTree, y_test)

In [None]:
# Recall on train and test
get_recall_score(dTree)

In [None]:
feature_names = list(X.columns)
print(feature_names)

In [None]:
plt.figure(figsize=(10,10))
tree.plot_tree(dTree,feature_names=feature_names,filled=True,fontsize=9,node_ids=True,class_names=True)
plt.show()

In [None]:
# Feature importances of the fitted tree. The importance of a feature is the
# (normalized) total reduction of the split criterion brought by that feature,
# also known as the Gini importance. Listed from most to least important.

importances = pd.DataFrame(
    {"Imp": dTree.feature_importances_},
    index = X_train.columns,
)
print(importances.sort_values(by = 'Imp', ascending = False))

In [None]:
# We have some new sample data - we want to predict for each sample the mode of transportation

firstNames = ['Scott', 'Dan', 'Jason', 'SatorBlade']
gender = [0, 0, 0, 1]
carOwnership = [3, 1, 1, 0]
incomeLevel = [2, 0, 2, 0]
travelCost = [1, 0, 2, 0]

# Build the frame in one step, one row per person
test_data = pd.DataFrame(
    {
        'Gender': gender,
        'TravelCost': travelCost,
        'CarOwnership': carOwnership,
        'IncomeLevel': incomeLevel,
    },
    index = firstNames,
)

# The tree must see the columns in the same order it was trained on; otherwise
# values are fed to the wrong feature (or scikit-learn rejects the frame).
# A KeyError here means a training feature is missing from the sample data.
test_data = test_data[list(X_train.columns)]
display(test_data)

# Predicted class codes, labelled by person (0 = bus, 1 = train, 2 = car).
# NOTE: after re-running, confirm any written summary of these predictions still matches.
pd.Series(dTree.predict(test_data), index = test_data.index, name = 'TransportMode')


The model predicts that Scott will travel by car, Dan will travel by train, Jason will travel by train, and SatorBlade will travel by bus.