# TI data travel exercise ML and DL

In [114]:
#Data comes from survey of UK residents at airport. 
#Our mission is to be able to predict the destination ("country" variable) 

In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dfI = pd.read_csv(r'D:\Downloads\tidatascienceexercise\input_data.csv', usecols = [0,1,3,4,5,6,7,8,9,10])
dfI.head(2)

Unnamed: 0,Year,quarter,mode,purpose,package,Age,Sex,duration,Expected spend,country
0,2018,Jan-Mar,Air,Holiday,Independent,0-15,Male,4-13 nights,818.76,Austria
1,2018,Jan-Mar,Air,Holiday,Independent,16-24,Male,4-13 nights,703.0,Austria


In [116]:

#We find out the row index of the NaN rows
dfI[dfI.isin([np.nan, np.inf, -np.inf]).any(1)].index
#Channel islands have the NaN values in spend col!!


Int64Index([], dtype='int64')

In [117]:

#We drop all rows with NaN in Expected spend (happen to be all the Channel Islands ones)
dfI.drop(dfI.index[[1790, 3318, 5522, 7030, 9215, 10931, 12738, 13977, 15936, 17250]], inplace = True)


In [118]:

#replace all null vals in spend col
dfI['Expected spend'].astype(str).replace([np.inf, -np.inf, 'NaN'], np.nan, inplace=True)


In [119]:
dfI.head(2)

Unnamed: 0,Year,quarter,mode,purpose,package,Age,Sex,duration,Expected spend,country
0,2018,Jan-Mar,Air,Holiday,Independent,0-15,Male,4-13 nights,818.76,Austria
1,2018,Jan-Mar,Air,Holiday,Independent,16-24,Male,4-13 nights,703.0,Austria


In [120]:

#DROPPING MOST COLS BEFORE ENCODING WITH DUMMIES
#Dropping cols to see if this improves accuracy
dfI.drop(columns = ['Year', 'quarter', 'purpose', 'package', 'Age', 'Sex'], inplace = True)

#Encoding categorical variables
dfI = pd.get_dummies(dfI, columns = ['duration', 'mode'], drop_first = True)

dfI.drop('mode_Tunnel', axis = 1, inplace = True)

dfI.head()


Unnamed: 0,Expected spend,country,duration_14-27 nights,duration_28-90 nights,duration_3-6 months,duration_4-13 nights,duration_6 months-year,duration_Nil Stay,mode_Sea
0,818.76,Austria,0,0,0,1,0,0,0
1,703.0,Austria,0,0,0,1,0,0,0
2,106.5,Austria,0,0,0,0,0,0,0
3,741.6,Austria,0,0,0,1,0,0,0
4,269.07,Austria,0,0,0,0,0,0,0


In [121]:

# #KEEPING ALL COLS BEFORE ENCODING CAT VARS WITH DUMMIES
# #Encoding categorical variables
# dfI = pd.get_dummies(dfI, columns = ['quarter', 'mode', 'purpose', 'package', 'Age', 'Sex',
#        'duration'], drop_first = True)
# dfI.head()


In [122]:
dfI.columns

Index(['Expected spend', 'country', 'duration_14-27 nights',
       'duration_28-90 nights', 'duration_3-6 months', 'duration_4-13 nights',
       'duration_6 months-year', 'duration_Nil Stay', 'mode_Sea'],
      dtype='object')

In [123]:
#NOTES AND THINGS TO TRY LATER
#https://www.youtube.com/watch?v=75OJvlhFUMY&list=PLZoTAELRMXVOnN_g96ayzXX5i7RRO0QhL&index=54

#Turn spend col to numeric
dfI['Expected spend'] = pd.to_numeric(dfI['Expected spend'], errors = 'coerce')

# #Simple imputer for spend column
# from sklearn.impute import SimpleImputer
# imp_mean = SimpleImputer(missing_values = '#Null!', fill_value = 0, strategy = 'mean')
# imp_mean.fit([dfI['Expected spend']])

# imp_other = SimpleImputer(missing_values = NaN, fill_value = 0, strategy = 'mean')
# imp_other.fit([dfI['Expected spend']])


#Scaling expenditure column
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
#Scale spend column
cols_to_scale = ['Expected spend']
dfI[cols_to_scale] = sc.fit_transform(dfI[cols_to_scale])



In [124]:
######################################################################################
#Before we can continue we must label encode the y variable

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(dfI['country'])
y = le.transform(dfI['country'])
y

#display(list(le.classes_))

#Testing random labels to use them later for identification of predicted classes
#We make sure we can identify countries back from the numbers that the model will output

#list(le.inverse_transform([1, 5, 21, 34]))

######################################################################################

array([ 1,  1,  1, ..., 55, 42, 27])

In [125]:
#Defining X and y
X = dfI.drop(['country'], 1)
y = y


In [126]:
#Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25)


### Getting accuracy score

In [127]:
dfI.head(2)

Unnamed: 0,Expected spend,country,duration_14-27 nights,duration_28-90 nights,duration_3-6 months,duration_4-13 nights,duration_6 months-year,duration_Nil Stay,mode_Sea
0,0.092719,Austria,0,0,0,1,0,0,0
1,-0.001547,Austria,0,0,0,1,0,0,0


In [128]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

KNN = KNeighborsClassifier(n_neighbors = 500)
KNN.fit(X_train, y_train)
score = cross_val_score(KNN, X, y, cv = 5, scoring = 'accuracy').mean()
score

0.12316122384647979

In [145]:
#Randomized Search CV

params = {
    'n_neighbors' : [750, 1000, 1250],  
    'algorithm' : ['auto', 'ball_tree']
}

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
random_search = RandomizedSearchCV(KNN, param_distributions = params, scoring = 'accuracy', cv = 5, verbose = 1)

random_search.fit(X, y)

display(random_search.best_estimator_, random_search.best_params_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.5min finished


KNeighborsClassifier(algorithm='auto', leaf_size=90, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=750, p=2,
                     weights='uniform')

{'n_neighbors': 750, 'algorithm': 'auto'}

In [144]:
KNN = KNeighborsClassifier(algorithm='auto', leaf_size=90, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=750, p=2,
                     weights='uniform')
KNN.fit(X_train, y_train)
score = cross_val_score(KNN, X, y, cv = 5, scoring = 'accuracy').mean()
score

0.12408117153079438

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score

# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .25)

# from sklearn.neighbors import KNeighborsClassifier
# KNN = KNeighborsClassifier()
# KNN.fit(X_train, y_train)
# score = cross_val_score(KNN, X_test, y_test, cv = 5, scoring = 'accuracy').mean()
# score

## Testing feature selection (did not improve acc much)

In [None]:
#FEATURE SELECTION USING EXTRA TREES CLASSIFIER

# from sklearn.ensemble import ExtraTreesClassifier
# model = ExtraTreesClassifier()
# model.fit(X,y)
# print(model.feature_importances_)
# feat_importances = pd.Series(model.feature_importances_, index = dfI.drop(['country'], 1).columns)
# feat_importances.nlargest(10).plot(kind = 'barh')
# plt.show()


In [None]:
# import seaborn as sns
# sns.set()
# #CORRELATION HEATMAP
# plt.figure(figsize = (25,15))
# sns.heatmap(dfI.corr(), annot = True); plt.show()


In [None]:
# #FEATURE SELECTION USING LASSOCV
# from sklearn.linear_model import LassoCV
# modellasso = LassoCV(cv = 5).fit(X, np.ravel(y))
# pd.Series(modellasso.coef_, index = dfI.drop(['country'], 1).columns).sort_values().plot(kind = "barh"); plt.show()


## Testing different ML models

In [None]:
lksdjlf
#MULTI-MODEL TEST

from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor

from sklearn import model_selection

# Prepare an array with all the algorithms
models = []
models.append(('CART', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
#models.append(('LSVC', LinearSVC()))
models.append(('NB', GaussianNB()))
models.append(('RFC', RandomForestClassifier()))
models.append(('SVM', SVC(gamma = 'auto')))

#REGRESSION ALGORITHMS
#models.append(('LR', LogisticRegression(multi_class = 'auto', solver = 'lbfgs')))
# models.append(('DTR', DecisionTreeRegressor()))
# models.append(('SGDRegressor', linear_model.SGDRegressor())) 
# models.append(('BayesianRidge', linear_model.BayesianRidge()))
# models.append(('LassoLars', linear_model.LassoLars())) 
# models.append(('ARDRegression', linear_model.ARDRegression())) 
# models.append(('PassiveAggressiveRegressor', linear_model.PassiveAggressiveRegressor())) 
# models.append(('TheilSenRegressor', linear_model.TheilSenRegressor()))
# models.append(('LinearRegression', linear_model.LinearRegression())) 


from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
num_folds = 3
scoring = 'accuracy'

# Prepare the configuration to run the test
seed = 7
results = []
names = []
# X = train_set_scaled
# y = train_set_labels

# Every algorithm is tested and results are
# collected and printed
for name, model in models:
    kfold = KFold(
        n_splits=5, random_state=seed)
    cv_results = cross_val_score(
        model, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (
        name, cv_results.mean(), cv_results.std())
    print(msg)
    
    
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# #ONCE CHOSEN THE ALGORITHM, WE DO SCALING, PARAM GRID K FOLD AND GRID SEARCH #################
# # Build a scaler
# scaler = StandardScaler().fit(X_train)
# rescaledX = scaler.transform(X_train)

# # Build parameter grid
# c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
# kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
# param_grid = dict(C=c_values, kernel=kernel_values)

# # Build the model
# model = SVC()
# kfold = KFold(n_splits = num_folds, random_state = seed)
# grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)
# grid_result = grid.fit(rescaledX, y_train)

# # Show the results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']

# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))


## Testing a Keras NN on the same data

In [None]:
### KERAS NEURAL NETWORK TEST
import tensorflow as tf
import matplotlib.pyplot as plt

X_train = tf.keras.utils.normalize(X_train.values, axis = 1)
X_test = tf.keras.utils.normalize(X_test.values, axis = 1)

model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation = tf.nn.relu))
model.add(tf.keras.layers.Dropout(.25))
model.add(tf.keras.layers.Dense(128, activation = tf.nn.relu))
model.add(tf.keras.layers.Dropout(.25))
model.add(tf.keras.layers.Dense(61, activation = tf.nn.softmax))

model.compile(optimizer = 'adam',
             loss = 'sparse_categorical_crossentropy', 
             metrics = ['accuracy']) 

model.fit(X_train, y_train, epochs = 5)

val_loss, val_acc = model.evaluate(X_test, y_test)

print('..............\n')
print('val_loss (loss) is:')
print(val_loss)
print('val_acc (accuracy) is:')
print(val_acc)
