# Linear Discriminant Analysis

In [4]:
import pandas as pd
# Load the preprocessed training set and keep only the columns used for the
# first LDA experiment (six predictors plus the 'Absent' target).
# NOTE: 'Work load Average/day ' is written with a trailing space on purpose;
# filter(items=...) keeps only labels that match exactly, so the label must be
# reproduced verbatim (presumably that is how the CSV header spells it).
train = (
    pd.read_csv('prep_train.csv')
    .filter(items=[
        'Reason for absence',
        'Age',
        'Work load Average/day ',
        'Disciplinary failure',
        'Education',
        'Son',
        'Absent',
    ])
)

# Show the first ten rows as the cell's output.
train.head(10)

Unnamed: 0,Reason for absence,Age,Work load Average/day,Disciplinary failure,Education,Son,Absent
0,1.0,0.0,0.194471,0.0,0.0,0.0,1.0
1,0.0,1.0,0.194471,1.0,0.0,0.0,0.0
2,0.5,0.0,0.194471,0.0,0.0,0.0,1.0
3,1.0,0.0,0.194471,0.0,0.0,0.0,1.0
4,0.5,0.0,0.194471,0.0,0.0,0.0,1.0
5,0.5,0.0,0.194471,0.0,0.0,0.0,1.0
6,1.0,0.0,0.194471,0.0,0.0,0.0,1.0
7,0.5,0.0,0.194471,0.0,0.0,1.0,1.0
8,1.0,0.0,0.194471,0.0,0.0,0.0,1.0
9,1.0,0.0,0.194471,0.0,1.0,0.0,1.0


In [5]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Hyper-parameter grid for LDA with the SVD solver.
# NOTE(review): per the scikit-learn docs, `store_covariance` only controls
# whether the covariance matrix is kept on the fitted estimator, so it is not
# expected to change the cross-validated accuracy -- consider dropping it from
# the search to halve the number of fits.
param_grid = {
    'solver': ['svd'],
    'store_covariance': [True, False],
    'tol': [1e-4, 1e-5, 1e-6, 1e-3, 1e-2],
}

# Hold out 30% of the rows for the final evaluation; the fixed random_state
# makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(['Absent'], axis=1),
    train['Absent'],
    test_size=0.30,
    random_state=101,
)

# `cv` is pinned explicitly: the implicit default was 3 folds (with a
# deprecation warning) and became 5 folds in scikit-learn 0.22, which would
# silently change the scores and possibly the selected parameters. 3 matches
# the fold count of the recorded run.
grid1 = GridSearchCV(
    LinearDiscriminantAnalysis(),
    param_grid,
    cv=3,
    refit=True,      # refit the best candidate on all of X_train
    verbose=3,
    scoring='accuracy',
)
grid1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.2s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] solver=svd, store_covariance=True, tol=0.0001 ...................
[CV]  solver=svd, store_covariance=True, tol=0.0001, score=0.788, total=   0.1s
[CV] solver=svd, store_covariance=True, tol=0.0001 ...................
[CV]  solver=svd, store_covariance=True, tol=0.0001, score=0.862, total=   0.0s
[CV] solver=svd, store_covariance=True, tol=0.0001 ...................
[CV]  solver=svd, store_covariance=True, tol=0.0001, score=0.871, total=   0.0s
[CV] solver=svd, store_covariance=True, tol=1e-05 ....................
[CV]  solver=svd, store_covariance=True, tol=1e-05, score=0.788, total=   0.0s
[CV] solver=svd, store_covariance=True, tol=1e-05 ....................
[CV]  solver=svd, store_covariance=True, tol=1e-05, score=0.862, total=   0.0s
[CV] solver=svd, store_covariance=True, tol=1e-05 ....................
[CV]  solver=svd, store_covariance=True, tol=1e-05, score=0.871, total=   0.0s
[CV] solver=svd, store_covariance=Tr

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LinearDiscriminantAnalysis(n_components=None,
                                                  priors=None, shrinkage=None,
                                                  solver='svd',
                                                  store_covariance=False,
                                                  tol=0.0001),
             iid='warn', n_jobs=None,
             param_grid={'solver': ['svd'], 'store_covariance': [True, False],
                         'tol': [0.0001, 1e-05, 1e-06, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [6]:
# Hyper-parameter grid for the covariance-based solvers. They are searched
# separately from 'svd' because, per the scikit-learn docs, `shrinkage` is only
# supported by the 'lsqr' and 'eigen' solvers.
param_grid = {
    'solver': ['lsqr', 'eigen'],
    'shrinkage': [None, 'auto'],
}

# Same arguments and random_state as the split used for grid1, so both
# searches are trained and later scored on identical train/test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(['Absent'], axis=1),
    train['Absent'],
    test_size=0.30,
    random_state=101,
)

# `cv` is pinned explicitly: the implicit default was 3 folds (with a
# deprecation warning) and became 5 folds in scikit-learn 0.22, which would
# silently change the scores and possibly the selected parameters. 3 matches
# the fold count of the recorded run.
grid2 = GridSearchCV(
    LinearDiscriminantAnalysis(),
    param_grid,
    cv=3,
    refit=True,      # refit the best candidate on all of X_train
    verbose=3,
    scoring='accuracy',
)
grid2.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] shrinkage=None, solver=lsqr .....................................
[CV] ......... shrinkage=None, solver=lsqr, score=0.788, total=   0.0s
[CV] shrinkage=None, solver=lsqr .....................................
[CV] ......... shrinkage=None, solver=lsqr, score=0.862, total=   0.0s
[CV] shrinkage=None, solver=lsqr .....................................
[CV] ......... shrinkage=None, solver=lsqr, score=0.871, total=   0.0s
[CV] shrinkage=None, solver=eigen ....................................
[CV] ........ shrinkage=None, solver=eigen, score=0.788, total=   0.0s
[CV] shrinkage=None, solver=eigen ....................................
[CV] ........ shrinkage=None, solver=eigen, score=0.862, total=   0.0s
[CV] shrinkage=None, solver=eigen ....................................
[CV] ........ shrinkage=None, solver=eigen, score=0.871, total=   0.0s
[CV] shrinkage=auto, solver=lsqr .....................................
[CV] ......... sh

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LinearDiscriminantAnalysis(n_components=None,
                                                  priors=None, shrinkage=None,
                                                  solver='svd',
                                                  store_covariance=False,
                                                  tol=0.0001),
             iid='warn', n_jobs=None,
             param_grid={'shrinkage': [None, 'auto'],
                         'solver': ['lsqr', 'eigen']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [7]:
# Report, for each fitted search in turn (grid1 then grid2): the selected
# hyper-parameters, the confusion matrix on the held-out test set, and the
# test accuracy, followed by a separator line.
for fitted_search in (grid1, grid2):
    print(fitted_search.best_params_)
    # predict() uses the best estimator refit on the training data (refit=True).
    predictions = fitted_search.predict(X_test)
    print(confusion_matrix(y_test, predictions))
    print(accuracy_score(y_test, predictions))
    print('-----------------------------------')

{'solver': 'svd', 'store_covariance': True, 'tol': 0.0001}
[[ 15  11]
 [  4 120]]
0.9
-----------------------------------
{'shrinkage': None, 'solver': 'lsqr'}
[[ 15  11]
 [  4 120]]
0.9
-----------------------------------


In [8]:
# Second experiment: reload the preprocessed data with a wider set of
# predictors and repeat both LDA grid searches on it.
# NOTE: 'Work load Average/day ' is written with a trailing space on purpose;
# filter(items=...) keeps only labels that match exactly.
train = pd.read_csv('prep_train.csv').filter(items=[
    'Reason for absence',
    'Month of absence',
    'Transportation expense',
    'Distance from Residence to Work',
    'Service time',
    'Age',
    'Work load Average/day ',
    'Hit target',
    'Disciplinary failure',
    'Son',
    'Weight',
    'Height',
    'Body mass index',
    'Absent',
])

# One 70/30 split shared by both searches. The original cell called
# train_test_split twice with identical arguments and random_state, which
# produced the same partitions twice; a single call is equivalent.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(['Absent'], axis=1),
    train['Absent'],
    test_size=0.30,
    random_state=101,
)

# --- Search 1: SVD solver -------------------------------------------------
# NOTE(review): per the scikit-learn docs, `store_covariance` only controls
# whether the covariance matrix is kept on the fitted estimator, so it is not
# expected to change the cross-validated accuracy.
param_grid = {
    'solver': ['svd'],
    'store_covariance': [True, False],
    'tol': [1e-4, 1e-5, 1e-6, 1e-3, 1e-2],
}

# `cv` is pinned explicitly: the implicit default was 3 folds (with a
# deprecation warning) and became 5 folds in scikit-learn 0.22. 3 matches the
# fold count of the recorded run.
grid1 = GridSearchCV(
    LinearDiscriminantAnalysis(),
    param_grid,
    cv=3,
    refit=True,
    verbose=3,
    scoring='accuracy',
)
grid1.fit(X_train, y_train)

# --- Search 2: lsqr / eigen solvers, with and without shrinkage -----------
# Kept separate from the SVD grid because, per the scikit-learn docs,
# `shrinkage` is only supported by the 'lsqr' and 'eigen' solvers.
param_grid = {
    'solver': ['lsqr', 'eigen'],
    'shrinkage': [None, 'auto'],
}

grid2 = GridSearchCV(
    LinearDiscriminantAnalysis(),
    param_grid,
    cv=3,
    refit=True,
    verbose=3,
    scoring='accuracy',
)
grid2.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] solver=svd, store_covariance=True, tol=0.0001 ...................
[CV]  solver=svd, store_covariance=True, tol=0.0001, score=0.788, total=   0.0s
[CV] solver=svd, store_covariance=True, tol=0.0001 ...................
[CV]  solver=svd, store_covariance=True, tol=0.0001, score=0.853, total=   0.0s
[CV] solver=svd, store_covariance=True, tol=0.0001 ...................
[CV]  solver=svd, store_covariance=True, tol=0.0001, score=0.862, total=   0.0s
[CV] solver=svd, store_covariance=True, tol=1e-05 ....................
[CV]  solver=svd, store_covariance=True, tol=1e-05, score=0.788, total=   0.0s
[CV] solver=svd, store_covariance=True, tol=1e-05 ....................
[CV]  solver=svd, store_covariance=True, tol=1e-05, score=0.853, total=   0.0s
[CV] solver=svd, store_covariance=True, tol=1e-05 ....................
[CV]  solver=svd, store_covariance=True, tol=1e-05, score=0.862, total=   0.0s
[CV] solver=svd, store_covariance=Tr

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ......... shrinkage=auto, solver=lsqr, score=0.828, total=   0.0s
[CV] shrinkage=auto, solver=lsqr .....................................
[CV] ......... shrinkage=auto, solver=lsqr, score=0.853, total=   0.0s
[CV] shrinkage=auto, solver=eigen ....................................
[CV] ........ shrinkage=auto, solver=eigen, score=0.780, total=   0.0s
[CV] shrinkage=auto, solver=eigen ....................................
[CV] ........ shrinkage=auto, solver=eigen, score=0.828, total=   0.0s
[CV] shrinkage=auto, solver=eigen ....................................
[CV] ........ shrinkage=auto, solver=eigen, score=0.853, total=   0.0s


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.3s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LinearDiscriminantAnalysis(n_components=None,
                                                  priors=None, shrinkage=None,
                                                  solver='svd',
                                                  store_covariance=False,
                                                  tol=0.0001),
             iid='warn', n_jobs=None,
             param_grid={'shrinkage': [None, 'auto'],
                         'solver': ['lsqr', 'eigen']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [9]:
# Report, for each fitted search in turn (grid1 then grid2): the selected
# hyper-parameters, the confusion matrix on the held-out test set, and the
# test accuracy, followed by a separator line.
for fitted_search in (grid1, grid2):
    print(fitted_search.best_params_)
    # predict() uses the best estimator refit on the training data (refit=True).
    predictions = fitted_search.predict(X_test)
    print(confusion_matrix(y_test, predictions))
    print(accuracy_score(y_test, predictions))
    print('-----------------------------------')

{'solver': 'svd', 'store_covariance': True, 'tol': 0.0001}
[[ 15  11]
 [  2 122]]
0.9133333333333333
-----------------------------------
{'shrinkage': None, 'solver': 'lsqr'}
[[ 15  11]
 [  3 121]]
0.9066666666666666
-----------------------------------
