# LDA

In [30]:
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
def write_predictions(predictions, file_name='pred.csv'):
    """Dump predictions to a two-column CSV file.

    The file starts with the header ``ID,Label``. Every prediction follows
    on its own line as ``<1-based position>,<value>``. No newline is written
    after the last row, and an existing file is overwritten.

    Parameters
    ----------
    predictions : iterable
        Values to write, in the order they should be numbered.
    file_name : str, optional
        Path of the output file. Defaults to ``'pred.csv'``.
    """
    with open(file_name, 'w') as out:
        out.write('ID,Label')
        # Each row carries its own leading '\n', so the file never ends
        # with an empty line.
        out.writelines(
            '\n{0},{1}'.format(row_id, label)
            for row_id, label in enumerate(predictions, start=1)
        )

In [3]:
# Feature files are transposed right after reading, so each row of the
# resulting frame corresponds to one column of the CSV file.
# Label files are read into a column named 'label', which is extracted
# immediately so the y_* names hold Series.
X_train = pd.read_csv('Train/trainVectors.csv', header=None).T
y_train = pd.read_csv('Train/trainLbls.csv', header=None, names=['label'])['label']

X_validation = pd.read_csv('Validation/valVectors.csv', header=None).T
y_validation = pd.read_csv('Validation/valLbls.csv', header=None, names=['label'])['label']

# Train and validation stacked together (train rows first) with a fresh
# 0..n-1 index; X and y stay row-aligned.
X = pd.concat([X_train, X_validation]).reset_index(drop=True)
y = pd.concat([y_train, y_validation]).reset_index(drop=True)

X_test = pd.read_csv('Test/testVectors.csv', header=None).T

In [None]:
# Baseline: LDA fitted on the training split only, scored on the validation
# split, then used to write the test-set prediction file.
estimator = LinearDiscriminantAnalysis().fit(X_train, y_train)

y_validation_predictions = estimator.predict(X_validation)
validation_accuracy = accuracy_score(y_validation, y_validation_predictions)
print('Validation Accuracy: %s' % validation_accuracy)

y_test_predictions = estimator.predict(X_test)
write_predictions(y_test_predictions, 'simple_model_results/test-pred-lda-vanilla.csv')

In [23]:
# Displays the current Unix timestamp; the value is not assigned to anything.
# NOTE(review): looks like a leftover scratch cell — consider removing it.
time()

1526140216.0276356

In [27]:
# 10-fold stratified cross-validation of plain LDA on the combined
# train + validation data (X, y).
cv = StratifiedKFold(10)
scores = []      # validation accuracy of each fold, in fold order
estimators = []  # fitted model of each fold, in fold order

for train_index, val_index in cv.split(X, y):
    # Fold-local names: binding X_train / y_train here would overwrite the
    # original training split created by the data-loading cell, so any cell
    # that uses X_train afterwards would silently train on the last fold.
    X_fold_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_fold_train, y_val = y.iloc[train_index], y.iloc[val_index]
    print('Fold contains {} training samples and {} validation samples.'.format(X_fold_train.shape[0], X_val.shape[0]))

    estimator = LinearDiscriminantAnalysis()
    # Start the clock right before fit() so "Training time" covers the fit
    # only, not the slicing and printing above.
    start_time = time()
    estimator.fit(X_fold_train, y_fold_train)
    print(' Training time: {} s'.format(round(time() - start_time)))

    y_val_pred = estimator.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    print(' Prediction accuracy: {0:.5f} '.format(acc))

    scores.append(acc)
    estimators.append(estimator)

# Summary over all folds (np.std is the population standard deviation).
mean_score = np.mean(scores)
stdev = np.std(scores)
print('Mean score {0:.5f}. Standard deviation: {1:.3f}'.format(mean_score, stdev))

(8128, 4096)
Fold contains 7304 training samples and 824 validation samples.
 Training time: 83 s
 Prediction accuracy: 0.808252427184466 
Fold contains 7304 training samples and 824 validation samples.
 Training time: 71 s
 Prediction accuracy: 0.7924757281553398 
Fold contains 7308 training samples and 820 validation samples.
 Training time: 71 s
 Prediction accuracy: 0.774390243902439 
Fold contains 7308 training samples and 820 validation samples.
 Training time: 71 s
 Prediction accuracy: 0.7841463414634147 
Fold contains 7317 training samples and 811 validation samples.




 Training time: 71 s
 Prediction accuracy: 0.8014796547472256 
Fold contains 7317 training samples and 811 validation samples.
 Training time: 71 s
 Prediction accuracy: 0.7928483353884094 
Fold contains 7321 training samples and 807 validation samples.
 Training time: 73 s
 Prediction accuracy: 0.8178438661710037 
Fold contains 7321 training samples and 807 validation samples.
 Training time: 70 s
 Prediction accuracy: 0.748451053283767 
Fold contains 7326 training samples and 802 validation samples.
 Training time: 71 s
 Prediction accuracy: 0.7418952618453866 
Fold contains 7326 training samples and 802 validation samples.
 Training time: 70 s
 Prediction accuracy: 0.729426433915212 
Mean score 0.7791209346056664 (0.028387292189462832)


In [29]:
# 10-fold stratified cross-validation of LDA preceded by a quantile
# transform to a normal output distribution, on the combined
# train + validation data (X, y).
cv = StratifiedKFold(10)
scores = []      # validation accuracy of each fold, in fold order
estimators = []  # fitted pipeline of each fold, in fold order

for train_index, val_index in cv.split(X, y):
    # Fold-local names: binding X_train / y_train here would overwrite the
    # original training split created by the data-loading cell, so any cell
    # that uses X_train afterwards would silently train on the last fold.
    X_fold_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_fold_train, y_val = y.iloc[train_index], y.iloc[val_index]
    print('Fold contains {} training samples and {} validation samples.'.format(X_fold_train.shape[0], X_val.shape[0]))

    # The transformer lives inside the pipeline, so it is fitted on the
    # fold's training rows only and merely applied to the validation rows.
    estimator = Pipeline([
        ('transformer', QuantileTransformer(output_distribution='normal')),
        ('lda', LinearDiscriminantAnalysis())
    ])
    # Start the clock right before fit() so "Training time" covers the fit
    # only, not the slicing and printing above.
    start_time = time()
    estimator.fit(X_fold_train, y_fold_train)
    print(' Training time: {} s'.format(round(time() - start_time)))

    y_val_pred = estimator.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    print(' Prediction accuracy: {0:.5f} '.format(acc))

    scores.append(acc)
    estimators.append(estimator)

# Summary over all folds (np.std is the population standard deviation).
mean_score = np.mean(scores)
stdev = np.std(scores)
print('Mean score {0:.5f}. Standard deviation: {1:.3f}'.format(mean_score, stdev))

Fold contains 7304 training samples and 824 validation samples.
 Training time: 78 s
 Prediction accuracy: 0.76335 
Fold contains 7304 training samples and 824 validation samples.
 Training time: 78 s
 Prediction accuracy: 0.75728 
Fold contains 7308 training samples and 820 validation samples.
 Training time: 79 s
 Prediction accuracy: 0.72073 
Fold contains 7308 training samples and 820 validation samples.
 Training time: 78 s
 Prediction accuracy: 0.74024 
Fold contains 7317 training samples and 811 validation samples.




 Training time: 78 s
 Prediction accuracy: 0.75339 
Fold contains 7317 training samples and 811 validation samples.




 Training time: 78 s
 Prediction accuracy: 0.73613 
Fold contains 7321 training samples and 807 validation samples.
 Training time: 78 s
 Prediction accuracy: 0.75836 
Fold contains 7321 training samples and 807 validation samples.
 Training time: 78 s
 Prediction accuracy: 0.71623 
Fold contains 7326 training samples and 802 validation samples.




 Training time: 79 s
 Prediction accuracy: 0.69576 
Fold contains 7326 training samples and 802 validation samples.




 Training time: 79 s
 Prediction accuracy: 0.67207 
Mean score 0.73136. Standard deviation: 0.029)


In [31]:
# 10-fold stratified cross-validation of QDA on the combined
# train + validation data (X, y).
# NOTE(review): the fold accuracies recorded for this cell are all below
# 0.10. Possibly the per-class covariance estimates are ill-conditioned with
# this many features; QuadraticDiscriminantAnalysis(reg_param=...) may be
# worth trying — confirm before relying on these numbers.
cv = StratifiedKFold(10)
scores = []      # validation accuracy of each fold, in fold order
estimators = []  # fitted model of each fold, in fold order

for train_index, val_index in cv.split(X, y):
    # Fold-local names: binding X_train / y_train here would overwrite the
    # original training split created by the data-loading cell, so any cell
    # that uses X_train afterwards would silently train on the last fold.
    X_fold_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_fold_train, y_val = y.iloc[train_index], y.iloc[val_index]
    print('Fold contains {} training samples and {} validation samples.'.format(X_fold_train.shape[0], X_val.shape[0]))

    estimator = QuadraticDiscriminantAnalysis()
    # Start the clock right before fit() so "Training time" covers the fit
    # only, not the slicing and printing above.
    start_time = time()
    estimator.fit(X_fold_train, y_fold_train)
    print(' Training time: {} s'.format(round(time() - start_time)))

    y_val_pred = estimator.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    print(' Prediction accuracy: {0:.5f} '.format(acc))

    scores.append(acc)
    estimators.append(estimator)

# Summary over all folds (np.std is the population standard deviation).
mean_score = np.mean(scores)
stdev = np.std(scores)
print('Mean score {0:.5f}. Standard deviation: {1:.3f}'.format(mean_score, stdev))

Fold contains 7304 training samples and 824 validation samples.




 Training time: 7 s
 Prediction accuracy: 0.07888 
Fold contains 7304 training samples and 824 validation samples.
 Training time: 8 s
 Prediction accuracy: 0.05218 
Fold contains 7308 training samples and 820 validation samples.
 Training time: 8 s
 Prediction accuracy: 0.07927 
Fold contains 7308 training samples and 820 validation samples.
 Training time: 8 s
 Prediction accuracy: 0.06829 
Fold contains 7317 training samples and 811 validation samples.
 Training time: 10 s
 Prediction accuracy: 0.07152 
Fold contains 7317 training samples and 811 validation samples.
 Training time: 7 s
 Prediction accuracy: 0.07645 
Fold contains 7321 training samples and 807 validation samples.
 Training time: 7 s
 Prediction accuracy: 0.05948 
Fold contains 7321 training samples and 807 validation samples.
 Training time: 7 s
 Prediction accuracy: 0.06320 
Fold contains 7326 training samples and 802 validation samples.
 Training time: 7 s
 Prediction accuracy: 0.09102 
Fold contains 7326 training 