In [1]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv('eeg_dataset.csv')

# Separate out the x_data and y_data.
x_data = data.loc[:, data.columns != "y"]
y_data = data.loc[:, "y"]

# The random state to use while splitting the data.
random_state = 100

In [3]:
random_state

100

In [4]:
# XXX
# TODO: Split 70% of the data into training and 30% into test sets. Call them x_train, x_test, y_train and y_test.
# Use the train_test_split method in sklearn with the parameter 'shuffle' set to true and the 'random_state' set to 100.
# XXX

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.30, random_state=random_state, shuffle=True)

regr = LinearRegression()

# Train the model using the training sets
regr.fit(x_train, y_train)

# Make predictions using the testing set
y_test_pred_regr = regr.predict(x_test)
y_train_pred_regr = regr.predict(x_train)
test_accuracy_regr = round(accuracy_score(y_test, y_test_pred_regr.round()), 2)
train_accurac_regr = round(accuracy_score(y_train, y_train_pred_regr.round()), 2)



In [5]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_test_pred_rf = rf.predict(x_test)
y_train_pred_rf = rf.predict(x_train)
test_accuracy_rf = round(accuracy_score(y_test, y_test_pred_rf.round()), 2)
train_accurac_rf = round(accuracy_score(y_train, y_train_pred_rf.round()), 2)



In [6]:
svc = SVC()
svc.fit(x_train, y_train)
y_test_pred_svc = svc.predict(x_test)
y_train_pred_svc = svc.predict(x_train)
test_accuracy_svc = round(accuracy_score(y_test, y_test_pred_svc.round()), 2)
train_accurac_svc = round(accuracy_score(y_train, y_train_pred_svc.round()), 2)



In [7]:
test_accuracy_svc

0.54

In [8]:
train_accurac_svc

1.0

In [9]:
scaler = StandardScaler()

scaler_train = scaler.fit(x_train)

# scaler_train

# scaler_train.mean_

pp =scaler_train.transform(x_train)

# pp

# rf

params_rf = {
    'bootstrap': [True],
    'max_depth': [14, 40, 60, 100],
    'n_estimators': [14, 35, 60, 85]
}



rf_tune = GridSearchCV(estimator = rf, param_grid = params_rf, cv=10, n_jobs = -1, verbose = 2)

rf_tune.fit(pp, y_train)

print(rf_tune.best_params_)

rf2 = RandomForestClassifier(n_estimators=85, max_depth=60)

rf2.fit(pp, y_train)

y_test_pred_rf_tune = rf2.predict(scaler.transform(x_test))

test_accuracy_rf_tune = round(accuracy_score(y_test, y_test_pred_rf_tune.round()), 2)

# print(test_accuracy_rf_tune)
print(rf_tune.best_score_)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   33.2s finished


{'bootstrap': True, 'max_depth': 60, 'n_estimators': 85}
0.9228495136372306


In [10]:
print(rf_tune.best_score_)

0.9228495136372306


In [11]:
params_svc = {'kernel':('linear', 'rbf'), 'C':[0.001, 0.01, 0.1, 1, 10]}

svc_tune = GridSearchCV(estimator = svc, param_grid = params_svc, cv=10, n_jobs = -1, verbose = 2)

svc_tune.fit(pp, y_train)

print(svc_tune.best_params_)

svc2 = SVC(C=10, kernel='rbf')

svc2.fit(pp, y_train)

y_test_pred_svc_tune = svc2.predict(scaler.transform(x_test))

test_accuracy_svc_tune = round(accuracy_score(y_test, y_test_pred_svc_tune.round()), 2)
# print(test_accuracy_svc_tune)


print(round(svc_tune.best_score_, 2))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.5min finished


{'C': 10, 'kernel': 'rbf'}
0.77


In [12]:
svc_tune.cv_results_.keys()


dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_kernel', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'split5_train_score', 'split6_train_score', 'split7_train_score', 'split8_train_score', 'split9_train_score', 'mean_train_score', 'std_train_score'])

In [28]:
svc_tune.cv_results_['mean_fit_time'][9]

svc_tune.cv_results_['mean_train_score'][9]

svc_tune.cv_results_['mean_test_score'][9]

4.76426362991333

In [35]:
important_features_dict = {}
for x,i in enumerate(rf.feature_importances_):
    important_features_dict[x]=i


important_features_list = sorted(important_features_dict,
                                 key=important_features_dict.get,
                                 reverse=True)



In [36]:
print('Most important features: %s', important_features_list)

Most important features: %s [6, 5, 1, 0, 13, 12, 11, 3, 10, 4, 9, 2, 7, 8]


In [37]:
pca = PCA(n_components=10,svd_solver='full')

In [38]:
pca.fit(x_data)

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)

In [42]:
print(pca.explained_variance_ratio_)

[5.05244700e-01 3.76936309e-01 1.17729460e-01 4.59941145e-05
 1.92788922e-05 1.12325809e-05 6.78642894e-06 1.88971137e-06
 1.54088292e-06 8.24289754e-07]


In [40]:
print(pca.singular_values_)

[886690.55021511 765870.22149031 428019.7135883    8460.03827621
   5477.2458465    4180.81523164   3249.68937137   1714.82156063
   1548.48148676   1132.55981354]


In [43]:
print(pca.explained_variance_ratio_*100)

[5.05244700e+01 3.76936309e+01 1.17729460e+01 4.59941145e-03
 1.92788922e-03 1.12325809e-03 6.78642894e-04 1.88971137e-04
 1.54088292e-04 8.24289754e-05]
