# Wielowarstwowa sieć neuronowa

(*Multilayer perceptron*, *feedforward neural network*)



**Uwaga:** "Input layer" pomimo tego, że ma w nazwie słowo "warstwa", to tak naprawdę to nie jest żadna warstwa sieci... To są po prostu dane wejściowe... Niestety przyjęło się literaturze nazywanie tego w ten sposób, co jest mylące :(


Sieci uczy sie metodą spadku gradientu (pewnymi wariantami tej metody). Uczenie wykorzystuje algorytm **propagacji wstecznej** (https://en.wikipedia.org/wiki/Backpropagation).

<br>

<br>

<br>

**Uwaga!** Sieci neuronowe absolutnie zawsze wymagają zestandaryzowanych danych! Niezależnie od tego czy wykorzystujemy regularyzację czy nie i niezależnie od typu sieci!

<br>

<br>

### Fakt matematyczny: jednowarstwową siecią możemy otrzymać dowolny kształt. 

Co z tego wynika? To, że (teoretycznie) zawsze wystarczy sieć jednowarstwowa (odpowiednio duża). W praktyce rzeczywiście z reguły wystarcza jedna warstwa, ale mimo wszystko zawsze warto sprawdzić czy 2 (lub 3) nie zadziałają przypadkiem lepiej. Przy czym jeżeli dla dwóch wartsw jest gorzej, to nie ma sensu sprawdzać dla większej ilości.

In [1]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score


from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Zad
* Wczytaj zbiór danych
* Podziel dane na train test
* Wykonaj uczenie modeli (dobierz najlepsze parametry)
    * LogisticRegression
    * LinearSVC
    * SVC
    * KNeighborsClassifier
    * DecisionTreeClassifier
    * RandomForestClassifier
    * BaggingClassifier
    * ExtraTreesClassifier
    * AdaBoostClassifier
    * GradientBoostingClassifier
    * VotingClassifier
    * xgboost.XGBClassifier
* Porównaj wyniki na zbiorze uczącym    

In [4]:
dataset = np.loadtxt('./datasets/diabetes.csv', delimiter=",", skiprows=1)

X = dataset[:,0:8]
Y = dataset[:,8]

print(X.shape)
print(np.mean(Y))

seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

from sklearn.model_selection import StratifiedKFold

seed=123
kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

(768, 8)
0.3489583333333333


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC(C=1,kernel='rbf',probability=True))])

param_grid = {
            'preprocessing': [StandardScaler(), None],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_2 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_2.fit(X_train, y_train)
grid_2.best_params_

{'classifier__C': 10,
 'classifier__gamma': 0.001,
 'preprocessing': StandardScaler()}

In [6]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', LogisticRegression(C=1))])

param_grid = {
            'preprocessing': [StandardScaler(), None],
            'classifier__C': [1, 10, 100, 1000, 10000]
}

grid_3 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_3.fit(X_train, y_train)
grid_3.best_params_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'classifier__C': 100, 'preprocessing': None}

In [7]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', LinearSVC(C=1))])

param_grid = {
            'preprocessing': [StandardScaler(), None],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_1 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_1.fit(X_train, y_train)
grid_1.best_params_



{'classifier__C': 0.01, 'preprocessing': StandardScaler()}

In [8]:
from sklearn.neighbors import KNeighborsClassifier as KNN

pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', KNN())])

param_grid = {
            'preprocessing': [StandardScaler()],
            'classifier__n_neighbors': [n for n in range(2,20)]
}

grid_4 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_4.fit(X_train, y_train)
grid_4.best_params_

{'classifier__n_neighbors': 18, 'preprocessing': StandardScaler()}

In [9]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', DecisionTreeClassifier())])

param_grid = {
            'preprocessing': [StandardScaler(), None],
            'classifier__max_depth': [4,5,6,7,8,9,10,11,12,15,20,30,40],
            'classifier__criterion': ['entropy', 'gini'],
            'classifier__min_samples_leaf': [2, 3, 6]
}

grid_5 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_5.fit(X_train, y_train)
grid_5.best_params_

{'classifier__criterion': 'entropy',
 'classifier__max_depth': 5,
 'classifier__min_samples_leaf': 2,
 'preprocessing': StandardScaler()}

In [10]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', RandomForestClassifier())])

param_grid = {
            'preprocessing': [StandardScaler(), None],
            'classifier__max_depth': [4,5,6,7,8,9,10,11],
            'classifier__min_samples_leaf': [2, 3],
            'classifier__n_estimators': [10, 20]
}

grid_6 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_6.fit(X_train, y_train)
grid_6.best_params_

{'classifier__max_depth': 5,
 'classifier__min_samples_leaf': 3,
 'classifier__n_estimators': 10,
 'preprocessing': StandardScaler()}

In [11]:
from sklearn.ensemble import BaggingClassifier

pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', BaggingClassifier(DecisionTreeClassifier()))])

param_grid = {"classifier__base_estimator__max_depth": [3,20],
          "classifier__base_estimator__max_features": [None, "auto"],
          "classifier__base_estimator__min_samples_leaf": [1, 3, 10],
          'classifier__max_samples': [0.5, 1.0],
          'classifier__n_estimators': [2, 10, 20],
}

grid_7 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_7.fit(X_train, y_train)
grid_7.best_params_

{'classifier__base_estimator__max_depth': 20,
 'classifier__base_estimator__max_features': None,
 'classifier__base_estimator__min_samples_leaf': 10,
 'classifier__max_samples': 0.5,
 'classifier__n_estimators': 20}

In [12]:
from sklearn.ensemble import AdaBoostClassifier

param_grid = {
    'n_estimators': [10, 20, 50, 70, 200, 400],
    'learning_rate': [10**n for n in range(-5,5)]
}

grid_8 = GridSearchCV(AdaBoostClassifier(), param_grid, cv=kfold, return_train_score=True)
grid_8.fit(X_train, y_train)
grid_8.best_params_

  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight *= np.exp(
  retur

{'learning_rate': 0.1, 'n_estimators': 200}

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {
    "max_features":["log2","sqrt"],
    'learning_rate': [10**n for n in range(-4,4)],
    'max_depth': [3,4,5],
    'min_samples_leaf': [4,5,6],
    'n_estimators': [5,10,15,20]
}

grid_9 = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=kfold, return_train_score=True)
grid_9.fit(X_train, y_train)
grid_9.best_params_

{'learning_rate': 0.1,
 'max_depth': 3,
 'max_features': 'log2',
 'min_samples_leaf': 5,
 'n_estimators': 20}

In [14]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression


clf = VotingClassifier(estimators=[ 
    ('lr',LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ], voting='soft')

param_grid = {
    'lr__C': [10**n for n in range(-5,5)],
    'rf__n_estimators': [5, 10, 15, 20, 50]
}

grid_10 = GridSearchCV(clf, param_grid, cv=kfold, return_train_score=True)
grid_10.fit(X_train, y_train)
grid_10.best_params_


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'lr__C': 100, 'rf__n_estimators': 10}

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import uniform, randint

param_distribution = {
    'max_depth': randint(3, 11),
    'learning_rate': uniform(0.001, 0.1-0.001),
    'n_estimators': randint(50, 400),
    'gamma': uniform(0,2),
    'colsample_bytree': uniform(0.5, 0.5),
    'subsample': uniform(0.5, 0.5),
    'min_child_weight': randint(1, 11)
}
from xgboost import XGBClassifier

grid_11 = RandomizedSearchCV(XGBClassifier(), param_distributions=param_distribution, cv=kfold,return_train_score=True)
grid_11.fit(X_train, y_train)
grid_11.best_params_

{'colsample_bytree': 0.5621708875881891,
 'gamma': 1.963258607524871,
 'learning_rate': 0.06482850219351063,
 'max_depth': 8,
 'min_child_weight': 6,
 'n_estimators': 73,
 'subsample': 0.8618463018420285}

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

param_grid = {
    'n_estimators':[10,20,30,40,50,100],
    'max_depth': [4,5,10,20],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

grid_12 = GridSearchCV(ExtraTreesClassifier(), param_grid=param_grid, cv=kfold,return_train_score=True)
grid_12.fit(X_train, y_train)
grid_12.best_params_

{'bootstrap': True, 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 40}

In [17]:
from sklearn import  metrics


models = []
models.append(('SVM linear', grid_1.best_estimator_))
models.append(('SVM rbf', grid_2.best_estimator_))
models.append(('LR', grid_3.best_estimator_))
models.append(('KNN', grid_4.best_estimator_))
models.append(('DecisionTreeClassifier', grid_5.best_estimator_))
models.append(('BaggingClassifier', grid_7.best_estimator_))
models.append(('RandomForestClassifier', grid_6.best_estimator_))
models.append(('ExtraTreesClassifier', grid_12.best_estimator_))
models.append(('AdaBoostClassifier', grid_8.best_estimator_))
models.append(('GradientBoostingClassifier', grid_9.best_estimator_))
models.append(('XGBClassifier', grid_11.best_estimator_))
models.append(('voting_clf', grid_10.best_estimator_))

precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
roc_auc_score = []
for name, model in models:
    print(name)
    print("precision_score: {}".format(metrics.precision_score(y_test , model.predict(X_test)) ))
    print("recall_score: {}".format( metrics.recall_score(y_test , model.predict(X_test)) ))
    print("f1_score: {}".format( metrics.f1_score(y_test , model.predict(X_test)) ))
    print("accuracy_score: {}".format( metrics.accuracy_score(y_test , model.predict(X_test)) ))
    
    if (name == 'SVM linear'):
        print("roc_auc_score: {}".format( metrics.roc_auc_score(y_test , model.decision_function(X_test)) ))            
    else:
        print("roc_auc_score: {}".format( metrics.roc_auc_score(y_test , model.predict_proba(X_test)[:,1]) ))
    
    precision_score.append(metrics.precision_score(y_test , model.predict(X_test)))
    recall_score.append(metrics.recall_score(y_test , model.predict(X_test)))
    f1_score.append( metrics.f1_score(y_test , model.predict(X_test)))
    accuracy_score.append(metrics.accuracy_score(y_test , model.predict(X_test)))
    if (name == 'SVM linear'):
        roc_auc_score.append(metrics.roc_auc_score(y_test , model.decision_function(X_test)))        
    else:    
        roc_auc_score.append(metrics.roc_auc_score(y_test , model.predict_proba(X_test)[:,1]))

SVM linear
precision_score: 0.7215189873417721
recall_score: 0.6195652173913043
f1_score: 0.6666666666666666
accuracy_score: 0.7755905511811023
roc_auc_score: 0.8313875469672571
SVM rbf
precision_score: 0.7605633802816901
recall_score: 0.5869565217391305
f1_score: 0.6625766871165645
accuracy_score: 0.7834645669291339
roc_auc_score: 0.8350442834138486
LR
precision_score: 0.7108433734939759
recall_score: 0.6413043478260869
f1_score: 0.6742857142857143
accuracy_score: 0.7755905511811023
roc_auc_score: 0.8271604938271605
KNN
precision_score: 0.7352941176470589
recall_score: 0.5434782608695652
f1_score: 0.625
accuracy_score: 0.7637795275590551
roc_auc_score: 0.8052871712292001
DecisionTreeClassifier
precision_score: 0.6984126984126984
recall_score: 0.4782608695652174
f1_score: 0.5677419354838709
accuracy_score: 0.7362204724409449
roc_auc_score: 0.7930756843800323
BaggingClassifier
precision_score: 0.6976744186046512
recall_score: 0.6521739130434783
f1_score: 0.6741573033707865
accuracy_scor

In [18]:
import pandas as pd
d = {'precision_score': precision_score, 
     'recall_score': recall_score, 
     'f1_score': f1_score,
     'accuracy_score' : accuracy_score,
     'roc_auc_score' : roc_auc_score
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['SVM linear','SVM rbf','LR','KNN', 'DecisionTreeClassifier','BaggingClassifier','RandomForestClassifier','ExtraTreesClassifier', 'AdaBoostClassifier','GradientBoostingClassifier','XGBClassifier', 'voting'])
df

Unnamed: 0,Method,precision_score,recall_score,f1_score,accuracy_score,roc_auc_score
0,SVM linear,0.721519,0.619565,0.666667,0.775591,0.831388
1,SVM rbf,0.760563,0.586957,0.662577,0.783465,0.835044
2,LR,0.710843,0.641304,0.674286,0.775591,0.82716
3,KNN,0.735294,0.543478,0.625,0.76378,0.805287
4,DecisionTreeClassifier,0.698413,0.478261,0.567742,0.73622,0.793076
5,BaggingClassifier,0.697674,0.652174,0.674157,0.771654,0.82649
6,RandomForestClassifier,0.678571,0.619565,0.647727,0.755906,0.812735
7,ExtraTreesClassifier,0.784314,0.434783,0.559441,0.751969,0.827697
8,AdaBoostClassifier,0.651685,0.630435,0.640884,0.744094,0.813775
9,GradientBoostingClassifier,0.71831,0.554348,0.625767,0.759843,0.840982


# MLPClassifier

Dodajmy model sieci neuronowej

In [19]:
from sklearn.neural_network import MLPClassifier

In [20]:
model = MLPClassifier((20,10))
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_test)[:,1]
predictions = y_pred.round()

accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0), "AUC: ", metrics.roc_auc_score(y_score=y_pred,y_true=y_test))

Accuracy: 68.11% AUC:  0.6979334406870639




# Zad
Wykonaj Walidację krzyżową

In [21]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', MLPClassifier())])

param_grid = {
            'preprocessing': [StandardScaler(), None],
            'classifier__hidden_layer_sizes': [(20,10)],
            'classifier__learning_rate_init': [0.001],#, 0.01, 0.1],
            'classifier__max_iter': [100],
            'classifier__batch_size': [8, 16,32],
}

grid_13 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_13.fit(X_train, y_train)
grid_13.best_params_



{'classifier__batch_size': 32,
 'classifier__hidden_layer_sizes': (20, 10),
 'classifier__learning_rate_init': 0.001,
 'classifier__max_iter': 100,
 'preprocessing': StandardScaler()}

In [22]:
metrics.accuracy_score(y_test, grid_2.best_estimator_.predict(X_test))

0.7834645669291339

In [23]:
from sklearn import  metrics


models = []
models.append(('SVM linear', grid_1.best_estimator_))
models.append(('SVM rbf', grid_2.best_estimator_))
models.append(('LR', grid_3.best_estimator_))
models.append(('KNN', grid_4.best_estimator_))
models.append(('DecisionTreeClassifier', grid_5.best_estimator_))
models.append(('BaggingClassifier', grid_7.best_estimator_))
models.append(('RandomForestClassifier', grid_6.best_estimator_))
models.append(('ExtraTreesClassifier', grid_12.best_estimator_))
models.append(('AdaBoostClassifier', grid_8.best_estimator_))
models.append(('GradientBoostingClassifier', grid_9.best_estimator_))
models.append(('XGBClassifier', grid_11.best_estimator_))
models.append(('voting_clf', grid_10.best_estimator_))
models.append(('MLP', grid_13.best_estimator_))

precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
roc_auc_score = []
for name, model in models:
    print(name)
    print("precision_score: {}".format(metrics.precision_score(y_test , model.predict(X_test)) ))
    print("recall_score: {}".format( metrics.recall_score(y_test , model.predict(X_test)) ))
    print("f1_score: {}".format( metrics.f1_score(y_test , model.predict(X_test)) ))
    print("accuracy_score: {}".format( metrics.accuracy_score(y_test , model.predict(X_test)) ))
    
    if (name == 'SVM linear'):
        print("roc_auc_score: {}".format( metrics.roc_auc_score(y_test , model.decision_function(X_test)) ))            
    else:
        print("roc_auc_score: {}".format( metrics.roc_auc_score(y_test , model.predict_proba(X_test)[:,1]) ))
    
    precision_score.append(metrics.precision_score(y_test , model.predict(X_test)))
    recall_score.append(metrics.recall_score(y_test , model.predict(X_test)))
    f1_score.append( metrics.f1_score(y_test , model.predict(X_test)))
    accuracy_score.append(metrics.accuracy_score(y_test , model.predict(X_test)))
    if (name == 'SVM linear'):
        roc_auc_score.append(metrics.roc_auc_score(y_test , model.decision_function(X_test)))        
    else:    
        roc_auc_score.append(metrics.roc_auc_score(y_test , model.predict_proba(X_test)[:,1]))

SVM linear
precision_score: 0.7215189873417721
recall_score: 0.6195652173913043
f1_score: 0.6666666666666666
accuracy_score: 0.7755905511811023
roc_auc_score: 0.8313875469672571
SVM rbf
precision_score: 0.7605633802816901
recall_score: 0.5869565217391305
f1_score: 0.6625766871165645
accuracy_score: 0.7834645669291339
roc_auc_score: 0.8350442834138486
LR
precision_score: 0.7108433734939759
recall_score: 0.6413043478260869
f1_score: 0.6742857142857143
accuracy_score: 0.7755905511811023
roc_auc_score: 0.8271604938271605
KNN
precision_score: 0.7352941176470589
recall_score: 0.5434782608695652
f1_score: 0.625
accuracy_score: 0.7637795275590551
roc_auc_score: 0.8052871712292001
DecisionTreeClassifier
precision_score: 0.6984126984126984
recall_score: 0.4782608695652174
f1_score: 0.5677419354838709
accuracy_score: 0.7362204724409449
roc_auc_score: 0.7930756843800323
BaggingClassifier
precision_score: 0.6976744186046512
recall_score: 0.6521739130434783
f1_score: 0.6741573033707865
accuracy_scor

In [24]:
import pandas as pd
d = {'precision_score': precision_score, 
     'recall_score': recall_score, 
     'f1_score': f1_score,
     'accuracy_score' : accuracy_score,
     'roc_auc_score' : roc_auc_score
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['SVM linear','SVM rbf','LR','KNN', 'DecisionTreeClassifier','BaggingClassifier','RandomForestClassifier','ExtraTreesClassifier', 'AdaBoostClassifier','GradientBoostingClassifier','XGBClassifier', 'voting', 'MLP'])
df

Unnamed: 0,Method,precision_score,recall_score,f1_score,accuracy_score,roc_auc_score
0,SVM linear,0.721519,0.619565,0.666667,0.775591,0.831388
1,SVM rbf,0.760563,0.586957,0.662577,0.783465,0.835044
2,LR,0.710843,0.641304,0.674286,0.775591,0.82716
3,KNN,0.735294,0.543478,0.625,0.76378,0.805287
4,DecisionTreeClassifier,0.698413,0.478261,0.567742,0.73622,0.793076
5,BaggingClassifier,0.697674,0.652174,0.674157,0.771654,0.82649
6,RandomForestClassifier,0.678571,0.619565,0.647727,0.755906,0.812735
7,ExtraTreesClassifier,0.784314,0.434783,0.559441,0.751969,0.827697
8,AdaBoostClassifier,0.651685,0.630435,0.640884,0.744094,0.813775
9,GradientBoostingClassifier,0.71831,0.554348,0.625767,0.759843,0.840982


# Wczytaj dane treningowe i testowe

In [25]:
# Wczytaj dane treningowe i testowe

import pandas as pd

train_set = pd.read_csv('Dane/adult/adult.data', sep=", ",header = None)
test_set = pd.read_csv('Dane/adult/adult.test', sep=", ",skiprows = 1, header = None) # Make sure to skip a row for the test set


col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 
              'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
             'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels

train = train_set.replace('?', np.nan).dropna()
test = test_set.replace('?', np.nan).dropna()



dataset = pd.concat([train,test])

dataset['wage_class'] = dataset.wage_class.replace({'<=50K.': 0,'<=50K':0, '>50K.':1, '>50K':1})

dataset.drop(["fnlwgt"],axis=1,inplace=True)

dataset.drop(["education"],axis=1,inplace=True)

x = dataset.groupby('native_country')["wage_class"].mean()

d = dict(pd.cut(x[x.index!=" United-States"],5,labels=range(5)))

dataset['native_country'] = dataset['native_country'].replace(d)

dataset = pd.get_dummies(dataset,drop_first=True)

train = dataset.iloc[:train.shape[0]]
test = dataset.iloc[train.shape[0]:]

X_train = train.drop("wage_class",axis=1)
y_train = train.wage_class

X_test = test.drop("wage_class",axis=1)
y_test = test.wage_class

# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

# print(X_train.shape)
# X_test.shape

  train_set = pd.read_csv('Dane/adult/adult.data', sep=", ",header = None)
  test_set = pd.read_csv('Dane/adult/adult.test', sep=", ",skiprows = 1, header = None) # Make sure to skip a row for the test set


In [26]:
print(X_train.shape)
print(X_test.shape)

(30162, 41)
(15060, 41)


# Zad
Porównaj wyniki sieci na:
* oryginalnych danych 
* na wystandaryzowanych

In [27]:
#oryginalne

model = MLPClassifier((20,10))
model.fit(X_train, y_train)


from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, model.predict(X_test)))

0.8363877822045153


In [28]:
#skalowane

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

model = MLPClassifier((20,10))
model.fit(X_train, y_train)


from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, model.predict(X_test)))

0.84667994687915


