In [23]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
import pandas

In [14]:
fertility_df = pandas.read_csv('fertility_Diagnosis.txt', header=-1)
labels = ['Season','Age','Childish diseases','Accident or serious trauma','Surgical intervention',
          'High fevers in the last year','Frequency of alcohol consumption','Smoking habit',
          'Number of hours spent sitting per day ene-16','Output']
fertility_df.columns = labels
fertility_df['Output'] = fertility_df['Output'].map({'N': 0, 'O': 1}).astype(int)
fertility_df

Unnamed: 0,Season,Age,Childish diseases,Accident or serious trauma,Surgical intervention,High fevers in the last year,Frequency of alcohol consumption,Smoking habit,Number of hours spent sitting per day ene-16,Output
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,0
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,1
2,-0.33,0.50,1,0,0,0,1.0,-1,0.50,0
3,-0.33,0.75,0,1,1,0,1.0,-1,0.38,0
4,-0.33,0.67,1,1,0,0,0.8,-1,0.50,1
5,-0.33,0.67,1,0,1,0,0.8,0,0.50,0
6,-0.33,0.67,0,0,0,-1,0.8,-1,0.44,0
7,-0.33,1.00,1,1,1,0,0.6,-1,0.38,0
8,1.00,0.64,0,0,1,0,0.8,-1,0.25,0
9,1.00,0.61,1,0,0,0,1.0,-1,0.25,0


In [15]:
training_data, test_data, training_output, test_output = train_test_split(fertility_df, fertility_df['Output'], test_size=0.3)
del training_data['Output']
del test_data['Output']

In [16]:
def normalize_features(feature_matrix):
    norms = np.linalg.norm(feature_matrix, axis=0)
    return feature_matrix / norms, norms

In [17]:
normalized_training_data, norms = normalize_features(training_data)

In [18]:
normalized_training_data

Unnamed: 0,Season,Age,Childish diseases,Accident or serious trauma,Surgical intervention,High fevers in the last year,Frequency of alcohol consumption,Smoking habit,Number of hours spent sitting per day ene-16
44,-0.147538,0.093719,0.131306,0.182574,0.000000,0.2,0.143503,0.000000,0.081774
56,-0.048687,0.093719,0.131306,0.182574,0.000000,0.0,0.114802,0.000000,0.197841
48,-0.048687,0.113170,0.131306,0.182574,0.152499,0.0,0.114802,-0.140028,0.081774
89,-0.048687,0.143231,0.131306,0.182574,0.152499,0.0,0.143503,0.140028,0.100239
22,0.147538,0.118475,0.131306,0.182574,0.000000,0.0,0.114802,-0.140028,0.065947
73,-0.147538,0.088414,0.131306,0.000000,0.000000,0.2,0.114802,-0.140028,0.116067
3,-0.048687,0.132621,0.000000,0.182574,0.152499,0.0,0.143503,-0.140028,0.100239
71,0.048687,0.122011,0.131306,0.000000,0.000000,0.2,0.143503,-0.140028,0.081774
29,0.147538,0.118475,0.000000,0.000000,0.152499,0.0,0.086102,0.000000,0.131894
49,-0.048687,0.132621,0.131306,0.182574,0.152499,0.0,0.086102,-0.140028,0.050120


In [22]:
normalized_training_data.mean(axis=0)

Season                                         -0.022320
Age                                             0.117793
Childish diseases                               0.108797
Accident or serious trauma                      0.078246
Surgical intervention                           0.093678
High fevers in the last year                    0.037143
Frequency of alcohol consumption                0.117262
Smoking habit                                  -0.054011
Number of hours spent sitting per day ene-16    0.109472
dtype: float64

In [21]:
normalized_training_data.std(axis=0)

Season                                          0.118268
Age                                             0.020410
Childish diseases                               0.049844
Accident or serious trauma                      0.091003
Surgical intervention                           0.074767
High fevers in the last year                    0.114425
Frequency of alcohol consumption                0.023303
Smoking habit                                   0.107393
Number of hours spent sitting per day ene-16    0.048321
dtype: float64

In [24]:
scaled_training_data = preprocessing.scale(training_data)

In [26]:
scaled_training_data.mean(axis=0)

array([  3.17206578e-17,   6.81994144e-17,  -1.07850237e-16,
        -3.17206578e-18,  -1.14194368e-16,  -1.66533454e-17,
         6.15380762e-16,  -6.02692499e-17,   1.39570895e-16])

In [27]:
scaled_training_data.std(axis=0)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [28]:
normalized_training_data = preprocessing.normalize(training_data)

In [29]:
normalized_training_data.mean(axis=0)

array([-0.06372877,  0.30275861,  0.37046235,  0.18653246,  0.27140812,
        0.08219438,  0.37123681, -0.16934525,  0.19083647])

In [30]:
normalized_training_data.std(axis=0)

array([ 0.36084317,  0.05280054,  0.17452312,  0.21738913,  0.21843362,
        0.25041942,  0.07407596,  0.33782803,  0.09031904])

In [7]:
clf = MLPClassifier(alpha=1e-5)
clf.fit(training_data, training_output)
print(clf)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)




In [8]:
clf.get_params()

{'activation': 'relu',
 'alpha': 1e-05,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_iter': 200,
 'momentum': 0.9,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [9]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 90 to 84
Data columns (total 9 columns):
Season                                          70 non-null float64
Age                                             70 non-null float64
Childish diseases                               70 non-null int64
Accident or serious trauma                      70 non-null int64
Surgical intervention                           70 non-null int64
High fevers in the last year                    70 non-null int64
Frequency of alcohol consumption                70 non-null float64
Smoking habit                                   70 non-null int64
Number of hours spent sitting per day ene-16    70 non-null float64
dtypes: float64(4), int64(5)
memory usage: 5.5 KB


In [11]:
training_output.describe()

count    70.000000
mean      0.128571
std       0.337142
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       1.000000
Name: Output, dtype: float64

In [12]:
test_output.describe()

count    30.000000
mean      0.100000
std       0.305129
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       1.000000
Name: Output, dtype: float64