In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.cross_validation import train_test_split



In [2]:
data = pd.read_csv("emails.csv")
text = data[data.columns[0]]
spam = data[data.columns[1]]

In [3]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(text)
X_train_counts.shape

(5728, 37303)

In [4]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tf, spam, test_size = 0.20)

In [6]:
clf = MLPClassifier(solver='adam')
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [7]:
clf.predict(X_train[:10])

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1], dtype=int64)

In [8]:
clf.predict_proba(X_train[:10])

array([[9.98679968e-01, 1.32003214e-03],
       [9.99641549e-01, 3.58451280e-04],
       [9.99908392e-01, 9.16077955e-05],
       [9.99970282e-01, 2.97176824e-05],
       [9.99963661e-01, 3.63386162e-05],
       [1.34996976e-04, 9.99865003e-01],
       [9.99932882e-01, 6.71180464e-05],
       [9.99857681e-01, 1.42318796e-04],
       [9.99897245e-01, 1.02755221e-04],
       [5.42866194e-04, 9.99457134e-01]])

In [9]:
clf.score(X_train, y_train)

1.0

In [10]:
#Testando com a base de teste
clf.predict(X_test[:10])

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

In [11]:
clf.predict_proba(X_test[:10])

array([[9.99778838e-01, 2.21161531e-04],
       [9.97233812e-01, 2.76618759e-03],
       [9.99598397e-01, 4.01602844e-04],
       [9.99930696e-01, 6.93035615e-05],
       [7.70510898e-03, 9.92294891e-01],
       [9.99994000e-01, 6.00021212e-06],
       [9.99978433e-01, 2.15669057e-05],
       [9.95667890e-01, 4.33210994e-03],
       [5.19832839e-02, 9.48016716e-01],
       [9.99876575e-01, 1.23425151e-04]])

In [12]:
clf.score(X_test, y_test)

0.9965095986038395