# Classificação de Span em base de e-mail
#### O objetivo deste trabalho era verificar qual entre os modelos de classificação abaixo (Regressão Linear, SVM e Rede Neural) performaria melhor para classificação de span em uma base de e-mails em inglês.

## Regressão Logística

In [1]:
#Importação de pacotes
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split



In [2]:
#Carregando a base de dados em um dataframe e separando as colunas de dados (text) e classificação (span)
data = pd.read_csv("emails.csv")
text = data[data.columns[0]]
spam = data[data.columns[1]]

In [3]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(text)
X_train_counts.shape

(5728, 37303)

In [4]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tf, spam, test_size = 0.20)

In [6]:
clf = LogisticRegression(solver='saga').fit(X_train, y_train)


In [8]:
clf.predict(X_train[:10])

array([1, 0, 1, 1, 0, 0, 1, 0, 1, 0], dtype=int64)

In [9]:
clf.predict_proba(X_train[:10])

array([[0.06667826, 0.93332174],
       [0.9984576 , 0.0015424 ],
       [0.19958928, 0.80041072],
       [0.11988336, 0.88011664],
       [0.9805707 , 0.0194293 ],
       [0.9196491 , 0.0803509 ],
       [0.29700872, 0.70299128],
       [0.98417936, 0.01582064],
       [0.06531299, 0.93468701],
       [0.63140089, 0.36859911]])

In [10]:
clf.score(X_train, y_train)

0.9853775643823658

In [11]:
#Testando com a base de teste
clf.predict(X_test[:10])

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [12]:
clf.predict_proba(X_test[:10])

array([[0.07139284, 0.92860716],
       [0.2756595 , 0.7243405 ],
       [0.9985782 , 0.0014218 ],
       [0.96505403, 0.03494597],
       [0.99705617, 0.00294383],
       [0.97723935, 0.02276065],
       [0.98819088, 0.01180912],
       [0.99728576, 0.00271424],
       [0.96189246, 0.03810754],
       [0.97909825, 0.02090175]])

In [13]:
clf.score(X_test, y_test)

0.9851657940663177

## SVM

In [14]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import svm
from sklearn.cross_validation import train_test_split

In [15]:
data = pd.read_csv("emails.csv")
text = data[data.columns[0]]
spam = data[data.columns[1]]

In [16]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(text)
X_train_counts.shape

(5728, 37303)

In [17]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tf, spam, test_size = 0.20)

In [19]:
clf = svm.SVC(probability=True)
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
clf.predict(X_train[:10])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [21]:
clf.predict_proba(X_train[:10])

array([[0.94765859, 0.05234141],
       [0.00468099, 0.99531901],
       [0.80411793, 0.19588207],
       [0.64678779, 0.35321221],
       [0.95652797, 0.04347203],
       [0.99217161, 0.00782839],
       [0.99120421, 0.00879579],
       [0.9407162 , 0.0592838 ],
       [0.06610603, 0.93389397],
       [0.67158279, 0.32841721]])

In [22]:
clf.score(X_train, y_train)

0.7623308598865124

In [23]:
#Testando com a base de teste
clf.predict(X_test[:10])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [24]:
clf.predict_proba(X_test[:10])

array([[7.08660405e-01, 2.91339595e-01],
       [8.37428803e-01, 1.62571197e-01],
       [9.71788311e-01, 2.82116892e-02],
       [9.98802474e-01, 1.19752633e-03],
       [5.96749972e-02, 9.40325003e-01],
       [1.87958211e-08, 9.99999981e-01],
       [3.44551717e-02, 9.65544828e-01],
       [6.27037154e-01, 3.72962846e-01],
       [9.64487240e-01, 3.55127599e-02],
       [9.92466422e-01, 7.53357758e-03]])

In [25]:
clf.score(X_test, y_test)

0.756544502617801

## Rede Neural - Multi-layer Perceptron

In [26]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.cross_validation import train_test_split

In [27]:
data = pd.read_csv("emails.csv")
text = data[data.columns[0]]
spam = data[data.columns[1]]

In [28]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(text)
X_train_counts.shape

(5728, 37303)

In [29]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tf, spam, test_size = 0.20)

In [31]:
clf = MLPClassifier(solver='adam')
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [32]:
clf.predict(X_train[:10])

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

In [33]:
clf.predict_proba(X_train[:10])

array([[9.99249887e-01, 7.50113499e-04],
       [8.69073125e-04, 9.99130927e-01],
       [1.09533311e-03, 9.98904667e-01],
       [1.40975296e-05, 9.99985902e-01],
       [9.98979023e-01, 1.02097718e-03],
       [9.99989567e-01, 1.04331583e-05],
       [9.80516385e-01, 1.94836147e-02],
       [9.97864624e-01, 2.13537594e-03],
       [9.94720466e-01, 5.27953408e-03],
       [9.99504027e-01, 4.95973058e-04]])

In [34]:
clf.score(X_train, y_train)

1.0

In [35]:
#Testando com a base de teste
clf.predict(X_test[:10])

array([1, 0, 1, 0, 0, 0, 0, 1, 1, 0], dtype=int64)

In [36]:
clf.predict_proba(X_test[:10])

array([[1.43152117e-05, 9.99985685e-01],
       [9.99982692e-01, 1.73076529e-05],
       [6.26062177e-03, 9.93739378e-01],
       [9.99983574e-01, 1.64261430e-05],
       [9.99836504e-01, 1.63495580e-04],
       [9.99929418e-01, 7.05820382e-05],
       [9.99804056e-01, 1.95943816e-04],
       [8.81695954e-04, 9.99118304e-01],
       [5.55792577e-04, 9.99444207e-01],
       [9.86586176e-01, 1.34138239e-02]])

In [37]:
clf.score(X_test, y_test)

0.9965095986038395