In [None]:
%%bash
# Crear carpeta Mllib hadoop

hdfs dfs -mkdir /FuentesMllib

In [None]:
%%bash
#Subir archivos de spam.txt y normal.txt a la carpeta FuentesMllib en hadoop

hdfs dfs -copyFromLocal spam.txt /FuentesMllib
hdfs dfs -copyFromLocal normal.txt /FuentesMllib

In [54]:
sc.stop()

#### Iniciamos Spark

In [55]:
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext
sc =SparkContext()

In [56]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import SVMWithSGD

In [57]:
# Leer archivos
spam = sc.textFile("hdfs://localhost:9000/FuentesMllib/spam.txt")
normal = sc.textFile("hdfs://localhost:9000/FuentesMllib/normal.txt")

In [58]:
# Se crea una instancia HashingTF para asignar un numero hasta 20000 palabras diferentes

tf = HashingTF(numFeatures = 10000)

In [59]:
# Se divide los archivo por palabras y a cada palabra se le aplica la transformacion a numero

spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
normalFeatures = normal.map(lambda email: tf.transform(email.split(" ")))

In [60]:
# En este caso se crean conjuntos de datos con una estructura LabeledPoint
# Se asigna 1 a spam y 0 a normal

positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
negativeExamples = normalFeatures.map(lambda features: LabeledPoint(0, features))

### Las etiquetas son:
### SPAM = 1
### NORMAL = 0

In [61]:
# Se hace la union de los LabeledPoint
trainingData = positiveExamples.union(negativeExamples)

In [62]:
trainingData.cache()

UnionRDD[6] at union at NativeMethodAccessorImpl.java:0

In [63]:
#Ejecuta algoritmo SVM (Support Vector Machine) con SGD() para hacer el entenamiento 

model = SVMWithSGD.train(trainingData)

### Probamos para un ejemplo positivio de spam y luego uno negativo

In [64]:
#Aplicamos la misma trasformacion HashingTF para obtener los Vectores y luego aplicamos el modelo

posTest = tf.transform("Un servicio personal de compras que provee legalmente drogas sin prescripcin mdica desde Canad y el resto del mundo. Haz tu pedido de Valium (Diazepam) y te garantizamos que lo tendrs en tu casa en 7 DAS.".split(" "))

print("Prediction for positive test example: %g" % model.predict(posTest))

Prediction for positive test example: 1


In [65]:
# Test de un correo normal
negTest = tf.transform("Un servicio personal de compras que provee legalmente drogas sin prescripcin mdica desde Canad y el resto del mundo. Haz tu pedido de Valium (Diazepam) y te garantizamos que lo tendrs en tu casa en 7 DAS.".split(" "))

print ("Prediction for negative test examples: %g" % model.predict(negTest))


Prediction for negative test examples: 1


In [53]:
print(model)

(weights=[-0.9968495872287483,-0.011182099836551482,0.0,0.0,0.010630072716448015,0.0,-0.005130488602606819,0.0,0.0,0.0,-0.002449802580825011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.022364199673102964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.05221832572049423,0.0,0.0,0.0028298749993622735,0.0,0.0,-0.0009451490318235994,-0.01478501129113368,0.0,0.0,0.0,-0.012286154076758385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022364199673102964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.017693566892790134,-0.0229162267932064,0.011182099836551482,0.0,0.011182099836551482,0.0,0.0,-0.0229162267932064,-0.009952764372736787,0.0,0.0,-0.008296553230982443,0.011182099836551482,0.0,0.0,0.0,0.0,0.0,-0.011182099836551482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.012127248868375072,0.0,0.0,0.0,0.0,0.0,0.0,-0.0029055099674980206,0.0,-0.009952764372736787,0.0,0.019711151353397304,0.019762969280968976,0.0,0.0,0.011182099836551482,0.0,0.0,0.0,0.0,-0.033546299509654445,-0.001548994079498735,0.0,0.0,0.0,0.022364199673102964,0.0,0.0,0.0,0.0,-