In [244]:
import pandas as pd
import spacy
import spacy.cli

In [245]:
spacy.cli.download('es_core_news_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')


In [246]:
nlp=spacy.load('es_core_news_sm')

In [247]:
strSpamDb = './es_spam.csv'
spamDb = pd.read_csv(strSpamDb, sep = ',', names = ['label','message'])
spamDb = spamDb.iloc[1:]
spamDb.head()

Unnamed: 0,label,message
1,ham,"Ir hasta el punto de jurong, loco .. Disponibl..."
2,ham,lar bien ... Bromas WIF u oni ...
3,spam,Entrada libre en una imagen de obsequio 2 wkly...
4,ham,T Dun decir hor tan temprano ... t r ya contin...
5,ham,"Nah no creo que vaya a USF, que vive por aquí,..."


In [248]:
corpus = spamDb.message

In [249]:
spamDb.label.value_counts()/len(spamDb)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [250]:
ham = spamDb[spamDb['label']=='ham']
spam = spamDb[spamDb['label']=='spam']
ham.shape, spam.shape

((4825, 2), (747, 2))

In [251]:
ham = ham.sample(2*spam.shape[0])
ham.shape, spam.shape

((1494, 2), (747, 2))

In [252]:
dataset = ham.append(spam, ignore_index= True)
dataset.shape

(2241, 2)

# Normalizar Texto

In [253]:
corpus = dataset.message
corpus[0:10]

0              Hey ahora estoy libre me puedes llamar.
1    Entender. su pérdida es mi aumento :) así que ...
2    Wen ur bcums adorables wid u enojado, DNT toma...
3                            Viendo ajith ah película?
4    Nuevo coche y la casa de mis parents.:)i sólo ...
5                                   Hmm pensar Lor ...
6           Que le gustaría que no lo haría? ¡Imbécil!
7                     K:) que k.are en la universidad?
8                  La India tiene que tomar ventaja :)
9    ¡Ten una buena tarde! Hablaré contigo más s tarde
Name: message, dtype: object

In [254]:
import re
def Normalizacion(corpus):
    newCorpus = []
    for doc in corpus:
        newCorpus.append(re.sub(r'[^a-zA-Z0-9\s{1}áéíóúüÁÉÍÓÚñ]', '', doc).lower().strip().rstrip('\n').rstrip('\r\n'))
    return newCorpus

In [255]:
corpusNorm = Normalizacion(corpus)
corpusNorm[0:10]

['hey ahora estoy libre me puedes llamar',
 'entender su pérdida es mi aumento  así que trabaja usted colegio',
 'wen ur bcums adorables wid u enojado dnt toman en serio  coz estar enojado es d manera más infantil n verdadera de mostrar afecto profundo cuidado n luv  kettoda manda  tienes buen día da',
 'viendo ajith ah película',
 'nuevo coche y la casa de mis parentsi sólo tienen nuevo trabajo en la mano',
 'hmm pensar lor',
 'que le gustaría que no lo haría imbécil',
 'k que kare en la universidad',
 'la india tiene que tomar ventaja',
 'ten una buena tarde hablaré contigo más s tarde']

# Tokenizacion

In [256]:
def tokenizacion(corpus):
    documents = []
    for doc in corpus:
        documents.append(nlp(doc))
    return documents

In [257]:
corpusToken = tokenizacion(corpusNorm)
corpusToken[0:10]

[hey ahora estoy libre me puedes llamar,
 entender su pérdida es mi aumento  así que trabaja usted colegio,
 wen ur bcums adorables wid u enojado dnt toman en serio  coz estar enojado es d manera más infantil n verdadera de mostrar afecto profundo cuidado n luv  kettoda manda  tienes buen día da,
 viendo ajith ah película,
 nuevo coche y la casa de mis parentsi sólo tienen nuevo trabajo en la mano,
 hmm pensar lor,
 que le gustaría que no lo haría imbécil,
 k que kare en la universidad,
 la india tiene que tomar ventaja,
 ten una buena tarde hablaré contigo más s tarde]

# Remove Stop Words

In [258]:
def removerSW(corpus):
    documents = []
    for doc in corpus:
        s = ""
        for token in doc:
            if ( token.is_stop == False):
                s = s + token.text + " "
        documents.append(s.strip())
    return documents

In [259]:
corpusSinSW = removerSW(corpusToken)
corpusSinSW[0:10]

['hey libre puedes llamar',
 'entender pérdida aumento   colegio',
 'wen ur bcums adorables wid u enojado dnt toman serio   coz enojado d infantil n mostrar afecto profundo cuidado n luv   kettoda manda   tienes',
 'viendo ajith ah película',
 'coche y casa parentsi mano',
 'hmm pensar lor',
 'gustaría haría imbécil',
 'k kare universidad',
 'india tomar ventaja',
 'ten hablaré s']

# Stemmin y Lemmatizacion

In [260]:
def stemmingLem(corpus):
    documents = tokenizacion(corpus)
    newDocs = []
    for doc in documents:
        s = ""
        for token in doc:
            s = s + token.lemma_ + " "
        newDocs.append(s.strip())
    return newDocs

In [261]:
corpusStemm = stemmingLem(corpusSinSW)
corpusStemm[0:10]

['hey librar poder llamar',
 'entender pérdida aumentar    colegiar',
 'wen ur bcums adorable wid u enojar dnt tomar seriar    coz enojar d infantil n mostrar afectar profundar cuidar n luv    kettoda mandar    tener',
 'ver ajith ah película',
 'coche y casar parentsi manir',
 'hmm pensar lor',
 'gustar hacer imbécil',
 'k kare universidad',
 'indio tomar ventaja',
 'tener hablar segundo']

In [262]:
def listToStr(s):
    strX = ""
    for palabra in s:
        strX = strX + palabra + " "
    return strX

In [263]:
strCorpus = listToStr(corpusStemm)
strCorpus = strCorpus.strip().split(' ')

In [264]:
setCorpus = set(strCorpus)

In [265]:
corpusCols = list(setCorpus)
corpusRows = range(0, len(corpusStemm))

In [266]:
def generateEmptyTF(cols,rows):
    outDf = pd.DataFrame(index=rows, columns= cols)
    outDf = outDf.fillna(0)
    return outDf

In [267]:
tfCorpus = generateEmptyTF(corpusCols, corpusRows)
tfCorpus

Unnamed: 0,Unnamed: 1,vikky,revisión,azúcar,vodafone,09063440451,venir,restaurante,wwwringtonekingcouk,box95qu,...,sms,lil,energy,sar,wed,deeraj,razonable,bus822656166382,7cfca1a,ngage
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2239,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [268]:
def calculoTF(corpus, df):
    corpus = tokenizacion(corpus)
    for index, doc in enumerate(corpus):
        bagOWLen = len(doc)
        for word in doc:
            try:
                colindex = list(df.columns).index(word.text)
                df.iloc[index, colindex] = df.iloc[index, colindex] + 1
            except:
                pass
        df.iloc[index, :] =  df.iloc[index, :]/bagOWLen
    return df

In [269]:
tfMatrix = calculoTF(corpusStemm, tfCorpus)
tfMatrix

Unnamed: 0,Unnamed: 1,vikky,revisión,azúcar,vodafone,09063440451,venir,restaurante,wwwringtonekingcouk,box95qu,...,sms,lil,energy,sar,wed,deeraj,razonable,bus822656166382,7cfca1a,ngage
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [270]:
import numpy as np
def calculoIdf(df):
    N = df.shape[0]
    valX = (N/df.astype(bool).sum(axis=0))
    idfValues = pd.Series(np.log(valX))
    return idfValues

In [271]:
corpusIdf = calculoIdf(tfMatrix)
corpusIdf

                   7.021530
vikky              6.328383
revisión           5.768767
azúcar             6.616065
vodafone           5.635236
                     ...   
deeraj             6.616065
razonable          6.616065
bus822656166382    6.616065
7cfca1a            6.616065
ngage              6.616065
Length: 5261, dtype: float64

In [272]:
tfIdfCoprus = tfMatrix.mul(corpusIdf, axis = 1)

In [273]:
tfIdfCoprus

Unnamed: 0,Unnamed: 1,vikky,revisión,azúcar,vodafone,09063440451,venir,restaurante,wwwringtonekingcouk,box95qu,...,sms,lil,energy,sar,wed,deeraj,razonable,bus822656166382,7cfca1a,ngage
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [274]:
tfIdfCoprus = tfIdfCoprus.fillna(0)

In [275]:
tfIdfCoprus

Unnamed: 0,Unnamed: 1,vikky,revisión,azúcar,vodafone,09063440451,venir,restaurante,wwwringtonekingcouk,box95qu,...,sms,lil,energy,sar,wed,deeraj,razonable,bus822656166382,7cfca1a,ngage
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [276]:
tfIdfCoprus = tfIdfCoprus.replace([np.inf, -np.inf], 0)

In [277]:
tfIdfCoprus

Unnamed: 0,Unnamed: 1,vikky,revisión,azúcar,vodafone,09063440451,venir,restaurante,wwwringtonekingcouk,box95qu,...,sms,lil,energy,sar,wed,deeraj,razonable,bus822656166382,7cfca1a,ngage
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [278]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(tfIdfCoprus, dataset['label'], 
                                                   test_size=0.3, random_state = 0, shuffle= True)

In [279]:
X_train.shape

(1568, 5261)

In [280]:
X_test.shape

(673, 5261)

In [281]:
def encode(x):
    if ( x == "ham"):
        return 0
    else:
        return 1


In [282]:
y_train = list(map(encode, y_train))

In [283]:
y_train[0:10]

[0, 0, 0, 1, 1, 1, 1, 0, 0, 0]

In [284]:
y_test = list(map(encode, y_test))

In [314]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=30, random_state=0)
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=30, random_state=0)

In [315]:
y_preds_rf = rfc.predict(X_test)
y_preds_rf

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

# Funcion para predecir

In [317]:
def predecir(string):
    c = [ string ]
    c = stemmingLem(removerSW(tokenizacion(Normalizacion(c))))
    tfPredict = generateEmptyTF(corpusCols, [0])

    tfPredict = calculoTF(c, tfPredict)
    predictIdf = calculoIdf(tfPredict)
    tfidfpredic = tfPredict.mul(predictIdf, axis = 1)
    tfidfpredic = tfidfpredic.fillna(0)
    y_pred = rfc.predict(tfidfpredic)
    if y_pred[0] == 1 :
        print('Es Spam')
    else:
        print('Es Ham')

In [318]:
print(predecir('Esto es para avisarte que todo va a estar bien'))

Es Ham
None
