### Ejemplos sesión Text Analytics Ib
#### Clasificación supervisada de textos

In [46]:
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

### Definimos una función de tokenización customizada

In [2]:
def tokenize(sentence):
    tokens = nltk.word_tokenize(sentence)
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens2 = [token 
               for token in tokens  if token not in stop_words]
    stems = [ps.stem(token) for token in tokens2 if token.isalnum()]
    return stems

# Clasificacion

#### Utilizando las técnicas de vectorización "clásicas" vistas en la sesión, vamos a construir un sencillo clasificador binario para determinar si un correo es spam o no. Utilizaremos el dataset spam/no spam de UCI Machine Learning Directory (https://archive.ics.uci.edu/ml/index.php) 

In [3]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix 

### Cargamos los datos

In [4]:
url = urlopen("https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip")
zipfile = ZipFile(BytesIO(url.read()))
spam = [line.decode('utf-8') for line in zipfile.open('SMSSpamCollection').readlines()]
for line in spam[:5]: 
    print(line)


ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

ham	Ok lar... Joking wif u oni...

spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

ham	U dun say so early hor... U c already then say...

ham	Nah I don't think he goes to usf, he lives around here though



In [5]:
# Cargamos los datos en un pandas
spam_df = pd.DataFrame([line.split('\t') for line in spam], columns=['Y','X']) 
pd.Categorical(spam_df.Y).describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4827,0.865985
spam,747,0.134015


# Hacemos un split en train y test

In [6]:
spam_train, spam_test = train_test_split(spam_df)

In [7]:
# Tokenizador básico
def tokenize_classification(sentence):
    tokens = nltk.word_tokenize(sentence)
    return tokens

### Obtenemos la representación numérica de los sets de train y test

In [11]:
spam_vect = CountVectorizer(tokenizer=tokenize, token_pattern=None) 
spam_vect.fit(spam_train.X.values)
X_train_counts = spam_vect.transform(spam_train.X.values)
X_test_counts = spam_vect.transform(spam_test.X.values)

### Ajustamos un clasificador Naive Bayes

In [12]:
clf = MultinomialNB().fit(X_train_counts, spam_train.Y)

In [18]:
### Evaluamos el resultado con los datos de test

In [13]:
Y_test_predicted = clf.predict(X_test_counts)
pd.DataFrame(confusion_matrix(spam_test.Y, Y_test_predicted))

Unnamed: 0,0,1
0,1198,5
1,9,182


### Su turno. Trate de mejorar el resultado mediante :
#### 1) Preprocesado del texto y/o obtención de nuevas features
#### 2) Uso de otros modelos de vectorización (pe TF/IDF) 
#### 3) Usando otros modelos de clasificación (pe SVM , Logistic Regression, RandomForest,...)

In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix 

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
def tokenize(sentence):
    tokens = nltk.word_tokenize(sentence)
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens2 = [token 
               for token in tokens  if token not in stop_words]
    stems = [ps.stem(token) for token in tokens2 if token.isalnum()]
    return stems

In [None]:
url = urlopen("https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip")
zipfile = ZipFile(BytesIO(url.read()))
spam = [line.decode('utf-8') for line in zipfile.open('SMSSpamCollection').readlines()]
for line in spam[:5]: 
    print(line)

In [51]:
def preprocess_text(sentence):
    tokens = nltk.word_tokenize(sentence.lower())  # Convertir a minúsculas
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens2 = [token for token in tokens if token.isalnum() and token not in stop_words]  # Eliminar puntuación y stopwords
    stems = [ps.stem(token) for token in tokens2]  # Aplicar stemming
    return ' '.join(stems)

In [52]:
url = urlopen("https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip")
zipfile = ZipFile(BytesIO(url.read()))
spam = [line.decode('utf-8') for line in zipfile.open('SMSSpamCollection').readlines()]

In [53]:
spam_df['X_processed'] = spam_df['X'].apply(preprocess_text)

In [78]:
spam_train, spam_test = train_test_split(spam_df, test_size=0.1, random_state=45)

In [79]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(spam_train['X_processed'])
X_test = vectorizer.transform(spam_test['X_processed'])

In [80]:
logreg = LogisticRegression()
logreg.fit(X_train, spam_train['Y'])
logreg_predictions = logreg.predict(X_test)
logreg_confusion_matrix = confusion_matrix(spam_test['Y'], logreg_predictions)

In [81]:
print("\nRandom Forest Confusion Matrix:")
print(rf_confusion_matrix)


Random Forest Confusion Matrix:
[[482   0]
 [ 10  66]]
