# Ejercicio 1

In [2]:
from sklearn.datasets import fetch_20newsgroups
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import seaborn as sns

## Importación de los sets de train y test.
Se eliminan los headers y los footers.
Se verifican los tamaños de los sets y las categorías a las que corresponden los artículos.

In [3]:
# Download (or read from the local cache) both splits of the 20 Newsgroups corpus.
# Headers and footers are stripped from every article; the options are shared by both calls.
fetch_options = dict(shuffle=True, remove=('headers', 'footers'))
twenty_train = fetch_20newsgroups(subset='train', **fetch_options)
twenty_test = fetch_20newsgroups(subset='test', **fetch_options)

# Report the size of each split and the category names.
print(f'Length train: {len(twenty_train.data)}')
print(f'Length test: {len(twenty_test.data)}')

print(twenty_train.target_names)

Length train: 11314
Length test: 7532
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [4]:
# Print the first 5 training articles to inspect their raw format.
for raw_article in twenty_train.data[:5]:
    print('NEW DATA')
    print(raw_article)

NEW DATA
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.
NEW DATA
A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't ans

## Preprocesamiento.
### Tokenizador
Explicación
### Stemmer
Explicación
### Lematización
Explicación
### Stopwords
Explicación
### Filtrado de no alfanuméricos
Explicación

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources needed below: WordNet (lemmatizer), Punkt (tokenizer)
# and the stopword lists. Each call returns a success flag.
download_status = {
    resource: nltk.download(resource)
    for resource in ('wordnet', 'punkt', 'stopwords')
}
# Keep the last download's flag as the cell output, as before.
download_status['stopwords']

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pabli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pabli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pabli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Create the WordNet lemmatizer and the Porter stemmer shared by the
# preprocessing cells for the train and test sets.
lemmatizer, stemmer = WordNetLemmatizer(), PorterStemmer()

In [8]:
# Preprocess every training article:
#   tokenize -> lemmatize -> drop English stopwords -> stem -> keep alphabetic tokens.
# Each article is stored in `filteredArts` as a single space-joined string.

# Load the stopword list ONCE and keep it in a set. Calling stopwords.words()
# for every token re-reads the corpus each time and makes this loop extremely slow.
# NOTE: the comparison is case-sensitive, so capitalised stopwords ("I", "The") are kept.
english_stopwords = set(stopwords.words('english'))
n_train_arts = len(twenty_train.data)

filteredArts = list()
auxPorc = -1  # last percentage printed; -1 forces the first print
for i in range(n_train_arts):
    porc = int((i / n_train_arts) * 100)
    if porc != auxPorc:
        print("\rProcesado: " + str(porc) + "%", end="")
        auxPorc = porc  # remember it so each percentage is printed only once
    tokenized = word_tokenize(twenty_train.data[i])
    lemmatized = [lemmatizer.lemmatize(tok) for tok in tokenized]
    without_stopwords = [lem for lem in lemmatized if lem not in english_stopwords]
    stemmed = [stemmer.stem(w) for w in without_stopwords]
    # Named `alpha_tokens` (not `alpha`) so the Laplace smoothing constant
    # `alpha` used by the training cell is never overwritten.
    alpha_tokens = [st for st in stemmed if st.isalpha()]
    filteredArts.append(" ".join(alpha_tokens))

Procesado: 1%

KeyboardInterrupt: 

In [7]:
# Save the preprocessing result to disk with pickle so the slow preprocessing
# cell does not have to be re-run.
import pickle

# Refuse to overwrite the cache with a partial result, e.g. after the
# preprocessing loop was interrupted.
if len(filteredArts) != len(twenty_train.data):
    raise RuntimeError(
        "filteredArts has " + str(len(filteredArts)) + " articles but the train set has "
        + str(len(twenty_train.data)) + "; preprocessing is incomplete, cache not written."
    )

with open('art_filt.pkl', 'wb') as fp:
    pickle.dump(filteredArts, fp)

In [9]:
# Load the preprocessed training articles saved earlier with pickle.
# The import lives here so the cell also works on a fresh kernel when the
# preprocessing and dump cells are skipped.
import pickle

# NOTE: pickle.load can execute arbitrary code; only load a cache file that
# this notebook produced.
with open('art_filt.pkl', 'rb') as fp:
    articlesList = pickle.load(fp)

In [10]:
# Sanity check on the loaded cache: number of articles, then the first article.
n_loaded_articles = len(articlesList)
first_article = articlesList[0]
print(n_loaded_articles, first_article, sep="\n")

11314
I wonder anyon could enlighten car I saw day It sport car look late earli It call bricklin the door realli small In addit front bumper separ rest bodi thi I know If anyon tellm model name engin spec year product car make histori whatev info funki look car pleas


### Vectorización
Explicación de Vectorización y de Count Vectorizer
Explicación de min_df y max_df

In [46]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Bag-of-words vectorization of the preprocessed training articles.
# min_df / max_df prune the vocabulary by document frequency (very rare and
# very common terms); see the CountVectorizer documentation for exact semantics.
# (A TfidfVectorizer(max_df=0.8, min_df=10) was tried as an alternative and is not used.)
count_vectorizer = CountVectorizer(min_df=5, max_df=0.1)
trainedData = count_vectorizer.fit_transform(articlesList)

# Dense copy of the sparse count matrix; the later cells index it row by row.
trainedDataArray = trainedData.toarray()

# Cell output: (number of articles, vocabulary size).
trainedData.shape

(11314, 12637)

In [142]:
# Write the resulting vocabulary to a text file (one term per line) so it can
# be inspected by eye.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on older installations.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    vocabulary_terms = count_vectorizer.get_feature_names_out()
else:
    vocabulary_terms = count_vectorizer.get_feature_names()

# Explicit UTF-8 so non-ASCII terms do not fail under a platform default encoding.
with open('word_list.txt', 'w', encoding='utf-8') as fp:
    fp.writelines(word + "\n" for word in vocabulary_terms)

### Pandas
Explicación de Pandas y su uso

In [47]:
import pandas as pd
# Laplace (add-alpha) smoothing constant used by the training cell.
alpha = 1

# Column labels are the vocabulary terms. get_feature_names() was removed in
# scikit-learn 1.2; prefer the replacement and fall back on older installations.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    cols = list(count_vectorizer.get_feature_names_out())
else:
    cols = count_vectorizer.get_feature_names()

# One row per training article, one column per term, plus the category code.
df = pd.DataFrame(trainedDataArray, columns = cols)
df["targetCode"] = twenty_train.target
df.head()

Unnamed: 0,aa,aaa,aamir,aaron,ab,abandon,abbey,abbott,abbrevi,abc,...,zoom,zr,zs,zterm,zu,zubov,zv,zy,zz,targetCode
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14


## Entrenamiento del modelo.
### Naive Bayes
Explicación y fórmulas
#### Probabilidades a priori y a posteriori.
#### Likelihoods y LogLikelihoods
### Smoothing Laplaciano

In [48]:
# Fit the multinomial Naive Bayes parameters from the count DataFrame.
#
# NOTE on naming (kept because the evaluation cells use these names):
#   prioriProbs[i] -> vector of smoothed log-likelihoods log P(word | category i)
#   catProbs[i]    -> scalar log-prior log P(category i)
N_cat = len(twenty_train.target_names)  # number of categories, taken from the dataset
N_arts = df.shape[0]
N_words = df.shape[1] - 1  # -1 for the targetCode column
prioriProbs = list()
catProbs = list()

for i in range(N_cat):
    # Compute the category mask once and reuse it for both estimates.
    in_category = df["targetCode"] == i
    # Per-word occurrence counts within the category, plus Laplace smoothing.
    # A vectorised column sum also yields a proper vector for an empty category.
    catOcurrency = df.loc[in_category].drop("targetCode", axis=1).values.sum(axis=0) + alpha
    prioriProbs.append(np.log(catOcurrency / catOcurrency.sum()))
    # Fraction of training articles that belong to the category.
    catProbs.append(np.log(in_category.sum() / N_arts))



## Testeo del modelo.
### Métricas
Explicación de las métricas y de cuál utilizamos

### Prueba con set de Train.

In [49]:
# Classify every training article and count how many predictions match the label.
# Score per category = dot(word counts, prioriProbs[category]) + catProbs[category];
# the category with the highest score wins (the first one on ties).
predictionOk = 0
for doc_idx in range(N_arts):
    doc_counts = trainedDataArray[doc_idx]
    best_cat, best_score = -1, -float('inf')
    for cat in range(N_cat):
        score = np.dot(doc_counts, prioriProbs[cat]) + catProbs[cat]
        if score > best_score:
            best_cat, best_score = cat, score
    predictionOk += int(best_cat == twenty_train.target[doc_idx])
print(f"Accuracy Train: {predictionOk/N_arts}")

Accuracy Train: 0.8989747215838784


### Prueba con set de Test.

In [104]:
# Preprocessing of the test set, with the same pipeline as the train set:
#   tokenize -> lemmatize -> drop English stopwords -> stem -> keep alphabetic tokens.

# Load the stopword list ONCE into a set; calling stopwords.words() for every
# token re-reads the corpus each time and makes the loop extremely slow.
english_stopwords = set(stopwords.words('english'))

filteredTestArts = list()
for i in range(len(twenty_test.data)):
    if (i%100 == 0):
        print(str(i))  # progress: index of the article being processed
    tokenized = word_tokenize(twenty_test.data[i])
    lemmatized = [lemmatizer.lemmatize(tok) for tok in tokenized]
    without_stopwords = [lem for lem in lemmatized if lem not in english_stopwords]
    stemmed = [stemmer.stem(w) for w in without_stopwords]
    # Named `alpha_tokens` (not `alpha`): reusing `alpha` would replace the Laplace
    # smoothing constant with a list and break a re-run of the training cell.
    alpha_tokens = [st for st in stemmed if st.isalpha()]
    filteredTestArts.append(" ".join(alpha_tokens))

0


KeyboardInterrupt: 

In [74]:
# Save the test-set preprocessing result to disk with pickle.
import pickle

# Refuse to overwrite the cache with a partial result, e.g. after the
# preprocessing loop was interrupted.
if len(filteredTestArts) != len(twenty_test.data):
    raise RuntimeError(
        "filteredTestArts has " + str(len(filteredTestArts)) + " articles but the test set has "
        + str(len(twenty_test.data)) + "; preprocessing is incomplete, cache not written."
    )

with open('art_filt_test.pkl', 'wb') as fp:
    pickle.dump(filteredTestArts, fp)


I littl confus model bonnevil I hear LE SE lse sse ssei could someon tell differ far featur perform I also curiou know book valu prefer model and much less book valu usual get In word much demand time year I hear earli summer best time buy


In [50]:
# Load the preprocessed test articles saved earlier with pickle.
# The import lives here so the cell also works on a fresh kernel when the
# preprocessing and dump cells are skipped.
import pickle

# NOTE: pickle.load can execute arbitrary code; only load a cache file that
# this notebook produced.
with open('art_filt_test.pkl','rb') as fp:
    testArticlesList = pickle.load(fp)
# Sanity check: number of articles, then the first article.
print(len(testArticlesList))
print(testArticlesList[0])

7532
I littl confus model bonnevil I hear LE SE lse sse ssei could someon tell differ far featur perform I also curiou know book valu prefer model and much less book valu usual get In word much demand time year I hear earli summer best time buy


In [51]:
# Vectorize the test set with the vectorizer already fitted on the train set
# (transform only, no re-fit), so both matrices share the same columns.
testData = count_vectorizer.transform(testArticlesList)

# Dense copy, indexed row by row in the accuracy cell.
testDataArray = testData.toarray()

test_shape = testData.shape
print(test_shape)

(7532, 12637)


### Obtención del accuracy del modelo.

In [52]:
# Classify every test article and count how many predictions match the label.
# Score per category = dot(word counts, prioriProbs[category]) + catProbs[category];
# the category with the highest score wins (the first one on ties).
predictionOk = 0
for doc_idx in range(len(twenty_test.target)):
    doc_counts = testDataArray[doc_idx]
    best_cat, best_score = -1, -float('inf')
    for cat in range(N_cat):
        score = np.dot(doc_counts, prioriProbs[cat]) + catProbs[cat]
        if score > best_score:
            best_cat, best_score = cat, score
    predictionOk += int(best_cat == twenty_test.target[doc_idx])
print(f"Accuracy Test: {predictionOk/len(twenty_test.target)}")

Accuracy Test: 0.758364312267658
