# Algoritmo Multinomial Naive Bayes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
import joblib

## Creación del modelo

Cargar los datos

In [2]:
# Load the training set from the CSV that sits next to the notebook.
datos = pd.read_csv(
    "train_Data.csv",
    sep=",",
    encoding="utf-8",
)
# Preview without the saved-index column and the abstract column.
# The result of drop() is only displayed: `datos` itself keeps every column.
datos.drop(columns=["Unnamed: 0", "medical_abstracts"])

Unnamed: 0,problems_described,words
0,1,ov myxom cas longterm followup cas socalled ov...
1,1,intraperiton yttrium90labeled monoclon antibod...
2,1,chem carcinogenes many rod carcinog admin chem...
3,1,comparison frtl5 cel grow vitro xenotranspl ce...
4,1,invas lobul carcinom mammograph find 10year ex...
...,...,...
5269,5,stress sud dea cas long qt syndrom idiopath lo...
5270,5,acut detery ren funct assocy ent hyperoxalur e...
5271,5,cathet coron artery bypass graft descend aort ...
5272,5,electrocardiogram chronic obstruct pulmon dise...


Separamos los datos en el texto de entrada (columna `words`) y la categoría a predecir (columna `problems_described`)

In [3]:
# Model input is the `words` column; the label to predict is `problems_described`.
X_train, Y_train = datos["words"], datos["problems_described"]

Creamos el modelo

In [4]:
# Two-stage classifier: TF-IDF vectorisation followed by Multinomial Naive Bayes.
# The step names are reused as prefixes in the hyper-parameter grid, so they must not change.
pipeline_steps = [
    ("tf_id_vec", TfidfVectorizer()),
    ("multi_nv", MultinomialNB()),
]
modelo = Pipeline(steps=pipeline_steps)
modelo

Pipeline(steps=[('tf_id_vec', TfidfVectorizer()),
                ('multi_nv', MultinomialNB())])

Creamos el rango para buscar los mejores hiperparámetros de este algoritmo

In [5]:
# Hyper-parameter search space; keys use the "<pipeline step>__<parameter>" convention.
param_grid = {
    # Smoothing strengths 0.05, 0.10, ..., 1.00.
    # The grid deliberately starts above zero: alpha=0 switches smoothing off, so a term
    # never seen for a class gets probability 0 (log-probability -inf) and scikit-learn
    # either warns and clips the value or runs into numerical errors.
    # The upper end includes alpha=1.0, the estimator's default (Laplace smoothing),
    # which the previous half-open range np.arange(0, 1, 0.05) never evaluated.
    "multi_nv__alpha": np.linspace(0.05, 1.0, 20),
    # Learn class priors from the data (True) or assume uniform priors (False).
    "multi_nv__fit_prior": (True, False),
    # Unigrams only, unigrams + bigrams, or unigrams through trigrams.
    "tf_id_vec__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

In [6]:
# Exhaustive search over `param_grid` with 10-fold cross-validation.
# n_jobs=-1 asks scikit-learn to parallelise across all available processors.
grid_selector = GridSearchCV(
    estimator=modelo,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
)

In [None]:
# Run the search over the training data. The result of fit() is bound back to the
# same name, and the assignment keeps the cell from echoing the estimator repr.
grid_selector = grid_selector.fit(X=X_train, y=Y_train)

In [None]:
# Keep the best pipeline found by the grid search and display it.
# NOTE(review): the search was built without an explicit `refit` argument, so this
# presumably is the winning configuration retrained on all of X_train — confirm
# against the installed scikit-learn version.
final_model = grid_selector.best_estimator_
final_model

In [None]:
# Smoke test: classify one medical abstract with the tuned pipeline.
# NOTE(review): the training column `words` appears to contain normalised/stemmed tokens
# (e.g. "ov myxom cas longterm followup"), while this abstract is raw text — confirm
# whether the same preprocessing should be applied before calling predict().
sample_abstract = "A new technique of surgical treatment of chronic duodenal ulcer without laparotomy by videocoelioscopy. We performed truncal posterior right vagotomy with lesser curve anterior gastric myotomy by videocoelioscopy on 10 patients (5 men and 5 women, ranging in age from 19 and 54 years, with a mean age of 32 years). All had a long history of chronic duodenal ulcer with a mean duration of symptoms of 3.8 years. The mean length of the operation was 60 minutes (range: 55 to 110 minutes). There was no morbidity, and all patients were discharged after 5 days. The acid secretion tests under basal conditions and under insulin stimulation preoperatively and 1 month postoperatively showed a mean decrease in the basal output of 79.3% and a mean decrease of 83.04% in the maximal output. The fibroscopic control at the second postoperative month showed a complete healing of the ulcer in nine patients and a residual ulcer scar in one. No patients had any abdominal complaints. Right truncal vagotomy and anterior lesser curve seromyotomy by videocoelioscopy is an efficient and elegant method of treating chronic duodenal ulcer, but it needs thorough experimental practice"
predicted_labels = final_model.predict([sample_abstract])
print(predicted_labels)

## Guardar Modelo

In [None]:
# Persist the tuned pipeline to disk.
# The context manager guarantees the file handle is closed even if joblib.dump raises,
# which the previous manual open()/close() pair did not.
# Reminder: joblib files are pickle-based, so only load this artefact from trusted sources.
with open("multinomial_nb.joblib", "wb") as file:
    joblib.dump(final_model, file)