# Previsión de personas con diabetes

In [9]:
import warnings
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_fscore_support as score, precision_score, recall_score, f1_score

[Pima Indians Dataset - Kaggle](https://www.kaggle.com/kumargh/pimaindiansdiabetescsv)

Este conjunto de datos describe los registros médicos de los indios Pima y si cada paciente tendrá o no un inicio de diabetes dentro de los cinco años.

La descripción de los campos sigue:
- preg = Número de veces embarazada
- plas = Concentración de glucosa en plasma a 2 horas en una prueba de tolerancia a la glucosa oral
- pres = presión arterial diastólica (mm Hg)
- piel = grosor del pliegue cutáneo del tríceps (mm)
- prueba = insulina sérica de 2 horas (mu U / ml)
- masa = índice de masa corporal (peso en kg / (altura en m) ^ 2)
- pedi = función del pedigrí de la diabetes
- age = Edad (años)
- class = Variable de clase (1: dio positivo en la prueba de diabetes, 0: dio negativo en la prueba de diabetes)

In [11]:
warnings.filterwarnings('ignore')

url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
dataframe = pandas.read_csv(url)
dat = dataframe.values

In [40]:
df= pd.DataFrame(dat, columns = ['preg','plas','pres','piel','prueba','masa','pedi','age','class'])
df.head()

Unnamed: 0,preg,plas,pres,piel,prueba,masa,pedi,age,class
0,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
1,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
2,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
3,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0
4,5.0,116.0,74.0,0.0,0.0,25.6,0.201,30.0,0.0


In [32]:
df.shape

(767, 9)

In [12]:
X = dat[:,:-1]
y = dat[:,-1]

In [None]:
test_size = 0.33
seed = 7

In [4]:
model = LogisticRegression()
#split data
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=test_size, random_state=seed)
model.fit(X_train, y_train)
pred = model.predict(X_test)

In [49]:
y_test[1]

1.0

In [61]:
persona1 = pd.DataFrame(X_test[1])
persona1.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,179.0,90.0,27.0,0.0,44.1,0.686,23.0


In [62]:
model.predict(persona1.transpose())

array([1.])

In [52]:
persona1 = df.iloc[764:765,:-1]
real = df.iloc[764:765,:]
print(real)
model.predict(persona1)

     preg   plas  pres  piel  prueba  masa   pedi   age  class
764   5.0  121.0  72.0  23.0   112.0  26.2  0.245  30.0    0.0


array([0.])

In [64]:
precision = precision_score(y_test, pred)
print('Precision: %.2f%%' % (precision*100.0))
# recall: tp / (tp + fn)
recall = recall_score(y_test, pred)
print('Recall: %.2f%%' % (recall*100.0))
# f1: tp / (tp + fp + fn)
f1 = f1_score(y_test, pred)
print('F1 score: %.2f%%' % (f1*100.0))

Precision: 70.15%
Recall: 55.29%
F1 score: 61.84%
