In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the heart dataset into a DataFrame.
# NOTE(review): the relative '../input/...' path presumably targets a Kaggle-style
# data mount — TODO confirm before running elsewhere.
df = pd.read_csv('../input/heartatack/heart.csv')

In [None]:
df

# Exploración de los datos

In [None]:
df.info()

In [None]:
# Column names as a plain Python list; later cells iterate over it.
columnas = df.columns.tolist()
# Number of columns; as the last expression of the cell it is the value displayed.
len(columnas)

In [None]:
# One boxplot per column to eyeball value ranges and outliers.
# The grid is 7 panels wide and gets as many rows as the column count needs,
# so it does not depend on the frame having exactly 14 columns.
PANELES_POR_FILA = 7
n_filas = max(1, -(-len(columnas) // PANELES_POR_FILA))  # ceiling division
fig, axs = plt.subplots(n_filas, PANELES_POR_FILA,
                        figsize=(30, 6.5 * n_filas), squeeze=False)
ejes = axs.ravel()  # row-major flat view: panel k shows column k
for ax, columna in zip(ejes, columnas):
    ax.boxplot(df[columna])
    ax.set_title(columna)
# Hide panels left over when the column count is not a multiple of the row width.
for ax in ejes[len(columnas):]:
    ax.set_visible(False)

plt.show()

In [None]:
plt.boxplot(df['caa'])

In [None]:
# Distinct values of 'caa' that are greater than 2.5.
df.loc[df['caa'] > 2.5, 'caa'].unique()

In [None]:
# Distinct values of 'trtbps' that are greater than 170.
df.loc[df['trtbps'] > 170, 'trtbps'].unique()

In [None]:
plt.hist(df['trtbps'])

In [None]:
plt.hist(df['thalachh'])

In [None]:
df

In [None]:
# Number of null values in each column, shown as the cell output.
df.isnull().sum()


In [None]:
# Optional experiment (disabled): drop the 'slp', 'caa' and 'thall' columns before modelling.
#df.drop(columns=['slp','caa','thall'],inplace=True)

# Target vector: the last column of the frame.
Y = df.iloc[:,-1]
# Feature matrix: every column except the last one.
X = df.iloc[:,:-1]


In [None]:
X

## Selección de variables significativas para este modelo

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression(max_iter=1000)

In [None]:
# Recursive feature elimination that keeps the 10 highest-ranked predictors.
# n_features_to_select is passed by keyword because current scikit-learn
# releases do not accept it as a positional argument.
rfe = RFE(lr, n_features_to_select=10)

In [None]:
rfe = rfe.fit(X,Y)

In [None]:
rfe.support_

In [None]:
rfe.ranking_

In [None]:
# Pair every RFE rank with its feature name, taking the names from X (the matrix
# RFE was fitted on) so ranks and names are aligned one-to-one, then order the
# pairs from rank 1 (selected) upwards; ties are ordered by feature name.
ranking = sorted(zip(rfe.ranking_, X.columns))



In [None]:
ranking

## Implementación del modelo

In [None]:
from sklearn import preprocessing
# Fit a standardiser on the full feature matrix X.
# NOTE(review): the scaler is fitted before the train/test split performed in a
# later cell, so held-out rows contribute to the scaling statistics; consider
# fitting on the training split only (e.g. inside a Pipeline) — TODO confirm.
scaler = preprocessing.StandardScaler().fit(X)
# Display the fitted scaler as the cell output.
scaler

In [None]:
scaler.scale_

In [None]:
x = scaler.transform(X)
x

In [None]:
y = Y.values

In [None]:
model = LogisticRegression()
model.fit(x,y)

In [None]:
model.score(x,y)

In [None]:
y.mean()

In [None]:
model.coef_

In [None]:
# Table of fitted coefficients, one row per feature.
# model.coef_ is 2-D (a single row for a binary target), so it is flattened to get
# plain floats instead of one-element arrays, and the column labels are given as
# dict keys so pandas builds ordinary columns rather than a MultiIndex.
# NOTE: the values are log-odds coefficients, not probabilities, despite the label.
pd.DataFrame({'Variable': X.columns, 'Probabilidad': model.coef_.ravel()})

In [None]:
# Model validation


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Hold out 30% of the rows for testing; random_state=0 makes the split repeatable.
# NOTE(review): x was standardised using all rows before this split, so the test
# rows are not fully unseen — TODO confirm this is acceptable.
X_train,x_test,Y_train,y_test = train_test_split(x,y,test_size=0.30, random_state=0)

In [None]:
model = LogisticRegression()


In [None]:
model.fit(X_train,Y_train)

In [None]:
prediccion = model.predict(x_test)

In [None]:
confussion_matrix = metrics.confusion_matrix(y_test,prediccion)

In [None]:
confussion_matrix

In [None]:
mc = pd.DataFrame(confussion_matrix)

In [None]:
import seaborn as sns

sns.heatmap(mc,annot=True)

In [None]:
metrics.accuracy_score(y_test,prediccion) # Eficiencia de la prediccion

In [None]:
model.score(x_test,y_test)

## Validación cruzada para el modelo


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import linear_model

In [None]:
# Accuracy of a default LogisticRegression over 8 cross-validation folds of (x, y).
scores = cross_val_score(linear_model.LogisticRegression(),x,y,scoring='accuracy',cv=8)

In [None]:
scores

In [None]:
scores.mean()

## Curva ROC del modelo

In [None]:

# Labelled copy of the confusion matrix, plus a printout of its four cells.
# confussion_matrix comes from metrics.confusion_matrix(y_test, prediccion):
# rows are the actual classes and columns the predicted classes.
etiquetas = ['No_sufre_ataque', 'Sufre_ataque']
confussion_matriz = pd.DataFrame(mc.values, index=etiquetas, columns=etiquetas)
(vn, fp), (fn, vp) = confussion_matrix
print('verdadero negativo:', vn)
print('Falso negativo o error tipo 2:', fn)
print('Verdadero Positivo:', vp)
print('Falso Positivo o error tipo 1:', fp)
confussion_matriz

In [None]:

# Sensitivity and specificity derived from the confusion matrix itself instead of
# hand-typed counts, so they always match the current split and model.
# Rows of confussion_matrix are actual classes, columns are predicted classes.
(vn, fp), (fn, vp) = confussion_matrix
sensibilidad = vp / (vp + fn)  # true-positive rate
especifidad = vn / (vn + fp)   # true-negative rate

In [None]:
sensibilidad

In [None]:
especifidad

In [None]:
# Per-class predicted probabilities for the test rows.
probs = model.predict_proba(x_test)
# Keep the second column; columns follow model.classes_, so for 0/1 labels this
# is the probability of class 1.
prob = probs[:,1] 
#probs  
# The second column is the probability computed by scikit-learn,
# estimated by maximum likelihood.
# These values lie between 0 and 1; predict() uses 0.5 as its default cut-off between predicting 0 and 1.

In [None]:
# Count how many test rows are predicted as 1 or 0 with the default 0.5 threshold.
prediction = model.predict(x_test)
 
pd.crosstab(prediction,columns='counts')

In [None]:
# The probability cut-off can be changed; here it is raised to 0.70.
# NOTE(review): the original note explaining why this is done was left unfinished.
# A cut-off above the default 0.5 can only reduce (or keep) the number of rows predicted as 1.
prediction2 = pd.DataFrame(prob)
prediction2['Predict'] = np.where(prediction2[0]>0.70,1,0)

pd.crosstab(prediction2.Predict,columns='counts')  

In [None]:
print(sensibilidad,especifidad)

In [None]:
# Cross-tabulate actual labels (rows) against predicted labels (columns).
# crosstab's first argument becomes the rows, so y_test goes first to match the
# 'valor_real' row label and prediction goes second to match 'Prediccion'.
pd.crosstab(y_test, prediction, rownames=['valor_real'], colnames=['Prediccion'])

In [None]:
# roc_curve returns (false-positive rates, true-positive rates, thresholds).
# NOTE: this rebinds `sensibilidad` from the scalar computed in an earlier cell to
# the array of true-positive rates; `especifidad_1` holds the false-positive rate
# (1 - specificity) and `ñ` the thresholds.
especifidad_1,sensibilidad, ñ = metrics.roc_curve(y_test,prob) 

In [None]:
sensibilidad

In [None]:
especifidad_1

In [None]:
# Area under the ROC curve, computed from the (FPR, TPR) points.
area_bajo_curva = metrics.auc(especifidad_1, sensibilidad)

fig, ax = plt.subplots(figsize=(8, 5))
# ROC curve: false-positive rate on the x axis, true-positive rate on the y axis.
ax.plot(especifidad_1, sensibilidad, marker='o', linestyle='--', color='r')
# Chance diagonal, drawn from literals so the globals `x` and `y` (scaled features
# and labels used by earlier cells) are not overwritten.
ax.plot([0, 1], [0, 1])
# Shade the whole area under the curve.
ax.fill_between(especifidad_1, sensibilidad, color='grey', alpha=0.7)
ax.text(0.3, 0.2, f'AreaBajoCurva: {area_bajo_curva}')

ax.set_title('Curva ROC')
# especifidad_1 is the false-positive rate (1 - specificity); sensibilidad is the
# true-positive rate, so the axes are labelled accordingly.
ax.set_xlabel('1 - Especificidad')
ax.set_ylabel('Sensibilidad')
plt.show()

In [None]:

area_bajo_curva = metrics.auc(especifidad_1,sensibilidad)
area_bajo_curva

In [None]:
model.predict(x_test)