In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score # métrica de evaluación
from sklearn.metrics import classification_report
from sklearn import metrics

In [None]:
# Remote CSV with the NASA asteroid dataset used throughout the notebook.
URL_ASTEROIDES = 'https://raw.githubusercontent.com/pokengineer/DataScience/main/datasets/asteroids_nasa.csv'

# Load the raw data and preview the first rows (head() defaults to 5 rows).
df_nasa = pd.read_csv(URL_ASTEROIDES)
df_nasa.head()

Salteo el análisis exploratorio, ya que es un dataset que ya conocemos.
# Preprocesamiento de datos

In [None]:
# Drop 'Orbiting Body' and 'Equinox': per the original author's note every row
# holds the same value (Earth and J2000), so they carry no information.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps the cell idempotent: re-running it no longer raises KeyError once the
# columns are already gone.
df_nasa = df_nasa.drop(columns=['Orbiting Body', 'Equinox'], errors='ignore')

In [None]:
def preparacion_de_datos(df_input, target, escalar_valores=False):
  """Split a DataFrame into a feature matrix and a target vector.

  The date columns 'Close Approach Date' and 'Orbit Determination Date' are
  always removed from the features. The input frame is not modified.

  Args:
    df_input: source DataFrame; must contain the two date columns above and
      the `target` column.
    target: name of the column to predict.
    escalar_valores: when True, standardize every feature column with
      StandardScaler (zero mean, unit variance).

  Returns:
    (x, y): `x` is a DataFrame with the feature columns (scaled if requested),
    `y` is the `target` column as a Series. Both keep the index of `df_input`.
  """
  df = df_input.drop(['Close Approach Date','Orbit Determination Date'],axis=1)
  x = df.drop(target,axis=1)
  y = df[target]
  if escalar_valores:
    # NOTE: the scaler is fit on every row passed in. Fit it on the training
    # split only if information leakage into the test split is a concern.
    scaler_X = StandardScaler(with_mean=True, with_std=True)
    # Rebuild the frame with the original columns AND index so that x stays
    # row-aligned with y even when df_input does not have a default RangeIndex.
    x = pd.DataFrame(scaler_X.fit_transform(x), columns=x.columns, index=x.index)
  return x,y

In [None]:
# Build the feature matrix and the 'Hazardous' target, leaving features unscaled.
X_nasa, y_nasa = preparacion_de_datos(
    df_input=df_nasa,
    target="Hazardous",
    escalar_valores=False,
)

In [None]:
# 70/30 train/test split, stratified on the target so both partitions keep the
# same class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_nasa,
    y_nasa,
    test_size=0.3,
    random_state=0,
    stratify=y_nasa,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Instantiate a logistic regression with default hyperparameters and fit it
# on the training split.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report

# Predict the test split with the fitted logistic regression.
y_pred_lg = logreg.predict(X_test)

# Model accuracy, reported as a percentage.
exactitud_lg = accuracy_score(y_test, y_pred_lg) * 100
print('Exactitud (accuracy) del modelo: {:.2f} %'.format(exactitud_lg))
print("-" * 100)

# Per-class precision / recall / F1 report.
reporte_lg = classification_report(y_test, y_pred_lg)
print(reporte_lg)

# Comparamos por curva ROC los modelos

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Decision tree, depth-limited and seeded for reproducibility.
treeclf = DecisionTreeClassifier(max_depth=10, random_state=1)
treeclf.fit(X_train, y_train)
y_pred_tc = treeclf.predict(X_test)

# Multinomial naive Bayes.
# NOTE: MultinomialNB requires non-negative features; it will raise if the
# features are standardized (scaled values can be negative).
bayes_multi = MultinomialNB()
bayes_multi.fit(X_train, y_train)
y_pred_nb = bayes_multi.predict(X_test)

# Gaussian naive Bayes.
bayes_gauss = GaussianNB()
bayes_gauss.fit(X_train, y_train)
y_pred_gauss = bayes_gauss.predict(X_test)

# K-nearest neighbours. The previous version built a second
# DecisionTreeClassifier here and predicted with `treeclf`, so the "knn"
# results were just a copy of the decision-tree results.
# NOTE: KNN is distance-based, so it is sensitive to feature scale.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [None]:
def graficarCurvaRoc( y_pred, etiqueta=None ):
  """Plot one ROC curve on the current matplotlib axes and return its AUC.

  Args:
    y_pred: predictions or scores for the rows of the global `y_test`.
      NOTE: hard class predictions yield a single-threshold (three-point)
      curve; pass positive-class scores (e.g. predict_proba(...)[:, 1]) to get
      a full curve.
    etiqueta: optional model name. When given, the curve gets a legend entry
      of the form "<etiqueta>, AUC=<value>"; when None the curve is drawn
      without a legend entry.

  Returns:
    The area under the ROC curve as a float.
  """
  fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred)
  auc = metrics.roc_auc_score(y_test, y_pred)
  leyenda = None if etiqueta is None else "{}, AUC={:.3f}".format(etiqueta, auc)
  plt.plot(fpr, tpr, label=leyenda)
  return auc

# Set up the figure and axis labels.
plt.figure(figsize=(20, 10))
plt.xlabel('% Not Hazardous', fontsize=14)
plt.ylabel('% Hazardous', fontsize=14)

# Chance line: the full diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], label="AZAR, AUC=0.5", color="black")

# One labelled curve per model so the legend identifies each line and its AUC.
predicciones_por_modelo = {
    "MultinomialNB": y_pred_nb,
    "GaussianNB": y_pred_gauss,
    "DecisionTree": y_pred_tc,
    "LogisticRegression": y_pred_lg,
    "KNN": y_pred_knn,
}
for nombre_modelo, pred in predicciones_por_modelo.items():
    graficarCurvaRoc( pred, nombre_modelo )

# Draw the legend once, after every curve has been added.
plt.legend(loc=4, fontsize=12)

# Title and tick label size.
plt.title("Curva ROC", fontsize=14)
plt.tick_params(labelsize=12);
plt.show()

# Mejoramos el modelo elegido con GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# The `____` placeholders raised NameError on Run All. They are filled in here
# with a decision tree as an example; swap the estimator and grid for whichever
# model was selected from the ROC comparison.
parameters = {
    "max_depth": [3, 5, 10, None],
    "min_samples_leaf": [1, 5, 20],
    "criterion": ["gini", "entropy"],
}

# Score candidates by ROC AUC, in line with the ROC-based comparison in this
# notebook, using 5-fold cross-validation on the training split only.
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=1),
    parameters,
    scoring="roc_auc",
    cv=5,
)
clf.fit(X_train, y_train)

# Show the winning hyperparameter combination as the cell output.
clf.best_params_

# Ejercicio
- Finalizar la función de preprocesamiento de datos para escalar los parámetros. Probar RegLog (regresión logística) con el dataset escalado.
- Completar la sección del código dedicada a GridSearchCV y comparar el mejor modelo con los otros modelos.