# Primer modelo automático usando TPOT

## Importamos las librerías

In [2]:
import numpy as np
import pandas as pd
from tpot import TPOTClassifier

from sklearn.model_selection import train_test_split

## Importamos los datos con pandas

In [4]:
# Cargando los datos
datos_titanic = pd.read_csv("titanic_train.csv")
entrenamiento, pruebas = train_test_split(datos_titanic ,test_size=0.3)

In [5]:
entrenamiento.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,623.0,623.0,623.0,501.0,623.0,623.0,623.0
mean,445.125201,0.394864,2.29053,29.686966,0.550562,0.388443,34.416311
std,261.896055,0.489214,0.845276,14.800772,1.128523,0.793864,54.193837
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,208.5,0.0,1.0,20.0,0.0,0.0,7.925
50%,443.0,0.0,3.0,28.0,0.0,0.0,15.5
75%,680.5,1.0,3.0,38.0,1.0,0.0,31.3875
max,891.0,1.0,3.0,80.0,8.0,5.0,512.3292


In [6]:
entrenamiento.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
730,731,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
456,457,0,1,"Millet, Mr. Francis Davis",male,65.0,0,0,13509,26.55,E38,S
359,360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.225,,C
768,769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q


## Hacemos una "limpieza" de nuestro datos antes de hacer el modelo

In [7]:
combine = [entrenamiento, pruebas]

In [8]:
# Convertir valores en texto a números (0 y 1)
sex_mapping = {'male': 0, 'female': 1}
entrenamiento['Sex'] = entrenamiento['Sex'].map(sex_mapping)
pruebas['Sex'] = pruebas['Sex'].map(sex_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [9]:
calculo_edades = np.zeros((2,3))

In [10]:
for dataset in combine:
    for sex in range(0, 2):
        for pclass in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == sex) & (dataset['Pclass'] == pclass+1)]['Age'].dropna()
            age_guess = guess_df.median()
            calculo_edades[sex, pclass] = int(age_guess/0.5 + 0.5) * 0.5
    
    for sex in range(0, 2):
        for pclass in range(0, 3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == sex) &(dataset.Pclass == pclass+1),'Age'] = calculo_edades[sex, pclass]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [11]:
entrenamiento = entrenamiento.drop(['Ticket', 'Cabin', 'Name', 'PassengerId', 'SibSp', 'Parch', 'Embarked'], axis=1)
pruebas = pruebas.drop(['Ticket', 'Cabin', 'Name', 'SibSp', 'Parch', 'Embarked'], axis=1)

X_train = entrenamiento.drop('Survived', axis=1)
Y_train = entrenamiento['Survived']
X_test  = pruebas.drop(["PassengerId","Survived"], axis=1)

## Creamos y entrenamos nuestro modelo

In [12]:
tpot = TPOTClassifier(verbosity=2, max_time_mins=2) #utiliza la clase TPOTClassifier asignándole algunos minutos al proceso

In [13]:
#Entrena el clasificador TPOT con las bases 
tpot.fit(X_train, Y_train)



HBox(children=(IntProgress(value=0, description='Optimization Progress', style=ProgressStyle(description_width…

Generation 1 - Current best internal CV score: 0.8218322580645161
Generation 2 - Current best internal CV score: 0.8298322580645161

2.09 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.45, min_samples_leaf=1, min_samples_split=8, n_estimators=100)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=100,
               max_eval_time_mins=5, max_time_mins=2, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=100,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [None]:
#importa las librerías necesarias para el mejor modelo descubierto

In [None]:
nuevo_modelo = #Genera tu modelo con la información retornada por TPOT

In [None]:
nuevo_modelo.fit(X_train, Y_train)

## Predecimos con nuestro árbol y la tasa de exactitud

In [None]:
Y_pred = nuevo_modelo.predict(X_test)

In [None]:
Y_pred

In [None]:
#Retornar el score