# Primer modelo usando SKLearn

## Importamos las librerías

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## Importamos los datos con pandas

In [2]:
# Cargando los datos
datos_titanic = pd.read_csv("titanic_train.csv")

In [3]:
# Cargando los datos
entrenamiento, pruebas = train_test_split(datos_titanic, test_size=0.3)

In [4]:
entrenamiento.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,623.0,623.0,623.0,501.0,623.0,623.0,623.0
mean,444.428571,0.378812,2.292135,29.508483,0.544141,0.4061,33.64826
std,254.020287,0.485481,0.849467,14.340673,1.120215,0.814121,50.945778
min,2.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,222.5,0.0,1.0,21.0,0.0,0.0,7.925
50%,446.0,0.0,3.0,28.0,0.0,0.0,15.2458
75%,657.5,1.0,3.0,39.0,1.0,1.0,31.3875
max,890.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
entrenamiento.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
497,498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S
53,54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkin...",female,29.0,1,0,2926,26.0,,S
506,507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33.0,0,2,26360,26.0,,S
486,487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35.0,1,0,19943,90.0,C93,S
596,597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33.0,,S


## Hacemos una "limpieza" de nuestro datos antes de hacer el modelo

In [6]:
combine = [entrenamiento, pruebas]

In [7]:
# Convertir valores en texto 'male' y 'female' a valores numéricos (0 y 1)
sex_mapping = {'male': 0, 'female': 1}
entrenamiento['Sex'] = entrenamiento["Sex"].map(sex_mapping)
pruebas['Sex'] = pruebas["Sex"].map(sex_mapping) #Mapear (con map) la columna Sex para la base de pruebas.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [8]:
calculo_edades = np.zeros((2,3))

In [9]:
for dataset in combine:
    for sex in range(0, 2):
        for pclass in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == sex) & (dataset['Pclass'] == pclass+1)]['Age'].dropna()
            age_guess = guess_df.median()
            calculo_edades[sex, pclass] = int(age_guess/0.5 + 0.5) * 0.5
    
    for sex in range(0, 2):
        for pclass in range(0, 3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == sex) &(dataset.Pclass == pclass+1),'Age'] = calculo_edades[sex, pclass]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [11]:
entrenamiento = entrenamiento.drop(['Ticket', 'Cabin', 'Name', 'PassengerId', 'SibSp', 'Parch', 'Embarked'], axis=1)
pruebas = pruebas.drop(['Ticket', 'Cabin',"PassengerId", 'Name', 'SibSp', 'Parch', 'Embarked'], axis=1)

Y_train = entrenamiento['Survived']
X_train = entrenamiento.drop('Survived', axis=1)
X_test  = pruebas.drop("Survived", axis=1)

## Creamos y entrenamos nuestro modelo

In [12]:
decision_tree = DecisionTreeClassifier()


In [13]:
#Entrenar el árbol con los datos de entrenamiento (x, y)
decision_tree.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

## Predecimos con nuestro árbol y la tasa de exactitud

In [14]:
Y_pred = decision_tree.predict(X_test)

In [15]:
decision_tree.score(X_train, Y_train)

0.9839486356340289