In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# `plt` is the conventional alias for the pyplot interface; aliasing the top-level
# matplotlib package instead would make calls such as plt.subplots() / plt.show() fail.
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder


# Adquirir los datos

In [None]:
# Load the Titanic train and test CSV files from the Kaggle input directory.
# These two frames are kept as loaded; later cells derive df_train / df_test from them.
df_train_ = pd.read_csv('/kaggle/input/titanic-machine-learning-from-disaster/train.csv')
df_test_ = pd.read_csv('/kaggle/input/titanic-machine-learning-from-disaster/test.csv')

In [None]:
# Preview the first rows of the raw training data.
df_train_.head()

**Data Dictionary**

* Variable	Definition	Key
* survival	Survival	0 = No, 1 = Yes
* pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
* sex	Sex	
* Age	Age in years	
* sibsp	# of siblings / spouses aboard the Titanic	
* parch	# of parents / children aboard the Titanic	
* ticket	Ticket number	
* fare	Passenger fare	
* cabin	Cabin number	
* embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

Variable Notes

* pclass: A proxy for socio-economic status (SES)
    * 1st = Upper
    * 2nd = Middle
    * 3rd = Lower
* age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
* sibsp: The dataset defines family relations in this way...
    * Sibling = brother, sister, stepbrother, stepsister
    * Spouse = husband, wife (mistresses and fiancés were ignored)
* parch: The dataset defines family relations in this way...
    * Parent = mother, father
    * Child = daughter, son, stepdaughter, stepson
    * Some children travelled only with a nanny, therefore parch=0 for them.

# Análisis exploratorio

In [None]:
# Summary of the raw training data: column dtypes and non-null counts.
df_train_.info()

In [None]:
# Are there duplicated rows?
# The check runs on the raw frame df_train_: df_train is only created later, in the
# feature-selection cell, so using it here raises NameError on a fresh Restart & Run All.

df_train_.duplicated().sum()

In [None]:
# Number of unique values per column. It hints at which columns can work as features.

df_train_.nunique()

In [None]:
# Text (object dtype) columns of the raw training data.
columnas_categoricas = df_train_.select_dtypes(include=['object']).columns

# Keep only the low-cardinality ones (at most 10 distinct values) and show their values.
columnas_pocos_valores = [c for c in columnas_categoricas if df_train_[c].nunique() <= 10]
for c in columnas_pocos_valores:
    print(f"{c}: {df_train_[c].unique()}")

In [None]:
# Numeric (int64 / float64) columns of the raw training data.
columnas_numericas = df_train_.select_dtypes(include=['int64', 'float64']).columns

# Show the distinct values of every numeric column that has at most 10 of them.
for c in columnas_numericas:
    if df_train_[c].nunique() > 10:
        continue
    print(f"{c}: {df_train_[c].unique()}")

In [None]:
# Class balance of the target: number of passengers for each Survived value.
sns.countplot(x='Survived', data=df_train_)

In [None]:
# Survived aggregated per Sex value (seaborn's barplot uses the mean by default,
# i.e. the survival rate of each group).
sns.barplot(x='Sex', y='Survived', data=df_train_)

In [None]:
# List the available columns before choosing the features.
df_train_.columns

# Selección de características

In [None]:
# Feature selection: discard the columns that will not be used as model inputs.

columnas_descartadas = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
df_train = df_train_.drop(columns=columnas_descartadas)

In [None]:
# Preview the training data after dropping the unused columns.
df_train.head()

# Codificación de variables categóricas e imputación de valores a nulos

In [None]:
# Encode the categorical variables as numeric codes with scikit-learn.

columnas_categoricas = ['Sex', 'Embarked']

# A single OrdinalEncoder learns the categories of both columns; it is kept in
# `ordinal_encoder` because the test-set cell uses it again.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(df_train[columnas_categoricas])

# Overwrite the text columns of df_train with their numeric codes.
df_train[columnas_categoricas] = ordinal_encoder.transform(df_train[columnas_categoricas])

In [None]:
# Preview the training data after encoding Sex and Embarked.
df_train.head()

In [None]:
# Fill the missing values with scikit-learn's SimpleImputer.
# Created without arguments, the imputer uses the 'mean' strategy for every column,
# so the ordinal-encoded categorical columns (e.g. Embarked) are averaged as well.
imputer = SimpleImputer()

# fit_transform returns a plain array, so the column names are restored from df_train.
valores_imputados = imputer.fit_transform(df_train)
df_train_final = pd.DataFrame(valores_imputados, columns=df_train.columns)

In [None]:
# Column dtypes and non-null counts of df_train (the frame *before* imputation).
# NOTE(review): the imputed data lives in df_train_final; if the goal is to confirm
# that no nulls remain, this should probably inspect df_train_final instead — confirm.
df_train.info()

In [None]:
# Preview the imputed training data.
df_train_final.head()

In [None]:
# Distinct Embarked values after imputation.
# Presumably shown to check whether the imputer introduced a value that is not one
# of the encoder's category codes — TODO confirm intent.
df_train_final['Embarked'].unique()

In [None]:
# What should really have been done: impute each column with a strategy that fits its type.

from sklearn.compose import ColumnTransformer

num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Route each column with missing values to its own imputer.
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric column: fill with the mean.
        ('num', num_imputer, ['Age']),
        # Categorical column: fill with the mode (most frequent value).
        ('cat', cat_imputer, ['Embarked']),
    ]
)

# The transformer outputs only the two imputed columns, in the order listed above.
df_cols_imputed = pd.DataFrame(
    preprocessor.fit_transform(df_train),
    columns=['Age', 'Embarked'],
)

# Put the imputed columns back next to the columns that were left untouched.
columnas_restantes = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch']
df_train_final = pd.concat([df_train[columnas_restantes], df_cols_imputed], axis=1)

# Modelo y entrenamiento

In [None]:
# The model: a random forest classifier.
# random_state is fixed so that every Restart & Run All trains the same forest and
# therefore produces the same predictions.

model = RandomForestClassifier(random_state=42)

In [None]:
# Split the training frame into the feature matrix and the target.

X_train = df_train_final.drop('Survived', axis=1)
# Select the target as a 1-D Series (single brackets). A one-column DataFrame makes
# scikit-learn emit a DataConversionWarning about a column-vector y when fitting.
y_train = df_train_final['Survived']


In [None]:
# Preview the training features.
X_train.head()

In [None]:
# Preview the training target.
y_train.head()

In [None]:
# Training: fit the model on the training features and target.

model.fit(X_train, y_train)

# Predicciones

In [None]:
# Display the raw test data.
df_test_

In [None]:
# Transform df_test with the same preprocessing that was fitted on df_train.
# The encoder and the imputers are only *applied* here (transform, not fit_transform):
# refitting them on the test set would give it its own Age mean / Embarked mode and
# possibly different category codes than the ones the model was trained on.

# Feature selection: drop the same columns that were dropped from the training data.
df_test = df_test_.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'], axis=1)

# Encoding: reuse the categories learned from the training data.
df_test[columnas_categoricas] = ordinal_encoder.transform(df_test[columnas_categoricas])

# Missing values: fill with the statistics learned from the training data.
# The index is carried over so the concat below aligns row by row.
df_cols_imputed = pd.DataFrame(
    preprocessor.transform(df_test),
    columns=['Age', 'Embarked'],
    index=df_test.index,
)

df_test_final = pd.concat([df_test[['Pclass', 'Sex', 'SibSp', 'Parch']], df_cols_imputed], axis=1)

# The test set has no target column, so the whole frame is the feature matrix.
X_test = df_test_final.copy(deep=True)


In [None]:
# Predict the target for the preprocessed test features with the fitted model.
predictions = model.predict(X_test)

In [None]:
# Display the predicted values.
predictions