In [None]:
#Importa bibliotecas
import numpy as np
import pandas as pd
import random as rnd
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Modelling Helpers
from sklearn.preprocessing import Normalizer , scale
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
%matplotlib inline
# Start from matplotlib's 'ggplot' style sheet, then apply seaborn's 'white' axes style.
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
# Default figure size as a (width, height) pair.
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

#MODELOS Y METRICAS
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
#Define funciones de estadistica descriptiva
def plot_histograms( df , variables , n_rows , n_cols ):
    """Draw one 10-bin histogram per variable on an n_rows x n_cols grid.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    variables : iterable of column names; one subplot is added per name, so
        it must not contain more than n_rows * n_cols entries.
    n_rows, n_cols : layout of the subplot grid.

    Each panel is titled with the skewness of its column, rounded to two
    decimals. Tick labels are hidden on both axes.
    """
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax = fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        # Keep two decimals: round() without ndigits returns an integer,
        # which collapses almost every skew value to 0 or 1.
        skew = round( float( df[ var_name ].skew() ) , 2 )
        ax.set_title( 'Skew: ' + str( skew ) )
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    """Plot shaded KDE curves of `var`, one curve per level of `target`.

    The optional keyword arguments `row` and `col` name the columns used to
    facet the grid; any other keyword argument is ignored. The x axis runs
    from 0 to the maximum of `var`.
    """
    grid = sns.FacetGrid(
        df ,
        hue = target ,
        aspect = 4 ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' ) ,
    )
    grid.map( sns.kdeplot , var , shade = True )
    upper_limit = df[ var ].max()
    grid.set( xlim = ( 0 , upper_limit ) )
    grid.add_legend()
    
def plot_categories( df , cat , target , **kwargs ):
    """Bar plot of `target` against the categories of `cat`.

    The optional keyword arguments `row` and `col` name the columns used to
    facet the grid; any other keyword argument is ignored.
    """
    grid = sns.FacetGrid(
        df ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' ) ,
    )
    grid.map( sns.barplot , cat , target )
    grid.add_legend()

def plot_correlation_map( df ):
    """Draw an annotated heatmap of the pairwise correlations of `df`.

    Only numeric and boolean columns take part in the correlation; other
    columns (strings, for instance) are left out.
    """
    # Correlate the frame passed in, not a module-level one, so the function
    # works for any DataFrame. Selecting numeric/bool columns first avoids the
    # error newer pandas raises when corr() meets non-numeric data.
    corr = df.select_dtypes( include = [ 'number' , 'bool' ] ).corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr,
        cmap = cmap,
        square=True,
        cbar_kws={ 'shrink' : .9 },
        ax=ax,
        annot = True,
        annot_kws = { 'fontsize' : 12 }
    )
    
def describe_more( df ):
    """Summarise every column of `df` by its number of distinct values and dtype.

    Returns a DataFrame with the columns 'Variable' (column name), 'Levels'
    (count of distinct non-null values) and 'Datatype', sorted by 'Levels' in
    ascending order. The row labels are the positions of the columns in `df`.
    """
    columns = list( df )
    levels = pd.DataFrame( {
        'Variable' : columns ,
        # nunique() counts distinct non-null values, which is what the length
        # of value_counts() gives, without the deprecated top-level function.
        'Levels'   : [ df[ name ].nunique() for name in columns ] ,
        'Datatype' : [ df[ name ].dtypes for name in columns ] ,
    } )
    return levels.sort_values( by = 'Levels' )

def plot_variable_importance( X , y ):
    """Fit a decision tree (random_state=99) on X, y and plot its feature importances."""
    fitted_tree = DecisionTreeClassifier( random_state = 99 ).fit( X , y )
    plot_model_var_imp( fitted_tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    """Plot the ten largest feature importances of a fitted model and print its score.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_` and `score`.
    X : DataFrame of features; its column names label the bars.
    y : target values passed to `model.score`.
    """
    imp = pd.DataFrame(
        model.feature_importances_  ,
        columns = [ 'Importance' ] ,
        index = X.columns
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    # Ascending order puts the largest importances last, so the top ten are
    # the tail of the frame (the head would be the ten least important).
    imp.tail( 10 ).plot( kind = 'barh' )
    print (model.score( X , y ))


In [None]:
# Load the Kaggle Titanic training and test tables.
train_df = pd.read_csv('../input/titanic-machine-learning-from-disaster/train.csv')
test_df = pd.read_csv('../input/titanic-machine-learning-from-disaster/test.csv')

combine = [train_df, test_df]

# Stack train on top of test so features are engineered on both at once.
# ignore_index=True gives every row its own label; without it the test rows
# repeat the labels 0..n of the training rows.
dataSet_df = pd.concat(combine, sort = False, ignore_index = True)

In [None]:
# Print the column names of the combined train + test table.
print(dataSet_df.columns.values)

In [None]:
# Table with the first records.
dataSet_df.head()

In [None]:
# Dataset summary: number and type of the variables.
dataSet_df.info()

In [None]:
# Descriptive statistics of the numeric variables.
dataSet_df.describe()

In [None]:
# Descriptive statistics of the categorical (object-dtype) variables.
dataSet_df.describe(include=['O'])

In [None]:
# Correlation heatmap between the variables of the training table.
plot_correlation_map( train_df )

In [None]:
# Distribution of the fare paid by survival outcome, one row per sex.
plot_distribution( dataSet_df , var = 'Fare' , target = 'Survived' , row = 'Sex' )

In [None]:
# Distribution of age by survival outcome, one row per sex.
plot_distribution( dataSet_df , var = 'Age' , target = 'Survived' , row = 'Sex' )

In [None]:
# Port of embarkation vs survival.
plot_categories( dataSet_df , cat = 'Embarked' , target = 'Survived' )

In [None]:
# Sex vs survival.
plot_categories( dataSet_df , cat = 'Sex' , target = 'Survived' )

In [None]:
# Passenger class vs survival.
plot_categories( dataSet_df , cat = 'Pclass' , target = 'Survived' )

In [None]:
# Number of siblings/spouses (SibSp) vs survival.
plot_categories( dataSet_df , cat = 'SibSp' , target = 'Survived' )

In [None]:
# Parch vs survival. Per the Kaggle data dictionary Parch counts
# parents/children aboard (siblings are counted in SibSp).
plot_categories( dataSet_df , cat = 'Parch' , target = 'Survived' )

In [None]:
# Build indicator (dummy) variables from the categorical columns.
pclass = pd.get_dummies( dataSet_df[ 'Pclass' ] , prefix = 'Pclass' )

# Sex as a single integer column: male -> 0, female -> 1.
sex_codes = dataSet_df[ 'Sex' ].map( { 'male' : 0 , 'female' : 1 } ).astype( int )
sex = sex_codes.to_frame( name = 'Sex' )

# Bucket the relative counts: 0-2 -> 'small', 3-5 -> 'mid', 6 and up -> 'big'.
# The Parch table covers 0..9 and the SibSp table 0..8; a count outside its
# table maps to NaN and, after astype(str), to the label 'nan'.
parch_sizes = {
    count : ( 'small' if count < 3 else 'mid' if count < 6 else 'big' )
    for count in range( 10 )
}
sibsp_sizes = { count : label for count , label in parch_sizes.items() if count < 9 }

SibSp_size = dataSet_df[ 'SibSp' ].map( sibsp_sizes ).astype( str )
Parch_size = dataSet_df[ 'Parch' ].map( parch_sizes ).astype( str )
sibsp = pd.get_dummies( SibSp_size , prefix = 'SibSp' )
parch = pd.get_dummies( Parch_size , prefix = 'Parch' )

embarked = pd.get_dummies( dataSet_df[ 'Embarked' ] , prefix = 'Embarked' )

In [None]:
# Create the Title feature by parsing the honorific out of each name: the text
# between the first and second comma, cut at its first period and stripped.
raw_titles = dataSet_df[ 'Name' ].map(
    lambda full_name : full_name.split( ',' , 2 )[ 1 ].split( '.' , 1 )[ 0 ].strip()
)

# Raw honorifics grouped under the six categories used as features.
_title_groups = {
    "Officer": [ "Capt" , "Col" , "Major" , "Dr" , "Rev" ],
    "Royalty": [ "Jonkheer" , "Don" , "Sir" , "the Countess" , "Dona" , "Lady" ],
    "Mrs":     [ "Mme" , "Ms" , "Mrs" ],
    "Miss":    [ "Mlle" , "Miss" ],
    "Mr":      [ "Mr" ],
    "Master":  [ "Master" ],
}

# Inverted lookup: raw honorific -> category.
Title_Dictionary = {
    raw : category
    for category , raws in _title_groups.items()
    for raw in raws
}

# Honorifics missing from the dictionary map to NaN and get no indicator set.
title = pd.get_dummies( raw_titles.map( Title_Dictionary ) )


In [None]:
# Clean the records of the Ticket variable.
def cleanTicket( ticket ):
    """Return the first non-numeric token of a ticket string, or 'XXX' if none.

    Periods and slashes are removed before the string is split on whitespace,
    so 'A/5 21171' yields 'A5' and a purely numeric ticket yields 'XXX'.
    """
    stripped = ticket.replace( '.' , '' ).replace( '/' , '' )
    for token in stripped.split():
        if not token.isdigit():
            return token
    return 'XXX'

# One indicator column per cleaned ticket prefix ('XXX' for numeric-only tickets).
ticket_prefix = dataSet_df[ 'Ticket' ].map( cleanTicket )
ticket = pd.get_dummies( ticket_prefix , prefix = 'Ticket' )

In [None]:
# Indicator columns for the cabin type, taken as the first character of the
# cabin code; passengers without a cabin are coded 'U'.
cabin_letter = dataSet_df[ 'Cabin' ].fillna( 'U' ).map( lambda code : code[ 0 ] )
cabin = pd.get_dummies( cabin_letter , prefix = 'Cabin' )

In [None]:
# Family-size feature: relatives aboard (SibSp + Parch) bucketed as
# 0 -> 'single', 1-3 -> 'small', 4-7 -> 'mid', 8-10 -> 'big'.
relatives_aboard = dataSet_df[ 'SibSp' ] + dataSet_df[ 'Parch' ]

family_buckets = { 0 : 'single' }
family_buckets.update( dict.fromkeys( range( 1 , 4 ) , 'small' ) )
family_buckets.update( dict.fromkeys( range( 4 , 8 ) , 'mid' ) )
family_buckets.update( dict.fromkeys( range( 8 , 11 ) , 'big' ) )

# Counts outside 0..10 map to NaN and become the label 'nan'.
family_size = relatives_aboard.map( family_buckets ).astype( str )

family = pd.get_dummies( family_size , prefix = 'Family' )

In [None]:
# Age and Fare as single-column frames with the gaps filled in.
# `where` keeps the values that satisfy `>= 0`; NaN fails that test and is
# replaced by mean + u * std, with u drawn once from [-0.6, 0.6), so every
# filled row of a column receives the same value. `rnd` is not seeded here,
# so that value changes from run to run.
# NOTE(review): `age` and `fare` are rebound in the next two cells to frames
# built from the raw dataSet_df columns, so these imputed values look unused
# downstream - confirm whether the brackets were meant to use them.
age  = pd.DataFrame(dataSet_df.Age.where(dataSet_df.Age >= 0, dataSet_df.Age.mean() + (rnd.random()*1.2 - 0.6)*dataSet_df.Age.std()),columns =['Age'])
fare = pd.DataFrame(dataSet_df.Fare.where(dataSet_df.Fare >= 0, dataSet_df.Fare.mean() + (rnd.random()*1.2 - 0.6)*dataSet_df.Fare.std()),columns=['Fare'])

In [None]:
# Age-bracket indicators (0/1): child < 15 <= young < 25 <= grown < 35 <= mature.
# A missing age fails every comparison, so such rows get 0 in all four columns.
raw_age = dataSet_df[ 'Age' ]

age = pd.DataFrame( {
    'Age_child'  : ( raw_age < 15 ).astype( 'int64' ) ,
    'Age_young'  : ( ( raw_age >= 15 ) & ( raw_age < 25 ) ).astype( 'int64' ) ,
    'Age_grown'  : ( ( raw_age >= 25 ) & ( raw_age < 35 ) ).astype( 'int64' ) ,
    'Age_mature' : ( raw_age >= 35 ).astype( 'int64' ) ,
} )

In [None]:
# Fare-bracket indicators (0/1): low < 8 <= moderate < 14.5 <= high < 31.3 <= very high.
# A missing fare fails every comparison, so such rows get 0 in all four columns.
raw_fare = dataSet_df[ 'Fare' ]

fare = pd.DataFrame( {
    'Fare_low'      : ( raw_fare < 8 ).astype( 'int64' ) ,
    'Fare_moderate' : ( ( raw_fare >= 8 ) & ( raw_fare < 14.5 ) ).astype( 'int64' ) ,
    'Fare_high'     : ( ( raw_fare >= 14.5 ) & ( raw_fare < 31.3 ) ).astype( 'int64' ) ,
    'Fare_veryhigh' : ( raw_fare >= 31.3 ).astype( 'int64' ) ,
} )

In [None]:
# Build one table with all the variables, including the ones just created,
# by placing the feature frames side by side (column-wise concat).
full_X = pd.concat( [ title, sex, age, pclass, fare, cabin, embarked, ticket, sibsp, parch, family] , axis=1 )

In [None]:
# Standardise every feature (subtract the column mean, divide by the column
# standard deviation) so that the models see variables on a common scale.
normalized = pd.DataFrame( {
    col : ( full_X[ col ] - full_X[ col ].mean() ) / full_X[ col ].std()
    for col in full_X.columns
} )

In [None]:
# Split the engineered (and the standardised) features back into the labelled
# training rows and the unlabelled test rows. The combined table was stacked
# train-first, so the first len(train_df) rows are the training passengers;
# deriving the boundary from train_df avoids a hard-coded row count.
n_train = len( train_df )

trainSet_X = full_X.iloc[ :n_train ]
trainSet_y = pd.DataFrame(train_df.Survived,columns=['Survived'])
testSet_X  = full_X.iloc[ n_train: ]

normtrainSet_X = normalized.iloc[ :n_train ]
normtestSet_X  = normalized.iloc[ n_train: ]

In [None]:
# Training features and the Survived label side by side, for the charts below.
train_set = pd.concat([trainSet_X,trainSet_y], axis=1, sort=False)

In [None]:
# Bar charts of age stage by sex, for survivors and for non-survivors.
ageStage = ['child','young','grown','mature']

# value_counts() of Survived for every (age stage, sex) group.
# Sex is coded 0 = male, 1 = female.
age_counts = {
    (stage, sex_code): train_set.Survived[(train_set['Age_' + stage] == 1) & (train_set.Sex == sex_code)].value_counts()
    for stage in ageStage
    for sex_code in (0, 1)
}

# Read the counts by label (1 = survived, 0 = died) instead of by position:
# value_counts() orders its rows by frequency, so the position of each outcome
# differs from group to group, and a group missing one outcome has one row.
male_av    = [age_counts[stage, 0].get(1, 0) for stage in ageStage]
female_av  = [age_counts[stage, 1].get(1, 0) for stage in ageStage]
male_am    = [age_counts[stage, 0].get(0, 0) for stage in ageStage]
female_am  = [age_counts[stage, 1].get(0, 0) for stage in ageStage]

age_vive_df = pd.DataFrame({'male': male_av,'female ': female_av }, index=ageStage)
age_vive_df.plot.bar(rot=0, stacked=False, subplots=False,layout=[1,2], title='Sobrevivientes',colormap='Paired')

age_muere_df = pd.DataFrame({'male': male_am,'female ': female_am }, index=ageStage)
age_muere_df.plot.bar(rot=0, stacked=False, subplots=False,layout=[1,2], title='No Sobrevivientes',colormap='Paired')

In [None]:
# Bar charts of passenger class by sex, for survivors and for non-survivors.
P_class = ['first','second','third']
class_columns = dict(zip(P_class, ['Pclass_1', 'Pclass_2', 'Pclass_3']))

# value_counts() of Survived for every (class, sex) group.
# Sex is coded 0 = male, 1 = female.
class_counts = {
    (label, sex_code): train_set.Survived[(train_set[class_columns[label]] == 1) & (train_set.Sex == sex_code)].value_counts()
    for label in P_class
    for sex_code in (0, 1)
}

# Read the counts by label (1 = survived, 0 = died) instead of by position:
# value_counts() orders its rows by frequency, so the position of each outcome
# differs from group to group and is ambiguous when the two counts tie.
male_cv    = [class_counts[label, 0].get(1, 0) for label in P_class]
female_cv  = [class_counts[label, 1].get(1, 0) for label in P_class]
male_cm    = [class_counts[label, 0].get(0, 0) for label in P_class]
female_cm  = [class_counts[label, 1].get(0, 0) for label in P_class]

class_vive_df = pd.DataFrame({'male': male_cv,'female ': female_cv }, index=P_class)
class_vive_df.plot.bar(rot=0, stacked=False, title='Sobrevivientes',colormap='Paired')

class_muere_df = pd.DataFrame({'male': male_cm,'female ': female_cm }, index=P_class)
class_muere_df.plot.bar(rot=0, stacked=False, title='No Sobrevivientes',colormap='Paired')

In [None]:
# Bar charts of family size by sex, for survivors and for non-survivors.
fam = ['single','small','mid','big']

# value_counts() of Survived for every (family bucket, sex) group.
# Sex is coded 0 = male, 1 = female.
fam_counts = {
    (bucket, sex_code): train_set.Survived[(train_set['Family_' + bucket] == 1) & (train_set.Sex == sex_code)].value_counts()
    for bucket in fam
    for sex_code in (0, 1)
}

# Read the counts by label (1 = survived, 0 = died) instead of by position:
# value_counts() orders its rows by frequency, and a group without survivors
# has a single row, so a missing outcome falls back to 0 here.
male_fv    = [fam_counts[bucket, 0].get(1, 0) for bucket in fam]
female_fv  = [fam_counts[bucket, 1].get(1, 0) for bucket in fam]
male_fm    = [fam_counts[bucket, 0].get(0, 0) for bucket in fam]
female_fm  = [fam_counts[bucket, 1].get(0, 0) for bucket in fam]

fam_vive_df = pd.DataFrame({'male': male_fv,'female ': female_fv }, index=fam)
fam_vive_df.plot.bar(rot=0, stacked=False, title='Sobrevivientes',colormap='Paired')

fam_muere_df = pd.DataFrame({'male': male_fm,'female ': female_fm }, index=fam)
fam_muere_df.plot.bar(rot=0, stacked=False, title='No Sobrevivientes',colormap='Paired')

In [None]:
# Split the labelled data into a training set (80%) and a validation set (20%).
# A fixed random_state makes the partition, and every score computed from it,
# reproducible across runs.
Strain_X , Svalid_X , Strain_y , Svalid_y = train_test_split( trainSet_X , trainSet_y , train_size = .8, test_size = .2 , random_state = 0 )

In [None]:
# Flatten the training labels from a one-column table to a 1-D array.
Strain_y = np.ravel(Strain_y)

In [None]:
# Empty lists that collect the models and their evaluation results.
models     = []
confusion  = []
train_eval = []
valid_eval = []
variance   = []
error      = []

In [None]:
# Add the candidate classifiers to the list of models, in this order:
# logistic regression, SVM, decision tree, gradient boosting, random forest.
# Emptying the list first keeps the cell idempotent: running it again does not
# register every model a second time.
models.clear()

model = LogisticRegression()
models.append(model)

model = SVC()
models.append(model)

# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto'
# spelling is rejected by newer scikit-learn releases.
model= DecisionTreeClassifier(criterion='gini',
                             min_samples_split=10,min_samples_leaf=1,
                             max_features='sqrt')
models.append(model)

model= GradientBoostingClassifier()

models.append(model)

model = RandomForestClassifier(criterion='gini', n_estimators=700,
                             min_samples_split=10,min_samples_leaf=1,
                             max_features='sqrt',oob_score=True,
                             random_state=1,n_jobs=-1)

models.append(model)

In [None]:
# Train every registered model on the training split.
for estimator in models:
    estimator.fit(Strain_X, Strain_y)

In [None]:
# Predict with every fitted model and record its scores.
# Start from empty result lists so that running this cell again does not
# append a second set of rows (which would no longer match the five model
# names used to index the summary table).
for results in (confusion, train_eval, valid_eval, variance, error):
    results.clear()

for fitted in models:
    predicted_train = pd.DataFrame(fitted.predict(Strain_X),columns=['Prediction'],index=Strain_X.index)
    predicted_valid = pd.DataFrame(fitted.predict(Svalid_X),columns=['Prediction'],index=Svalid_X.index)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    train_score = accuracy_score(Strain_y, predicted_train)
    valid_score = accuracy_score(Svalid_y, predicted_valid)

    # Accuracies as percentages with two decimals.
    accuracy       = round(train_score*100,2)
    valid_accuracy = round(valid_score*100,2)
    # For 0/1 labels the mean absolute error is the misclassification rate.
    mean_error     = round(mean_absolute_error(Svalid_y, predicted_valid), 2)

    mat = confusion_matrix(Svalid_y, predicted_valid)
    confusion.append(mat)

    train_eval.append(accuracy)
    valid_eval.append(valid_accuracy)
    # Stored under the name "variance": the gap between training and
    # validation accuracy, in percentage points.
    variance.append(accuracy - valid_accuracy)
    error.append(mean_error)

In [None]:
# Evaluation tables and performance metrics, one row per model.
model_names = ['Logistic', 'SVM', 'Tree', 'Gradient', 'Forest']

evaluacion_train  = pd.DataFrame({'Train Score': train_eval}, index=model_names)
evaluacion_valid  = pd.DataFrame({'Validation Score': valid_eval}, index=model_names)
variacion         = pd.DataFrame({'Variance': variance}, index=model_names)
err               = pd.DataFrame({'Error': error}, index=model_names)

# All four metrics side by side.
performance       = pd.concat([evaluacion_train, evaluacion_valid, variacion, err], axis=1)

In [None]:
# Show the per-model performance table.
performance

In [None]:
# Confusion matrix of the validation predictions.
# confusion[1] is the entry of the second model registered (the SVC); the
# matrix is transposed so that rows are predicted labels and columns are
# observed labels, matching the axis titles set below.
sns.heatmap(confusion[1].T, square=True, annot=True, fmt='3.0f', cbar=False,
                        xticklabels=['no survive', 'survive'],
                        yticklabels=['no survive', 'survive'],cmap="cool")
plt.xlabel('observed label')
plt.ylabel('predicted label')
plt.title('Confusion matrix', y=1.05, size=15)

Modelo de regresión logística

In [None]:

# Repeated hold-out evaluation of logistic regression: 1000 random 80/20
# splits of the labelled data, recording the scores of every split.
model               = LogisticRegression()
confusion_logistic  = list()
train_eval_logistic = list()
valid_eval_logistic = list()
variance_logistic   = list()
error_logistic      = list()

for i in range(1000):
    # Seed the split with the iteration number: each iteration gets its own
    # partition, and running the cell again reproduces the same partitions.
    Strain_X , Svalid_X , Strain_y , Svalid_y = train_test_split( trainSet_X , trainSet_y , train_size = .8, test_size = .2 , random_state = i )
    Strain_y = np.ravel(Strain_y)
    model.fit(Strain_X, Strain_y)

    predicted_train = pd.DataFrame(model.predict(Strain_X),columns=['Prediction'],index=Strain_X.index)
    predicted_valid = pd.DataFrame(model.predict(Svalid_X),columns=['Prediction'],index=Svalid_X.index)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    train_score = accuracy_score(Strain_y, predicted_train)
    valid_score = accuracy_score(Svalid_y, predicted_valid)

    # Accuracies as percentages with two decimals.
    accuracy       = round(train_score*100,2)
    valid_accuracy = round(valid_score*100,2)
    # For 0/1 labels the mean absolute error is the misclassification rate.
    mean_error     = round(mean_absolute_error(Svalid_y, predicted_valid), 2)

    mat = confusion_matrix(Svalid_y, predicted_valid)
    confusion_logistic.append(mat)

    train_eval_logistic.append(accuracy)
    valid_eval_logistic.append(valid_accuracy)
    # Gap between training and validation accuracy, in percentage points.
    variance_logistic.append(accuracy - valid_accuracy)
    error_logistic.append(mean_error)

In [None]:
# Per-split scores of logistic regression: one table with all four metrics,
# plus a one-column frame per metric for the cross-model comparison.
performance_logistic = pd.DataFrame({
    'Train Score': train_eval_logistic,
    'Validation Score': valid_eval_logistic,
    'Variance': variance_logistic,
    'Error': error_logistic,
})
evaluacion_train_logistic  = performance_logistic[['Train Score']]
evaluacion_valid_logistic  = performance_logistic[['Validation Score']]
variacion_logistic         = performance_logistic[['Variance']]
err_logistic               = performance_logistic[['Error']]

In [None]:
# Summary statistics of the logistic-regression scores.
performance_logistic.describe()

Modelo de support vector machine

In [None]:
# Repeated hold-out evaluation of the support vector classifier: 1000 random
# 80/20 splits of the labelled data, recording the scores of every split.
model = SVC()
confusion_SVC  = list()
train_eval_SVC = list()
valid_eval_SVC = list()
variance_SVC   = list()
error_SVC      = list()

for i in range(1000):
    # Seed the split with the iteration number: each iteration gets its own
    # partition, and running the cell again reproduces the same partitions.
    Strain_X , Svalid_X , Strain_y , Svalid_y = train_test_split( trainSet_X , trainSet_y , train_size = .8, test_size = .2 , random_state = i )
    Strain_y = np.ravel(Strain_y)
    model.fit(Strain_X, Strain_y)

    predicted_train = pd.DataFrame(model.predict(Strain_X),columns=['Prediction'],index=Strain_X.index)
    predicted_valid = pd.DataFrame(model.predict(Svalid_X),columns=['Prediction'],index=Svalid_X.index)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    train_score = accuracy_score(Strain_y, predicted_train)
    valid_score = accuracy_score(Svalid_y, predicted_valid)

    # Accuracies as percentages with two decimals.
    accuracy       = round(train_score*100,2)
    valid_accuracy = round(valid_score*100,2)
    # For 0/1 labels the mean absolute error is the misclassification rate.
    mean_error     = round(mean_absolute_error(Svalid_y, predicted_valid), 2)

    mat = confusion_matrix(Svalid_y, predicted_valid)
    confusion_SVC.append(mat)

    train_eval_SVC.append(accuracy)
    valid_eval_SVC.append(valid_accuracy)
    # Gap between training and validation accuracy, in percentage points.
    variance_SVC.append(accuracy - valid_accuracy)
    error_SVC.append(mean_error)

In [None]:
# Per-split scores of the SVC: one table with all four metrics, plus a
# one-column frame per metric for the cross-model comparison.
performance_SVC = pd.DataFrame({
    'Train Score': train_eval_SVC,
    'Validation Score': valid_eval_SVC,
    'Variance': variance_SVC,
    'Error': error_SVC,
})
evaluacion_train_SVC  = performance_SVC[['Train Score']]
evaluacion_valid_SVC  = performance_SVC[['Validation Score']]
variacion_SVC         = performance_SVC[['Variance']]
err_SVC               = performance_SVC[['Error']]

In [None]:
# Summary statistics of the SVC scores.
performance_SVC.describe()

Modelo de árbol de decisión

In [None]:
# Repeated hold-out evaluation of the decision tree: 1000 random 80/20 splits
# of the labelled data, recording the scores of every split.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto'
# spelling is rejected by newer scikit-learn releases.
model = DecisionTreeClassifier(criterion='gini',
                             min_samples_split=10,min_samples_leaf=1,
                             max_features='sqrt')
confusion_DT  = list()
train_eval_DT = list()
valid_eval_DT = list()
variance_DT   = list()
error_DT     = list()

for i in range(1000):
    # Seed the split with the iteration number: each iteration gets its own
    # partition, and running the cell again reproduces the same partitions.
    Strain_X , Svalid_X , Strain_y , Svalid_y = train_test_split( trainSet_X , trainSet_y , train_size = .8, test_size = .2 , random_state = i )
    Strain_y = np.ravel(Strain_y)
    model.fit(Strain_X, Strain_y)

    predicted_train = pd.DataFrame(model.predict(Strain_X),columns=['Prediction'],index=Strain_X.index)
    predicted_valid = pd.DataFrame(model.predict(Svalid_X),columns=['Prediction'],index=Svalid_X.index)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    train_score = accuracy_score(Strain_y, predicted_train)
    valid_score = accuracy_score(Svalid_y, predicted_valid)

    # Accuracies as percentages with two decimals.
    accuracy       = round(train_score*100,2)
    valid_accuracy = round(valid_score*100,2)
    # For 0/1 labels the mean absolute error is the misclassification rate.
    mean_error     = round(mean_absolute_error(Svalid_y, predicted_valid), 2)

    mat = confusion_matrix(Svalid_y, predicted_valid)
    confusion_DT.append(mat)

    train_eval_DT.append(accuracy)
    valid_eval_DT.append(valid_accuracy)
    # Gap between training and validation accuracy, in percentage points.
    variance_DT.append(accuracy - valid_accuracy)
    error_DT.append(mean_error)

In [None]:
# Per-split scores of the decision tree: one table with all four metrics,
# plus a one-column frame per metric for the cross-model comparison.
performance_DT = pd.DataFrame({
    'Train Score': train_eval_DT,
    'Validation Score': valid_eval_DT,
    'Variance': variance_DT,
    'Error': error_DT,
})
evaluacion_train_DT  = performance_DT[['Train Score']]
evaluacion_valid_DT  = performance_DT[['Validation Score']]
variacion_DT         = performance_DT[['Variance']]
err_DT               = performance_DT[['Error']]

In [None]:
# Summary statistics of the decision-tree scores (performance_DT, not the
# five-model `performance` table built earlier).
performance_DT.describe()

Modelo gradient boost classifier

In [None]:
# Repeated hold-out evaluation of gradient boosting: 1000 random 80/20 splits
# of the labelled data, recording the scores of every split.
model = GradientBoostingClassifier()
confusion_Gd  = list()
train_eval_Gd = list()
valid_eval_Gd = list()
variance_Gd   = list()
error_Gd      = list()

for i in range(1000):
    # Seed the split with the iteration number: each iteration gets its own
    # partition, and running the cell again reproduces the same partitions.
    Strain_X , Svalid_X , Strain_y , Svalid_y = train_test_split( trainSet_X , trainSet_y , train_size = .8, test_size = .2 , random_state = i )
    Strain_y = np.ravel(Strain_y)
    model.fit(Strain_X, Strain_y)

    predicted_train = pd.DataFrame(model.predict(Strain_X),columns=['Prediction'],index=Strain_X.index)
    predicted_valid = pd.DataFrame(model.predict(Svalid_X),columns=['Prediction'],index=Svalid_X.index)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    train_score = accuracy_score(Strain_y, predicted_train)
    valid_score = accuracy_score(Svalid_y, predicted_valid)

    # Accuracies as percentages with two decimals.
    accuracy       = round(train_score*100,2)
    valid_accuracy = round(valid_score*100,2)
    # For 0/1 labels the mean absolute error is the misclassification rate.
    mean_error     = round(mean_absolute_error(Svalid_y, predicted_valid), 2)

    mat = confusion_matrix(Svalid_y, predicted_valid)
    confusion_Gd.append(mat)

    train_eval_Gd.append(accuracy)
    valid_eval_Gd.append(valid_accuracy)
    # Gap between training and validation accuracy, in percentage points.
    variance_Gd.append(accuracy - valid_accuracy)
    error_Gd.append(mean_error)

In [None]:
# Per-split scores of gradient boosting: one table with all four metrics,
# plus a one-column frame per metric for the cross-model comparison.
performance_Gd = pd.DataFrame({
    'Train Score': train_eval_Gd,
    'Validation Score': valid_eval_Gd,
    'Variance': variance_Gd,
    'Error': error_Gd,
})
evaluacion_train_Gd  = performance_Gd[['Train Score']]
evaluacion_valid_Gd  = performance_Gd[['Validation Score']]
variacion_Gd         = performance_Gd[['Variance']]
err_Gd               = performance_Gd[['Error']]

In [None]:
# Summary statistics of the gradient-boosting scores.
performance_Gd.describe()

Modelo de bosque aleatorio

In [None]:
# Repeated hold-out evaluation of a 50-tree random forest: 1000 random 80/20
# splits of the labelled data, recording the scores of every split.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto'
# spelling is rejected by newer scikit-learn releases.
model = RandomForestClassifier(criterion='gini', n_estimators=50,
                             min_samples_split=10,min_samples_leaf=1,
                             max_features='sqrt',oob_score=True,
                             random_state=1,n_jobs=-1)
confusion_Rf  = list()
train_eval_Rf = list()
valid_eval_Rf = list()
variance_Rf   = list()
error_Rf      = list()

for i in range(1000):
    # Seed the split with the iteration number: each iteration gets its own
    # partition, and running the cell again reproduces the same partitions.
    Strain_X , Svalid_X , Strain_y , Svalid_y = train_test_split( trainSet_X , trainSet_y , train_size = .8, test_size = .2 , random_state = i )
    Strain_y = np.ravel(Strain_y)
    model.fit(Strain_X, Strain_y)

    predicted_train = pd.DataFrame(model.predict(Strain_X),columns=['Prediction'],index=Strain_X.index)
    predicted_valid = pd.DataFrame(model.predict(Svalid_X),columns=['Prediction'],index=Svalid_X.index)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    train_score = accuracy_score(Strain_y, predicted_train)
    valid_score = accuracy_score(Svalid_y, predicted_valid)

    # Accuracies as percentages with two decimals.
    accuracy       = round(train_score*100,2)
    valid_accuracy = round(valid_score*100,2)
    # For 0/1 labels the mean absolute error is the misclassification rate.
    mean_error     = round(mean_absolute_error(Svalid_y, predicted_valid), 2)

    mat = confusion_matrix(Svalid_y, predicted_valid)
    confusion_Rf.append(mat)

    train_eval_Rf.append(accuracy)
    valid_eval_Rf.append(valid_accuracy)
    # Gap between training and validation accuracy, in percentage points.
    variance_Rf.append(accuracy - valid_accuracy)
    error_Rf.append(mean_error)

In [None]:
# Per-split scores of the random forest: one table with all four metrics,
# plus a one-column frame per metric for the cross-model comparison.
performance_Rf = pd.DataFrame({
    'Train Score': train_eval_Rf,
    'Validation Score': valid_eval_Rf,
    'Variance': variance_Rf,
    'Error': error_Rf,
})
evaluacion_train_Rf  = performance_Rf[['Train Score']]
evaluacion_valid_Rf  = performance_Rf[['Validation Score']]
variacion_Rf         = performance_Rf[['Variance']]
err_Rf               = performance_Rf[['Error']]

In [None]:
# Summary statistics of the random-forest scores.
performance_Rf.describe()

In [None]:
# Repeated hold-out evaluation of a 700-tree random forest: 1000 random 80/20
# splits of the labelled data, recording the scores of every split.
# NOTE(review): this cell rebinds the *_Rf lists, but the evaluacion_*_Rf,
# variacion_Rf, err_Rf and performance_Rf frames are not rebuilt after it, so
# the comparison tables further down appear to still use the scores of the
# previous forest run - confirm which run the summary should report.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto'
# spelling is rejected by newer scikit-learn releases.
model = RandomForestClassifier(criterion='gini', n_estimators=700,
                             min_samples_split=10,min_samples_leaf=1,
                             max_features='sqrt',oob_score=True,
                             random_state=1,n_jobs=-1)
confusion_Rf  = list()
train_eval_Rf = list()
valid_eval_Rf = list()
variance_Rf   = list()
error_Rf      = list()

for i in range(1000):
    # Seed the split with the iteration number: each iteration gets its own
    # partition, and running the cell again reproduces the same partitions.
    Strain_X , Svalid_X , Strain_y , Svalid_y = train_test_split( trainSet_X , trainSet_y , train_size = .8, test_size = .2 , random_state = i )
    Strain_y = np.ravel(Strain_y)
    model.fit(Strain_X, Strain_y)

    predicted_train = pd.DataFrame(model.predict(Strain_X),columns=['Prediction'],index=Strain_X.index)
    predicted_valid = pd.DataFrame(model.predict(Svalid_X),columns=['Prediction'],index=Svalid_X.index)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    train_score = accuracy_score(Strain_y, predicted_train)
    valid_score = accuracy_score(Svalid_y, predicted_valid)

    # Accuracies as percentages with two decimals.
    accuracy       = round(train_score*100,2)
    valid_accuracy = round(valid_score*100,2)
    # For 0/1 labels the mean absolute error is the misclassification rate.
    mean_error     = round(mean_absolute_error(Svalid_y, predicted_valid), 2)

    mat = confusion_matrix(Svalid_y, predicted_valid)
    confusion_Rf.append(mat)

    train_eval_Rf.append(accuracy)
    valid_eval_Rf.append(valid_accuracy)
    # Gap between training and validation accuracy, in percentage points.
    variance_Rf.append(accuracy - valid_accuracy)
    error_Rf.append(mean_error)

Análisis de rendimiento de los modelos

In [None]:
# Put the per-split scores of the five models side by side, one matrix per
# metric; the column order is logistic, SVC, tree, gradient boosting, forest.
evaluacion1 = np.concatenate(
    [frame.values for frame in (evaluacion_train_logistic, evaluacion_train_SVC, evaluacion_train_DT, evaluacion_train_Gd, evaluacion_train_Rf)],
    axis=1,
)
evaluacion2 = np.concatenate(
    [frame.values for frame in (evaluacion_valid_logistic, evaluacion_valid_SVC, evaluacion_valid_DT, evaluacion_valid_Gd, evaluacion_valid_Rf)],
    axis=1,
)
evaluacion3 = np.concatenate(
    [frame.values for frame in (variacion_logistic, variacion_SVC, variacion_DT, variacion_Gd, variacion_Rf)],
    axis=1,
)
evaluacion4 = np.concatenate(
    [frame.values for frame in (err_logistic, err_SVC, err_DT, err_Gd, err_Rf)],
    axis=1,
)

In [None]:
# Wrap each metric matrix in a frame with one named column per model.
model_names = ['Logistic','SVM','Tree','Gradient','Forest']

Train_eval , Valid_eval , Var_eval , err_eval = (
    pd.DataFrame(scores, columns=model_names)
    for scores in (evaluacion1, evaluacion2, evaluacion3, evaluacion4)
)

In [None]:
# Training-set scores of the first 100 splits, one line per model.
Train_eval.head(100).plot.line()

In [None]:
# Validation-set scores of the last 10 splits, one line per model.
Valid_eval.tail(10).plot.line()

In [None]:
# Train-minus-validation gap of the first 20 splits, one line per model.
Var_eval.head(20).plot.line()

In [None]:
# Validation error of the first 20 splits, one line per model.
err_eval.head(20).plot.line()

In [None]:
# Mean of every metric over all splits, one row per model.
# The column means are taken in one call per frame and read as a whole array,
# instead of indexing a label-indexed Series by integer position.
model_names = ['Logistic','SVM','Tree','Gradient','Forest']

train_array = pd.DataFrame(Train_eval.mean().values, columns=['Train_score'], index=model_names)
valid_array = pd.DataFrame(Valid_eval.mean().values, columns=['Valid_score'], index=model_names)
var_array   = pd.DataFrame(Var_eval.mean().values, columns=['Generalization'], index=model_names)
err_array   = pd.DataFrame(err_eval.mean().values, columns=['Error'], index=model_names)

In [None]:
# Combine the four mean tables into one summary, one row per model.
promedios = pd.concat([train_array,valid_array,var_array,err_array],axis=1)

Resumen de rendimiento de cada modelo

In [None]:
# Show the summary of mean scores per model.
promedios