In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.svm import SVC

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV



In [3]:
# Read the training and test tables from the working directory.
data, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [4]:
# Separate the target column from the feature columns.
y = data['Survived']
X = data.drop(columns='Survived')

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [6]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [
    col
    for col, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object" and X_train_full[col].nunique() < 10
]

In [7]:
# Numerical features: every column stored as int64 or float64.
numerical_cols = [
    col
    for col, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

In [8]:
# Restrict both splits to the selected feature columns (copies, so the
# full frames are left untouched).
my_cols = [*categorical_cols, *numerical_cols]
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()

In [9]:
# Numerical features: impute missing values with a constant.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: impute with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [27]:
def ajuste(n, X_train = X_train, X_valid = X_valid, y_train = y_train, y_valid = y_valid):
    """Fit a gradient-boosting pipeline and return its validation error.

    Parameters
    ----------
    n : int
        Early-stopping patience, forwarded to ``GradientBoostingClassifier``
        as ``n_iter_no_change``.
    X_train, X_valid, y_train, y_valid
        Training / validation features and targets. The defaults are the
        module-level splits, bound when this function is defined.

    Returns
    -------
    float
        ``mean_absolute_error`` between ``y_valid`` and the pipeline's
        predictions on ``X_valid``.
    """
    # Forward `n` to the estimator: it was previously ignored in favour of a
    # hard-coded 2, so every call evaluated exactly the same model.
    model = GradientBoostingClassifier(
        n_iter_no_change=n,
        n_estimators=500,
        learning_rate=1.0,
        max_depth=1,
        random_state=0,
    )
    my_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])
    my_pipeline.fit(X_train, y_train)
    preds = my_pipeline.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# Report the validation error for several candidate values of n.
for n in (1, 2, 5, 10, 50):
    print(f' n_iter_no_change = {n} o MAE fica = {ajuste(n)}')

 n_iter_no_change = 1 o MAE fica = 0.2011173184357542
 n_iter_no_change = 2 o MAE fica = 0.17318435754189945
 n_iter_no_change = 5 o MAE fica = 0.1787709497206704
 n_iter_no_change = 10 o MAE fica = 0.17318435754189945
 n_iter_no_change = 50 o MAE fica = 0.21787709497206703


In [None]:
# Note: I must NOT change:
#   max_leaf_nodes
#   n_jobs
#   min_samples_leaf
#   min_samples_split
#   max_depth
# (Kept as comments: written as bare names, these lines raise NameError
# when the cell is executed.)


In [None]:
# Random-forest classifier with a fixed seed for reproducible fits.
model = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

In [None]:
# Chain the preprocessing step and the estimator into a single pipeline.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [None]:

# Fit the pipeline on the training split, then predict the validation split
# (preprocessing is applied by the pipeline in both steps).
preds = my_pipeline.fit(X_train, y_train).predict(X_valid)
# Score the validation predictions.
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

Prevendo o TEST

In [None]:
# Keep only the feature columns selected for training, dropping the rest.
test2 = test.loc[:, my_cols].copy()

# Predict on the test set with the fitted pipeline.
preds_test = my_pipeline.predict(test2)

Salvar para submeter

In [None]:
# Build the submission table (the save step below is left commented out).
salvar = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': preds_test,
})
#salvar.to_csv('submission_4.csv', index=False)
#print("Your submission was successfully saved!")

In [None]:
# Preview the first 10 rows of the submission table.
salvar.head(10)