# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
np.random.seed(0)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, minmax_scale, MaxAbsScaler, StandardScaler, RobustScaler, Normalizer, QuantileTransformer, PowerTransformer
from sklearn.decomposition import PCA


In [None]:
# Read the competition CSVs in order: train.csv first, then test.csv.
trainData, testData = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
    )
)

# Preview the first rows of the training frame.
trainData.head()

In [None]:
# Work on a copy without the 'id' column; trainData itself is left untouched.
trainDataCopy = trainData.copy().drop(columns='id')

In [None]:
def score_dataset(X_train, X_valid, y_train, y_valid, est, lrate, model_type):
    """Fit an XGBoost model on the training split and return its validation ROC AUC.

    Parameters
    ----------
    X_train, y_train : features and target used to fit the model.
    X_valid, y_valid : features and target used to compute the score.
    est : value passed as ``n_estimators``.
    lrate : value passed as ``learning_rate``.
    model_type : ``'classifier'`` selects ``XGBClassifier``; any other value
        selects ``XGBRegressor``.

    Returns
    -------
    The value returned by ``roc_auc_score(y_valid, scores)``.

    Notes
    -----
    For the classifier the score is computed from the positive-class
    probability (column 1 of ``predict_proba``), not from ``predict``:
    hard 0/1 labels collapse the ranking that ROC AUC measures. Taking
    column 1 assumes a binary target.
    """
    is_classifier = model_type == 'classifier'
    model_cls = XGBClassifier if is_classifier else XGBRegressor
    model = model_cls(n_estimators=est, learning_rate=lrate, n_jobs=4, tree_method='gpu_hist')

    model.fit(X_train, y_train)

    if is_classifier:
        # Continuous scores are required for a meaningful AUC.
        preds = model.predict_proba(X_valid)[:, 1]
    else:
        # The regressor output is already a continuous score.
        preds = model.predict(X_valid)

    return roc_auc_score(y_valid, preds)

# PCA sem a feature n_missing e todos os componentes


In [None]:
# Target is the 'claim' column; X holds every other column of trainDataCopy.
y = trainDataCopy['claim']
X = trainDataCopy.drop(columns=['claim'])

In [None]:
# Hold out the validation rows BEFORE fitting any preprocessing, so the imputer
# medians, the quantile mapping and the principal axes are learned from the
# training rows only. Fitting them on the full matrix leaks validation
# information into the reported score.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3,
                                                              random_state=0)

imputer = SimpleImputer(strategy='median')
scl = QuantileTransformer(output_distribution='uniform')  # transformation that gave the best results
pca = PCA()  # no n_components given: all components are kept

# Fit on the training split, then apply the fitted transformers to the validation split.
train_pca = pca.fit_transform(scl.fit_transform(imputer.fit_transform(X_train_raw)))
valid_pca = pca.transform(scl.transform(imputer.transform(X_valid_raw)))

component_names = [f"PC{i+1}" for i in range(train_pca.shape[1])]
X_train = pd.DataFrame(train_pca, columns=component_names, index=X_train_raw.index)
X_valid = pd.DataFrame(valid_pca, columns=component_names, index=X_valid_raw.index)

print(score_dataset(X_train, X_valid, y_train, y_valid, 1500, 0.02, 'regressor'))

In [None]:
# A notebook cell only displays its last expression, so the cumulative
# explained-variance curve is printed explicitly instead of being discarded.
print(np.cumsum(pca.explained_variance_ratio_))
pca.n_components_

# PCA sem a feature n_missing e variância acumulada de 0.95


In [None]:
# Rebuild the inputs for this experiment: 'claim' is the target,
# all remaining columns of trainDataCopy are the features.
y = trainDataCopy['claim']
X = trainDataCopy.drop(columns=['claim'])

In [None]:
# Hold out the validation rows BEFORE fitting any preprocessing, so the imputer
# medians, the quantile mapping and the principal axes are learned from the
# training rows only. Fitting them on the full matrix leaks validation
# information into the reported score.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3,
                                                              random_state=0)

imputer = SimpleImputer(strategy='median')
scl = QuantileTransformer(output_distribution='uniform')  # transformation that gave the best results
pca = PCA(0.95)  # keep enough components to reach 0.95 cumulative explained variance

# Fit on the training split, then apply the fitted transformers to the validation split.
train_pca = pca.fit_transform(scl.fit_transform(imputer.fit_transform(X_train_raw)))
valid_pca = pca.transform(scl.transform(imputer.transform(X_valid_raw)))

component_names = [f"PC{i+1}" for i in range(train_pca.shape[1])]
X_train = pd.DataFrame(train_pca, columns=component_names, index=X_train_raw.index)
X_valid = pd.DataFrame(valid_pca, columns=component_names, index=X_valid_raw.index)

print(score_dataset(X_train, X_valid, y_train, y_valid, 1500, 0.02, 'regressor'))

In [None]:
# A notebook cell only displays its last expression, so the cumulative
# explained-variance curve is printed explicitly instead of being discarded.
print(np.cumsum(pca.explained_variance_ratio_))
pca.n_components_

# PCA com a feature n_missing e todos os componentes


In [None]:
# 'claim' is the target. The features are the remaining columns plus
# n_missing: the per-row count of null cells over all trainDataCopy columns.
y = trainDataCopy['claim']
X = trainDataCopy.drop(columns=['claim']).assign(
    n_missing=trainDataCopy.isnull().sum(axis=1)
)


In [None]:
# Hold out the validation rows BEFORE fitting any preprocessing, so the imputer
# medians, the quantile mapping and the principal axes are learned from the
# training rows only. Fitting them on the full matrix leaks validation
# information into the reported score.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3,
                                                              random_state=0)

imputer = SimpleImputer(strategy='median')
scl = QuantileTransformer(output_distribution='uniform')  # transformation that gave the best results
pca = PCA()  # no n_components given: all components are kept

# Fit on the training split, then apply the fitted transformers to the validation split.
train_pca = pca.fit_transform(scl.fit_transform(imputer.fit_transform(X_train_raw)))
valid_pca = pca.transform(scl.transform(imputer.transform(X_valid_raw)))

component_names = [f"PC{i+1}" for i in range(train_pca.shape[1])]
X_train = pd.DataFrame(train_pca, columns=component_names, index=X_train_raw.index)
X_valid = pd.DataFrame(valid_pca, columns=component_names, index=X_valid_raw.index)

print(score_dataset(X_train, X_valid, y_train, y_valid, 1500, 0.02, 'regressor'))

In [None]:
# A notebook cell only displays its last expression, so the cumulative
# explained-variance curve is printed explicitly instead of being discarded.
print(np.cumsum(pca.explained_variance_ratio_))
pca.n_components_

# PCA com a feature n_missing e variância acumulada de 0.95

In [None]:
# Same inputs as the previous experiment: features without 'claim', plus the
# per-row null count (taken over every trainDataCopy column) as n_missing.
null_count_per_row = trainDataCopy.isnull().sum(axis=1)
y = trainDataCopy['claim']
X = trainDataCopy.drop(columns=['claim']).assign(n_missing=null_count_per_row)


In [None]:
# Hold out the validation rows BEFORE fitting any preprocessing, so the imputer
# medians, the quantile mapping and the principal axes are learned from the
# training rows only. Fitting them on the full matrix leaks validation
# information into the reported score.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3,
                                                              random_state=0)

imputer = SimpleImputer(strategy='median')
scl = QuantileTransformer(output_distribution='uniform')  # transformation that gave the best results
pca = PCA(0.95)  # keep enough components to reach 0.95 cumulative explained variance

# Fit on the training split, then apply the fitted transformers to the validation split.
train_pca = pca.fit_transform(scl.fit_transform(imputer.fit_transform(X_train_raw)))
valid_pca = pca.transform(scl.transform(imputer.transform(X_valid_raw)))

component_names = [f"PC{i+1}" for i in range(train_pca.shape[1])]
X_train = pd.DataFrame(train_pca, columns=component_names, index=X_train_raw.index)
X_valid = pd.DataFrame(valid_pca, columns=component_names, index=X_valid_raw.index)

print(score_dataset(X_train, X_valid, y_train, y_valid, 1500, 0.02, 'regressor'))

In [None]:
# A notebook cell only displays its last expression, so the cumulative
# explained-variance curve is printed explicitly instead of being discarded.
print(np.cumsum(pca.explained_variance_ratio_))
pca.n_components_

# Arquivo de submissão da melhor abordagem

In [None]:
# Features of the selected approach: every column except 'claim', plus the
# per-row null count (taken over all trainDataCopy columns) as n_missing.
y = trainDataCopy['claim']
X = trainDataCopy.drop(columns=['claim'])
X['n_missing'] = trainDataCopy.isnull().sum(axis=1)

# Median-impute the features, keeping the original column labels.
imputer = SimpleImputer(strategy='median')
imputed_X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Quantile-transform to a uniform distribution (the transformation that gave
# the best results), then project onto all principal components.
scl = QuantileTransformer(output_distribution='uniform')
pca = PCA()
X_scaled = scl.fit_transform(imputed_X)
X_pca = pca.fit_transform(X_scaled)

# Label the components PC1, PC2, ...
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

# 70/30 split with a fixed seed, then report the validation ROC AUC.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_pca, y, train_size=0.7, test_size=0.3, random_state=0
)

print(score_dataset(X_train, X_valid, y_train, y_valid,
                    est=1500, lrate=0.02, model_type='regressor'))

In [None]:
# Test features: drop 'id' and append n_missing, the per-row count of null
# cells in testData.
testDataCopy = testData.drop(columns='id').assign(
    n_missing=testData.isnull().sum(axis=1)
)

In [None]:
# Reuse the imputer, scaler and PCA that were fitted on the training features.
# Re-fitting them on the test set would learn different medians, quantiles and
# principal axes, so the test components would not live in the feature space
# the model was trained on.
assert list(testDataCopy.columns) == list(X.columns), \
    "test features must match the training features (same names, same order)"

imputed_X_test = pd.DataFrame(imputer.transform(testDataCopy), columns=testDataCopy.columns)

X_test_scaled = scl.transform(imputed_X_test)
X_test_pca = pca.transform(X_test_scaled)
component_names = [f"PC{i+1}" for i in range(X_test_pca.shape[1])]
X_test_pca = pd.DataFrame(X_test_pca, columns=component_names)

In [None]:
# Fit the regressor on the training split and predict the PCA-transformed test set.
model = XGBRegressor(
    n_estimators=1500,
    learning_rate=0.02,
    n_jobs=4,
    tree_method='gpu_hist',
).fit(X_train, y_train)
preds = model.predict(X_test_pca)

# Index the predictions by the integer test ids, write the submission CSV and
# display the series as the cell output.
series = pd.Series(preds, name='claim', index=testData['id'].astype('int'))
series.to_csv('output.csv')
series