Eliminar outliers

In [None]:
def outliers_IQR(data, col):
    """Return the rows of ``data`` whose ``col`` value lies inside the 1.5*IQR fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        Rows with ``Q1 - 1.5*IQR <= data[col] <= Q3 + 1.5*IQR`` (both bounds
        inclusive). Rows where ``col`` is missing are dropped because
        ``Series.between`` evaluates to False for them.
    """
    q1, q3 = (data[col].quantile(q) for q in (0.25, 0.75))
    fence = 1.5 * (q3 - q1)
    inside = data[col].between(q1 - fence, q3 + fence)
    return data[inside]

def outliers_zsocre(data, col, threshold=3):
    """Return the rows of ``data`` whose ``col`` z-score is within ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column the z-score is computed on.
    threshold : float, default 3
        Largest absolute z-score that is still kept (inclusive).

    Returns
    -------
    pandas.DataFrame
        Rows with ``|z| <= threshold``.

    Notes
    -----
    Any row whose z-score is NaN fails the ``<=`` comparison and is dropped.
    """
    import numpy as np
    # `stats` was referenced but never imported; import it next to numpy so the
    # function no longer depends on whatever happens to be in the kernel.
    from scipy import stats

    z = np.abs(stats.zscore(data[col]))
    return data[z <= threshold]

Remover constant/duplicate columns

In [None]:
# Remove constant columns from `train`.
# A column counts as constant when it has at most one distinct non-missing
# value. Counting distinct values (instead of testing `std() == 0`) also works
# for non-numeric columns and is not affected by floating-point round-off in
# the standard deviation.
colRemove = [col for col in train.columns if train[col].nunique() <= 1]
train.drop(colRemove, axis=1, inplace=True)

# Remove columns of `train` whose values are identical to an earlier column,
# keeping the first occurrence.
columns = train.columns
duplicated = set()
for i in range(len(columns)-1):
    # A column already flagged as a copy of an earlier one cannot reveal new
    # duplicates: anything equal to it is equal to that earlier column too.
    if columns[i] in duplicated:
        continue
    v = train[columns[i]].values
    for j in range(i+1,len(columns)):
        if columns[j] in duplicated:
            continue
        # np.array_equal treats NaN as unequal to NaN, so columns containing
        # missing values are never reported as duplicates.
        if np.array_equal(v,train[columns[j]].values):
            duplicated.add(columns[j])
# Each label appears once, in the original column order.
colRemove = [c for c in columns if c in duplicated]
train.drop(colRemove, axis=1, inplace=True)

Criar novas features usando o featuretools

In [None]:
from sklearn.datasets import load_iris
import pandas as pd

# Build a DataFrame from the iris feature matrix, one column per feature name.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

import featuretools as ft

# Wrap the frame in an EntitySet as its only entity, called 'data'.
# make_index=True asks featuretools to create the 'index' key column.
# NOTE(review): `entity_from_dataframe` / `target_entity` look like the
# pre-1.0 featuretools API - confirm against the installed version.
es = ft.EntitySet(id='iris')
es.entity_from_dataframe(
    dataframe=df,
    entity_id='data',
    index='index',
    make_index=True,
)

# Deep feature synthesis on the 'data' entity with the requested
# transformation and aggregation primitives.
feature_matrix, feature_defs = ft.dfs(
    target_entity='data',
    entityset=es,
    agg_primitives=["mean", "max", "min", "std", "skew"],
    trans_primitives=['add_numeric'],
)

print(feature_matrix.head())
print(feature_defs)

Label Encoder

In [28]:
# do label encoder in Day_of_week feature
from sklearn.preprocessing import LabelEncoder

# Replace the Day_of_week labels with the integer codes assigned by LabelEncoder.
# The fitted encoder stays in `label` so the codes can be mapped back later.
label = LabelEncoder()
train['Day_of_week'] = label.fit_transform(train['Day_of_week'])

Normalization

In [32]:
def normalizer(df, cols):
    """Return a copy of ``df`` with ``cols`` passed through sklearn's ``Normalizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the result is a copy.
    cols : str or list-like of str
        Column name, or collection of column names (list, tuple, Index),
        to transform.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which the selected columns hold the transformed values.

    Notes
    -----
    ``Normalizer`` rescales each *row* (sample) of the selected columns to unit
    norm; it does not standardise each column independently.
    """
    from sklearn.preprocessing import Normalizer

    # A bare string means one column; anything else is taken as a collection
    # of names. The original `df[[cols]]` only worked for the bare-string case.
    selected = [cols] if isinstance(cols, str) else list(cols)

    scaler = Normalizer()  # was misspelled `Nomalizer`, which raised NameError
    # Work on a copy so the caller's frame (often a train/test slice) is not
    # modified behind its back.
    result = df.copy()
    result[selected] = scaler.fit_transform(result[selected])

    return result

Pivot Table

In [None]:
# Template: pivot `df` so that col_name1 becomes the row key, each distinct
# col_name2 value becomes a column, and cells hold the aggregated col_name3
# (pivot_table's default aggregation). Empty cells are filled with 0 and the
# row key is moved back into a regular column.
df_pivot = (
    df.pivot_table(index=['col_name1'], columns=['col_name2'], values='col_name3')
    .fillna(0)
    .reset_index()
)

# Feature Selection

Fazendo a correlação

In [None]:
# Correlation of every feature with the target variable, ranked by magnitude.
# 'corr' keeps the signed coefficient; 'corr_abs' is its absolute value and is
# used only for ordering.
feature_corr = train.corr()['TARGET'].to_frame(name='corr')
feature_corr['corr_abs'] = feature_corr['corr'].abs()
feature_corr = feature_corr.sort_values(by='corr_abs', ascending=False)
feature_corr.head()

Eliminando features com correlação maior que 0.9

In [None]:
def remove_predictor_correlated(data, threshold=0.9):
    """Drop predictors that are highly correlated with an earlier column.

    For every pair of numeric columns ``(i, j)`` with ``i`` before ``j``,
    column ``j`` is dropped when their correlation is ``>= threshold``.
    Only positive correlation is tested; strongly negative pairs are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of predictors; it is not modified.
    threshold : float, default 0.9
        Correlation at or above which the later column of a pair is removed.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the dropped columns. Non-numeric columns take no part
        in the correlation and are always kept.

    Notes
    -----
    Prints the number of columns before and after the pruning.
    """
    print(f'Initial features: {len(data.columns)}')

    # Correlate numeric/boolean columns only, and index the keep-mask by the
    # correlation matrix's own labels. The original applied a mask sized from
    # `corr` to `data.columns`, which misaligns as soon as `data` contains a
    # column that `corr()` leaves out.
    corr = data.select_dtypes(include=[np.number, 'bool']).corr()
    n_cols = corr.shape[0]
    keep = np.full((n_cols,), True, dtype=bool)

    for i in range(n_cols):
        for j in range(i+1, n_cols):
            if corr.iloc[i, j] >= threshold:
                keep[j] = False

    data = data.drop(columns=corr.columns[~keep])

    print(f'Final features: {len(data.columns)}')

    return data

# Prune the correlated predictors. X is overwritten with the reduced frame, so
# re-running this cell operates on the already-pruned data.
X = remove_predictor_correlated(X)

Usando o AUC score

In [None]:
# Search settings
numFeaturesInCombination = 5
numCombinations = 400
numBestSingleFeaturesToSelectFrom = 20

# Candidate pool: the first `numBestSingleFeaturesToSelectFrom` entries of the
# first column of `singleFeatureTable` (the original slice stopped one row
# early and used only 19 of them).
# NOTE(review): presumably that table is sorted best-first with the feature
# name in column 0 - confirm where it is built.
featuresToUse = singleFeatureTable.iloc[0:numBestSingleFeaturesToSelectFrom,0]
featureColumnNames = ['feature'+str(x+1) for x in range(numFeaturesInCombination)]
# One row per random combination: the feature names plus the AUC obtained.
featureCombinationsTable = pd.DataFrame(index=range(numCombinations), 
                                        columns=featureColumnNames + ['combinedAUC'])

for combination in range(numCombinations):
    # Draw distinct positions into the candidate pool.
    randomSelectionOfFeatures = sorted(np.random.choice(len(featuresToUse), numFeaturesInCombination, 
                                                        replace=False))
    print(f'randomSelectionOfFeatures: {randomSelectionOfFeatures}')
    
    # The drawn numbers are positions, so look them up positionally; label
    # lookup only worked while the Series index happened to be 0..n-1.
    combinationFeatureNames = [featuresToUse.iloc[x] for x in randomSelectionOfFeatures]
    print(f'combinationFeatureNames: {combinationFeatureNames}')
    # Write the names into this combination's row. `table[row, col] = value`
    # would instead create a new column keyed by the tuple (row, col).
    featureCombinationsTable.loc[combination, featureColumnNames] = combinationFeatureNames

    # Select the chosen feature columns.
    # NOTE(review): assumes X_train / X_valid are DataFrames whose columns
    # carry these feature names - confirm.
    trainInputFeatures = X_train.loc[:, combinationFeatureNames]
    validInputFeatures = X_valid.loc[:, combinationFeatureNames]
    
    # Train the model on this combination.
    modelCombination.fit(trainInputFeatures, y_train)
    
    # Score on the validation split and store the result.
    # NOTE(review): `auc` is called as (y_true, y_score); that matches
    # sklearn.metrics.roc_auc_score, not sklearn.metrics.auc (which takes
    # curve x/y points) - confirm which function `auc` is bound to.
    validAUC = auc(y_valid, modelCombination.predict_proba(validInputFeatures)[:,1])        
    featureCombinationsTable.loc[combination,'combinedAUC'] = validAUC

validAUC = np.array(featureCombinationsTable.loc[:,'combinedAUC'], dtype=float)
print("(min,max) AUC = (%.4f,%.4f)." % (validAUC.min(),validAUC.max()))

# Rank the combinations from highest to lowest AUC and renumber the rows.
featureCombinationsTable = (
    featureCombinationsTable
    .sort_values(by='combinedAUC', axis=0, ascending=False)
    .reset_index(drop=True)
)
# The 20 best combinations.
featureCombinationsTable.iloc[:20,:]

# Feature names of the top 5 combinations (every column except the last one).
feature = [list(featureCombinationsTable.iloc[i,:-1].values) for i in range(5)]
feature

Usando o SelectKBest

In [None]:
# Sweep the number of selected features k with SelectKBest and record the
# hold-out errors of a Lasso model trained on the selected columns.
X = inpvar
y = outvar
# Fixed 80/20 split (random_state=42) so every k is scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, f_classif
from sklearn.linear_model import Lasso

# One dict of error metrics (plus the k that produced it) per iteration.
k_vs_score = []

# k = 2, 4, ... up to and including the total number of columns when that is even.
for k in range(2,X.shape[1]+1,2):
    selector = SelectKBest(score_func=f_regression, k=k) # use f_classif for classification models

    # Fit the selector on the training split only, then apply it to both splits.
    Xtrain2 = selector.fit_transform(X_train, y_train)
    Xval2 = selector.transform(X_test)

    # NOTE(review): the `normalize` argument was removed from Lasso in recent
    # scikit-learn releases - confirm the installed version accepts it.
    mdl = Lasso(alpha=0.08885877289587728, normalize=True)
  
    mdl.fit(Xtrain2, y_train)

    y_pred = mdl.predict(Xval2)

    # NOTE(review): `metricas` must be the regression version (returns a dict
    # with rmse/rmsle/mae/medae); the notebook defines two functions with this
    # name, so the result depends on which cell ran last - confirm.
    errors = metricas(y_test,y_pred)
    errors['k'] = k

    print(errors)

    k_vs_score.append(errors)

# One row per k, indexed by k.
k_errors = pd.DataFrame(k_vs_score)
k_errors.set_index('k', inplace = True)

# Plot the error curves: every metric column except the last one, then 'medae'.
g1 = k_errors.iloc[:,:-1].plot(figsize=(10,7))
g2 = k_errors['medae'].plot(figsize=(10,7))

In [None]:
# Refit the selector with the chosen k and keep the names of the selected columns.
# NOTE(review): `best_k` is not assigned in any visible cell - set it from the
# error curves above before running this cell.
selector = SelectKBest(k=best_k, score_func=f_regression)
selector.fit(X_train, y_train)
inpvar_selected_columns = X_train.columns[selector.get_support()]

Usando o SelectFromModel

In [None]:
# Sweep the number of selected features k with SelectFromModel (Lasso-based)
# and record the hold-out errors of a Lasso model trained on the selection.
X = inpvar
y = outvar
# Fixed 80/20 split (random_state=42) so every k is scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

# One dict of error metrics (plus the k that produced it) per iteration.
k_vs_score = []

# k = 2, 4, ...; the range stops before X_train.shape[1], so the full feature
# set is never evaluated here.
for k in range(2, X_train.shape[1], 2):
    
    # NOTE(review): the `normalize` argument was removed from Lasso in recent
    # scikit-learn releases - confirm the installed version accepts it.
    selector_model = Lasso(alpha=0.08885877289587728, normalize=True)
    # threshold=-np.inf together with max_features=k: presumably selection is
    # then driven by max_features alone - confirm in the SelectFromModel docs.
    selector = SelectFromModel(selector_model, max_features=k, threshold=-np.inf)

    selector.fit(X_train, y_train)

    Xtrain2 = selector.transform(X_train)
    Xval2 = selector.transform(X_test)

    mdl = Lasso(alpha=0.08885877289587728, normalize=True)

    mdl.fit(Xtrain2, y_train)

    y_pred = mdl.predict(Xval2)

    # Negative predictions are replaced by their absolute value.
    # NOTE(review): presumably so the log-based metric inside `metricas` does
    # not reject them; this changes the predictions being scored - confirm
    # this is intended.
    for i in range(len(y_pred)):
        if y_pred[i] < 0:
            y_pred[i] = abs(y_pred[i])

    # NOTE(review): `metricas` must be the regression version (returns a dict
    # containing 'medae'); the notebook defines two functions with this name.
    errors = metricas(y_test,y_pred)
    errors['k'] = k

    print(errors)

    k_vs_score.append(errors)

# One row per k, indexed by k.
k_errors = pd.DataFrame(k_vs_score)
k_errors.set_index('k', inplace = True)

# Plot every metric column except the last one.
k_errors.iloc[:,:-1].plot(figsize=(10,7))

print(f'min medae: {k_errors.medae.min()}')

In [None]:
# Refit the selector with the best k and keep the names of the selected columns.
# The original reused `k`, which after the sweep loop holds the *last* value
# tried rather than the best one. `k_errors` is indexed by k, so the label of
# the smallest median absolute error is the best k.
best_k = int(k_errors['medae'].idxmin())
selector = SelectFromModel(selector_model, max_features=best_k, threshold=-np.inf)
selector.fit(X_train, y_train)
inpvar_selected_columns = X_train.columns[selector.get_support()]

Usando decision tree

In [40]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on all predictors of 'lights' and keep only the features
# whose importance reaches the cut-off.
X = train.drop('lights', axis=1)
y = train[['lights']]

model = DecisionTreeRegressor()
model.fit(X, y)

# Importance of each feature, in the same order as X.columns.
importance = model.feature_importances_

# Keep the important features. The original mask was `importance < 0.03`,
# which kept exactly the features that were meant to be discarded.
features = X.columns
im = importance >= 0.03
cols = list(features[im].values)
# The target column must survive the selection.
cols.append('lights')
# `train` is overwritten with the reduced frame, so this cell is not
# idempotent: re-running it selects from the already-reduced columns.
train = train[cols]

DecisionTreeRegressor()

# Machine Learning

### Modelos de Classificação

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
def metricas(y_val, y_pred):
    """Compute confusion-matrix based metrics for a binary classifier.

    Parameters
    ----------
    y_val : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        ``'Confusion Matrix'`` (the 2x2 array), ``'Sensibility'``
        (tp / (tp + fn)), ``'Accuracy'`` and ``'Precision'`` (tp / (tp + fp)).

    Notes
    -----
    The matrix is unpacked into exactly four cells, so only two-class problems
    are supported. A later cell of this notebook defines a regression function
    with the same name; whichever cell ran last wins.
    """
    from sklearn.metrics import confusion_matrix  # was `sklearn.metric`

    # The original passed an undefined `y_cv` instead of the `y_val` argument.
    conf = confusion_matrix(y_val, y_pred)
    tn, fp, fn, tp = conf.ravel()
    # Parenthesised: `tp / tp+fn` evaluated as (tp / tp) + fn.
    sens = tp / (tp + fn)
    acc = (tp + tn) / (tn + fp + fn + tp)
    prec = tp / (tp + fp)

    return {'Confusion Matrix': conf, 'Sensibility': sens, 'Accuracy': acc, 'Precision': prec}

In [None]:
def class_methods(method, model, data, params=None):
    """Train and evaluate one classifier on an 80/20 split of ``data``.

    Parameters
    ----------
    method : str
        Label printed with the results; not used otherwise.
    model : estimator
        Unfitted scikit-learn style classifier.
    data : pandas.DataFrame
        Frame containing the predictors and a ``'TARGET'`` column.
    params : dict, optional
        Hyper-parameter grid. When given (and non-empty) the model is tuned
        with ``GridSearchCV`` on the training split and the best estimator is
        evaluated; otherwise ``model`` is fitted as is.

    Returns
    -------
    dict
        The metrics returned by ``metricas`` plus ``'Parametros'`` (the model's
        parameters), ``'Modelos'`` (the fitted model) and ``'Previsao'`` (the
        predictions for the validation split).

    Notes
    -----
    The split is not seeded, so results differ between runs.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split

    X = data.drop('TARGET', axis=1)
    # 1-D target: estimators expect a vector, not a one-column frame.
    y = data['TARGET']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

    if params:
        search = GridSearchCV(model, params)
        search.fit(X_train, y_train)
        model = search.best_estimator_
    else:
        model.fit(X_train, y_train)

    p = model.predict(X_val)
    param = model.get_params()
    results = metricas(y_val, p)

    print(method)
    print(results)

    results['Parametros'] = param
    results['Modelos'] = model
    results['Previsao'] = p

    return results

In [None]:
# Run every classifier once and collect the results in a single spreadsheet.
outliers = 'None' # None, ZScore, IQR, ZScore&IQR
obs = ''
# To append to a previous run, start from the saved file instead:
#overall_results = pd.read_excel('result.xlsx')
overall_results = pd.DataFrame()

# Method label -> estimator class; each run gets a fresh, default-configured model.
model_factories = {
    'logistica_reg': LogisticRegression,
    'naiveBayes': GaussianNB,
    'randomForest': RandomForestClassifier,
    'svc': SVC,
    'lgbm': LGBMClassifier,
    'xgb': XGBClassifier,
}
methods = list(model_factories)

# Collect one record per method and build the frame once, instead of growing a
# DataFrame with concat inside the loop.
records = []
for method in methods:
    model = model_factories[method]()

    # The original passed an extra, undefined `params` argument here.
    results = class_methods(method, model, train)

    # Run metadata first, then the metrics returned by class_methods.
    # (The original built a DataFrame from scalars only, which pandas rejects,
    # and tried to concat it with the plain `results` dict.)
    records.append({'metodo': method,
                    'outliers': outliers,
                    'observações': obs,
                    **results})

overall_results = pd.concat([overall_results, pd.DataFrame(records)], ignore_index=True)

# NOTE(review): the records hold arrays, dicts and model objects; confirm they
# are written to Excel in a usable form.
overall_results.to_excel('result.xlsx',index=False)

### Modelos de Regressão

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
def metricas(y_test, y_pred):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    dict
        ``'rmse'`` (root mean squared error), ``'rmsle'`` (root mean squared
        log error), ``'mae'`` (mean absolute error) and ``'medae'`` (median
        absolute error), in that order.
    """
    import numpy as np
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import median_absolute_error
    from sklearn.metrics import mean_squared_log_error

    squared = mean_squared_error(y_test, y_pred)
    squared_log = mean_squared_log_error(y_test, y_pred)

    scores = {}
    scores['rmse'] = np.sqrt(squared)
    scores['rmsle'] = np.sqrt(squared_log)
    scores['mae'] = mean_absolute_error(y_test, y_pred)
    scores['medae'] = median_absolute_error(y_test, y_pred)
    return scores

In [None]:
def reg_methods(method, model, data, norm=False):
    """Train and evaluate one regressor on an 80/20 split of ``data``.

    Parameters
    ----------
    method : str
        Label printed with the results; not used otherwise.
    model : estimator
        Unfitted scikit-learn style regressor.
    data : pandas.DataFrame
        Frame containing the predictors and a ``'TARGET'`` column.
    norm : bool, default False
        When truthy, the predictors of both the training and the validation
        split are passed through ``normalizer`` before fitting and predicting.
        (The original read an undefined global ``norm`` and transformed the
        training split only.)

    Returns
    -------
    dict
        The error metrics returned by ``metricas`` for the validation split.

    Notes
    -----
    Also prints the mean 5-fold ``cross_val_score`` of ``model`` on the full,
    un-normalised data. The split is not seeded, so results differ between runs.
    """
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import train_test_split

    X = data.drop('TARGET', axis=1)
    # 1-D target: estimators expect a vector, not a one-column frame.
    y = data['TARGET']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

    if norm:
        cols = X_train.columns[:]
        X_train = normalizer(X_train, cols)
        # The model must see validation data in the same scale it was trained on.
        X_val = normalizer(X_val, cols)

    model.fit(X_train, y_train)
    p = model.predict(X_val)
    param = model.get_params()
    results = metricas(y_val, p)
    results_cross = cross_val_score(model, X, y, cv=5)

    print(method)
    print(results)
    print('Cross-Validation', np.mean(results_cross))

    return results

In [None]:
# Run every regressor once and collect the results in a single spreadsheet.
outliers = 'None' # None, ZScore, IQR, ZScore&IQR
obs = ''
NORM = 0  # truthy -> ask reg_methods to normalise the predictors
# To append to a previous run, start from the saved file instead:
#overall_results = pd.read_excel('result.xlsx')
overall_results = pd.DataFrame()

# Method label -> zero-argument factory returning a fresh model.
# The original if/elif chain assigned three of the models to `reg` instead of
# `model`, so 'random_forest', 'decision_tree' and 'elastic_net' silently
# re-used whatever model the previous iteration had left in `model`.
model_factories = {
    'linear_regression': LinearRegression,
    'lasso': Lasso,
    'ridge': Ridge,
    'random_forest': RandomForestRegressor,
    'decision_tree': DecisionTreeRegressor,
    'gradient_boosting': GradientBoostingRegressor,
    'xgboost': lambda: XGBRegressor(objective="reg:squarederror"),
    'elastic_net': ElasticNet,
}
methods = list(model_factories)

# Collect one record per method and build the frame once, instead of growing a
# DataFrame with concat inside the loop.
records = []
for method in methods:
    model = model_factories[method]()

    results = reg_methods(method,model,train,NORM)

    results['Metodo'] = method
    results['Observaoções'] = obs
    results['Outliers'] = outliers

    records.append(results)

overall_results = pd.concat([overall_results, pd.DataFrame(records)], ignore_index=True)

overall_results.to_excel('result.xlsx',index=False)

### Modelos de Aprendizagem Não Supervisionada