# Import

In [2]:
import operator
import os
import pprint
import sys

import graphviz
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Image

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score

from sklearn.tree import export_graphviz

# Leitura dos Dados

In [3]:
# Load the training and test sets (semicolon-separated CSV files).
# Forward slashes are used so the relative paths resolve on Windows, Linux and macOS;
# the previous backslash-separated paths only resolved on Windows.
conjuntoTreinamento = pd.read_csv("conjunto_dados/falencia-treinamento.csv", delimiter=';')
x_test = pd.read_csv("conjunto_dados/falencia-teste.csv", delimiter=';')

# Drop the "Unnamed: 0" column from both frames
# (presumably a row index written when the CSVs were saved — TODO confirm).
conjuntoTreinamento = conjuntoTreinamento.drop(columns="Unnamed: 0")
x_test = x_test.drop(columns="Unnamed: 0")

# Split the training frame into the feature matrix and the "Resultado" target column.
x_train = conjuntoTreinamento.drop(columns="Resultado")
y_train = conjuntoTreinamento["Resultado"]

# Regressão Logística (Logistic Regression)

In [10]:
def printResultadosLogisticRegression(solver, penalty, tipoKfold, scoring, n_splits, x_train, y_train):
    """Cross-validate one LogisticRegression configuration and summarise its scores.

    Prints a one-line description of the configuration, then calls
    ``cross_val_score`` ten times with a shuffled splitter.

    Args:
        solver: Forwarded to ``LogisticRegression(solver=...)``.
        penalty: Forwarded to ``LogisticRegression(penalty=...)``; may be ``None``.
        tipoKfold: Splitter name, either ``'KFold'`` or ``'StratifiedKFold'``.
        scoring: Scoring name forwarded to ``cross_val_score``.
        n_splits: Number of folds used by the splitter.
        x_train: Training features.
        y_train: Training labels.

    Returns:
        A ``(mean, std)`` tuple computed over the per-fold scores, where each
        fold position has first been averaged across the ten repetitions.

    Raises:
        ValueError: If ``tipoKfold`` is not one of the supported splitter names.
    """
    # Fail fast with a clear message instead of an UnboundLocalError on `cv` further down.
    if tipoKfold not in ('KFold', 'StratifiedKFold'):
        raise ValueError("tipoKfold must be 'KFold' or 'StratifiedKFold', got " + repr(tipoKfold))

    # str(None) == "None", so a single message covers both the penalised and unpenalised case.
    print(">> Modelo : Logistic Regression " + "; Solver : " + solver + " ; KFold : " + tipoKfold + " ; Penalidade : " + str(penalty) + " ; Scoring : " + scoring + " ; Splits : " + str(n_splits))

    # No fit/predict here: cross_val_score fits its own clones of the estimator, and the
    # previous predict on the notebook-global x_test was discarded (hidden-state dependency).
    modelLogistic = LogisticRegression(solver=solver, penalty=penalty)

    if tipoKfold == 'KFold':
        cv = KFold(n_splits=n_splits, shuffle=True)
    else:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True)

    # NOTE(review): the splitter has no random_state, so each repetition presumably draws a
    # different shuffle and runs are not reproducible — consider seeding it.
    vec = [cross_val_score(modelLogistic, x_train, y_train, cv=cv, scoring=scoring) for _ in range(10)]

    # Average each fold position over the repetitions, then summarise across folds.
    result = np.mean(vec, axis=0)

    return np.mean(result), np.std(result)

In [11]:
def logisticRegressionAllCombinations():
    """Evaluate every valid solver/penalty/splitter/fold-count combination.

    For each combination, ``printResultadosLogisticRegression`` is called with
    ``"f1"`` scoring on the notebook-level ``x_train`` / ``y_train``.

    Returns:
        A list of dicts with the keys ``"solver"``, ``"penalty"``, ``"kfold"``,
        ``"splits"`` and ``"result"`` (the value returned by the evaluation call).
    """
    # Penalties accepted by each solver. The previous chain of `if ... continue` checks mixed
    # `or` and `and` without parentheses, which skipped every 'newton-cg' and
    # 'newton-cholesky' combination.
    # 'elasticnet' is left out for 'saga' because it was noted as not working.
    penaltiesBySolver = {
        'lbfgs': ['l2', None],
        'liblinear': ['l1', 'l2'],
        'newton-cg': ['l2', None],
        'newton-cholesky': ['l2', None],
        'sag': ['l2', None],
        'saga': ['l1', 'l2', None],
    }
    kfolds = ["KFold", "StratifiedKFold"]
    splits = [10, 20]

    logisticRegressionResults = []
    for solver, penalties in penaltiesBySolver.items():
        for penalty in penalties:
            for kf in kfolds:
                for split in splits:
                    logisticRegressionResults.append({
                        "solver": solver,
                        "penalty": penalty,
                        "kfold": kf,
                        "splits": split,
                        "result": printResultadosLogisticRegression(solver, penalty, kf, "f1", split, x_train, y_train),
                    })
    return logisticRegressionResults

In [None]:
# Run the whole grid 10 times; each run is sorted by its "result" tuple (highest first)
# and pretty-printed to its own numbered text file.
pastaResultados = "resultados/logisticRegression"
# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs(pastaResultados, exist_ok=True)

for i in range(10):
    r = logisticRegressionAllCombinations()
    r.sort(key=operator.itemgetter("result"), reverse=True)
    with open(pastaResultados + "/logisticRegression" + str(i+1) + ".txt", "w") as file:
        # sort_dicts=False keeps the keys in insertion order.
        pprint.pprint(r, stream=file, sort_dicts=False)

# Árvore de Decisão (Decision Tree)

In [3]:
def printResultadosDecisionTree(criterion, splitter, max_depth, tipoKfold, scoring, n_splits, x_train, y_train):
    """Cross-validate one DecisionTreeClassifier configuration and summarise its scores.

    Prints a one-line description of the configuration, then calls
    ``cross_val_score`` ten times with a shuffled splitter.

    Args:
        criterion: Forwarded to ``DecisionTreeClassifier(criterion=...)``.
        splitter: Forwarded to ``DecisionTreeClassifier(splitter=...)``.
        max_depth: Forwarded to ``DecisionTreeClassifier(max_depth=...)``; may be ``None``.
        tipoKfold: Splitter name, either ``"KFold"`` or ``"StratifiedKFold"``.
        scoring: Scoring name forwarded to ``cross_val_score``.
        n_splits: Number of folds used by the splitter.
        x_train: Training features.
        y_train: Training labels.

    Returns:
        A ``(mean, std)`` tuple computed over the per-fold scores, where each
        fold position has first been averaged across the ten repetitions.
        Note that this is NOT a fitted estimator.

    Raises:
        ValueError: If ``tipoKfold`` is not one of the supported splitter names.
    """
    # Fail fast with a clear message instead of an UnboundLocalError on `cv` further down.
    if tipoKfold not in ("KFold", "StratifiedKFold"):
        raise ValueError("tipoKfold must be 'KFold' or 'StratifiedKFold', got " + repr(tipoKfold))

    print(">> Modelo : Decision Tree " +  "; Criterion : " + criterion + " ; Splitter : " + splitter + " ; Max_depth : " + str(max_depth) + " ; KFold : " + tipoKfold + " ; Scoring : " + scoring + " ; Splits : " + str(n_splits))

    # No fit/predict here: cross_val_score fits its own clones of the estimator, and the
    # previous predict on the notebook-global x_test was discarded (hidden-state dependency).
    modelDecisionTree = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth)

    if tipoKfold == "KFold":
        cv = KFold(n_splits=n_splits, shuffle=True)
    else:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True)

    # NOTE(review): the splitter has no random_state, so each repetition presumably draws a
    # different shuffle and runs are not reproducible — consider seeding it.
    vec = [cross_val_score(modelDecisionTree, x_train, y_train, cv=cv, scoring=scoring) for _ in range(10)]

    # Average each fold position over the repetitions, then summarise across folds.
    result = np.mean(vec, axis=0)

    return np.mean(result), np.std(result)

In [6]:
def decisionTreeAllCombinations():
    """Evaluate the full decision-tree hyper-parameter grid.

    Every combination of criterion, splitter, maximum depth, splitter type and
    fold count is passed to ``printResultadosDecisionTree`` with ``"f1"`` scoring
    on the notebook-level ``x_train`` / ``y_train``.

    Returns:
        A list with one dict per combination holding the keys ``"criterion"``,
        ``"splitter"``, ``"kfold"``, ``"max_depth"``, ``"splits"`` and ``"result"``.
    """
    criteria = ("gini", "entropy", "log_loss")
    strategies = ("best", "random")
    depths = (5, 10)
    validators = ("KFold", "StratifiedKFold")
    fold_counts = (10, 20)

    # The `for` clauses run left to right, i.e. in the same order as nested loops.
    return [
        {
            "criterion": crit,
            "splitter": strategy,
            "kfold": validator,
            "max_depth": depth,
            "splits": folds,
            "result": printResultadosDecisionTree(crit, strategy, depth, validator, "f1", folds, x_train, y_train),
        }
        for crit in criteria
        for strategy in strategies
        for depth in depths
        for validator in validators
        for folds in fold_counts
    ]

In [None]:
# Run the whole grid 10 times; each run is sorted by its "result" tuple (highest first)
# and pretty-printed to its own numbered text file.
pastaResultados = "resultados/decisionTree"
# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs(pastaResultados, exist_ok=True)

for i in range(10):
    r = decisionTreeAllCombinations()
    r.sort(key=operator.itemgetter("result"), reverse=True)
    with open(pastaResultados + "/decisionTree" + str(i+1) + ".txt", "w") as file:
        # sort_dicts=False keeps the keys in insertion order.
        pprint.pprint(r, stream=file, sort_dicts=False)

In [None]:
# printResultadosDecisionTree returns a (mean, std) score tuple, not an estimator, so the
# scores are kept separately and the tree to be drawn is fitted explicitly below.
resultadoDecisionTree = printResultadosDecisionTree(criterion="gini", splitter="best", max_depth = None, tipoKfold="KFold", scoring="accuracy", n_splits=10, x_train=x_train, y_train=y_train)

modelDecisionTree = DecisionTreeClassifier(criterion="gini", splitter="best", max_depth=None)
modelDecisionTree.fit(x_train, y_train)

# The directly imported export_graphviz is used rather than `tree.export_graphviz`, so this
# cell keeps working even if the name `tree` has been rebound elsewhere in the notebook.
# feature_names is passed as a plain list; NOTE(review): some scikit-learn releases appear
# to validate this parameter strictly and reject a pandas Index — confirm for your version.
dot_data = export_graphviz(modelDecisionTree, out_file=None,
                           feature_names=list(x_train.columns),
                           filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png")
display(graph)

# Árvores aleatórias (Random Forest)

In [18]:
def printResultadosRandomForest(n_estimators, criterion, max_depth, tipoKfold, scoring, n_splits, x_train, y_train):
    """Cross-validate one RandomForestClassifier configuration and summarise its scores.

    Prints a one-line description of the configuration, then calls
    ``cross_val_score`` ten times with a shuffled splitter.

    Args:
        n_estimators: Forwarded to ``RandomForestClassifier(n_estimators=...)``.
        criterion: Forwarded to ``RandomForestClassifier(criterion=...)``.
        max_depth: Forwarded to ``RandomForestClassifier(max_depth=...)``; may be ``None``.
        tipoKfold: Splitter name, either ``"KFold"`` or ``"StratifiedKFold"``.
        scoring: Scoring name forwarded to ``cross_val_score``.
        n_splits: Number of folds used by the splitter.
        x_train: Training features.
        y_train: Training labels.

    Returns:
        A ``(mean, std)`` tuple computed over the per-fold scores, where each
        fold position has first been averaged across the ten repetitions.
        Note that this is NOT a fitted estimator.

    Raises:
        ValueError: If ``tipoKfold`` is not one of the supported splitter names.
    """
    # Fail fast with a clear message instead of an UnboundLocalError on `cv` further down.
    if tipoKfold not in ("KFold", "StratifiedKFold"):
        raise ValueError("tipoKfold must be 'KFold' or 'StratifiedKFold', got " + repr(tipoKfold))

    print(">> Modelo : Random Forest " + "; Estimators : " + str(n_estimators) + " ; Criterion : " + criterion + " ; Max_depth : " + str(max_depth) + " ; KFold : " + tipoKfold + " ; Scoring : " + scoring + " ; Splits : " + str(n_splits))

    # No fit/predict here: cross_val_score fits its own clones of the estimator, and the
    # previous predict on the notebook-global x_test was discarded (hidden-state dependency).
    modelRandomForest = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)

    if tipoKfold == "KFold":
        cv = KFold(n_splits=n_splits, shuffle=True)
    else:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True)

    # NOTE(review): the splitter has no random_state, so each repetition presumably draws a
    # different shuffle and runs are not reproducible — consider seeding it.
    vec = [cross_val_score(modelRandomForest, x_train, y_train, cv=cv, scoring=scoring) for _ in range(10)]

    # Average each fold position over the repetitions, then summarise across folds.
    result = np.mean(vec, axis=0)

    return np.mean(result), np.std(result)

In [19]:
def randomForestAllCombinations():
    """Evaluate the full random-forest hyper-parameter grid.

    Every combination of estimator count, criterion, maximum depth, splitter
    type and fold count is passed to ``printResultadosRandomForest`` with
    ``'f1'`` scoring on the notebook-level ``x_train`` / ``y_train``.

    Returns:
        A list with one dict per combination holding the keys ``"estimator"``,
        ``"criterion"``, ``"kfold"``, ``"max_depth"``, ``"splits"`` and ``"result"``.
    """
    tree_counts = (100, 150)
    criteria = ('gini', 'log_loss')
    depths = (10, 15)
    validators = ('StratifiedKFold',)
    fold_counts = (10, 20)

    # The `for` clauses run left to right, i.e. in the same order as nested loops.
    return [
        {
            "estimator": n_trees,
            "criterion": crit,
            "kfold": validator,
            "max_depth": depth,
            "splits": folds,
            "result": printResultadosRandomForest(n_trees, crit, depth, validator, 'f1', folds, x_train, y_train),
        }
        for n_trees in tree_counts
        for crit in criteria
        for depth in depths
        for validator in validators
        for folds in fold_counts
    ]

In [None]:
# Run the whole grid 5 times; each run is sorted by its "result" tuple (highest first)
# and pretty-printed to its own numbered text file.
pastaResultados = "resultados/randomForest"
# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs(pastaResultados, exist_ok=True)

for i in range(5):
    r = randomForestAllCombinations()
    r.sort(key=operator.itemgetter("result"), reverse=True)
    with open(pastaResultados + "/randomForest" + str(i+1) + ".txt", "w") as file:
        # sort_dicts=False keeps the keys in insertion order.
        pprint.pprint(r, stream=file, sort_dicts=False)

In [None]:
# printResultadosRandomForest returns a (mean, std) score tuple, not an estimator, so the
# scores are kept separately and the forest whose trees are drawn is fitted explicitly below.
resultadoRandomForest = printResultadosRandomForest(100, 'gini', None, 'KFold', 'accuracy', 10, x_train, y_train)

modelRandomForest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None)
modelRandomForest.fit(x_train, y_train)

# Draw the first three trees of the forest.
for i in range(3):
    # Named `arvore` rather than `tree` so the imported sklearn `tree` module is not shadowed.
    arvore = modelRandomForest.estimators_[i]
    # feature_names is passed as a plain list; NOTE(review): some scikit-learn releases appear
    # to validate this parameter strictly and reject a pandas Index — confirm for your version.
    dot_data = export_graphviz(arvore,
                               feature_names=list(x_train.columns),
                               filled=True,
                               max_depth=20,
                               impurity=False,
                               proportion=True)
    graph = graphviz.Source(dot_data)
    display(graph)

# Testes de Subconjuntos

In [5]:
# Evaluation of the model that produced the results sent for submission.

# Attributes considered relevant: I1, I5, I7, I8, I9, I10, M1, M2, M3, M4, M5, M8, M10
# — every other attribute is removed from both the training and the test features.
colunasDescartadas = ["I2", "I3", "I4", "I6", "M6", "M7", "M9"]
nx_train = x_train.drop(colunasDescartadas, axis=1)
nx_test = x_test.drop(colunasDescartadas, axis=1)

modelRandomForest = RandomForestClassifier(n_estimators=150, criterion="log_loss", max_depth=15)
modelRandomForest.fit(nx_train, y_train)

cv = StratifiedKFold(n_splits=20, shuffle=True)

# (scoring name, report heading) pairs, evaluated in this order.
relatorios = (
    ("f1", "Média e desvio padrão do modelo com scoring=\"f1\""),
    ("f1_weighted", "Média e desvio padrão do modelo com scoring=\"f1_weighted\""),
)

for metrica, titulo in relatorios:
    # Ten repeated cross-validation runs; one row of fold scores per run.
    vec = [
        cross_val_score(modelRandomForest, nx_train, y_train, cv=cv, scoring=metrica)
        for _ in range(10)
    ]

    # Mean score of each run; the report is the mean and std of those run-level means.
    result = np.mean(vec, axis=1)

    print(titulo)
    print((np.mean(result), np.std(result)))
    print()

# Predict the test set with the model fitted above and write the submission file.
y_pred = modelRandomForest.predict(nx_test)

df = pd.DataFrame(y_pred, columns=["Resultado"])

df.to_csv("submissao.csv", sep=";")
print("Respostas do modelo para conjunto de teste:")
print(df)

Média e desvio padrão do modelo com scoring="f1"
(0.7174583194583194, 0.007528431223232198)

Média e desvio padrão do modelo com scoring="f1_weighted"
(0.8293554811626391, 0.004388782700671246)

Respostas do modelo para conjunto de teste:
    Resultado
0           1
1           0
2           0
3           1
4           0
..        ...
95          0
96          1
97          0
98          0
99          0

[100 rows x 1 columns]


In [None]:
# These are just additional experiments.
# Single-column drops that were tried earlier (previously left as commented-out code):
# M8, M6, I6, I3.

# Columns removed for the current experiment.
colunasDescartadas = ["I1", "I4", "I6", "I7", "I10", "M4", "M5", "M6", "M8"]

model = RandomForestClassifier(n_estimators=150, criterion='log_loss', max_depth=15)

nx_train = x_train.drop(colunasDescartadas, axis=1)
model.fit(nx_train, y_train)
print(model.feature_importances_)

nx_test = x_test.drop(colunasDescartadas, axis=1)
y_pred = model.predict(nx_test)

cv = StratifiedKFold(n_splits=20, shuffle=True)
# Cross-validate on the reduced feature set the model was fitted on. This previously used
# the full x_train, so the printed score did not describe the feature subset under test.
scoresModel = cross_val_score(model, nx_train, y_train, cv=cv, scoring='f1')

print(np.mean(scoresModel), np.std(scoresModel))

# Métricas

In [None]:
# Overview of the training frame: info() prints its summary, and describe() is the cell's
# last expression, so its table is what the notebook renders as the cell output.
conjuntoTreinamento.info()
conjuntoTreinamento.describe()

In [None]:
# Pairwise-relationship plot grid of the training frame, coloured by the "Resultado" column.
sns.pairplot(conjuntoTreinamento, hue="Resultado")