In [None]:
# Set the IPython completer's `use_jedi` option to False for this kernel session.
%config Completer.use_jedi = False

In [None]:
from pathlib import Path
import importlib.util

import logging
logging.getLogger().setLevel(logging.INFO)

from sklearn import tree
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, balanced_accuracy_score
from sklearn_pandas import DataFrameMapper

import pandas as pd

import predictsignauxfaibles.models
from predictsignauxfaibles.data import SFDataset
from predictsignauxfaibles.config import OUTPUT_FOLDER, IGNORE_NA
from predictsignauxfaibles.pipelines import run_pipeline
from predictsignauxfaibles.utils import load_conf

In [None]:
# Name of the configuration to load. The resulting `conf` object is where the
# notebook reads TRAIN_DATASET, TEST_DATASET and TRANSFO_PIPELINE from.
CONF_NAME = "default"
conf = load_conf(CONF_NAME)

In [None]:
# Number of observations requested for each dataset.
# Written as an integer (not the float literal 1e4) because it is a row count;
# presumably it is forwarded to the database sampling stage - verify against SFDataset.
SAMPLE_SIZE = 10_000

train = conf.TRAIN_DATASET
train.sample_size = SAMPLE_SIZE

test = conf.TEST_DATASET
test.sample_size = SAMPLE_SIZE

In [None]:
# Optional local export of the fetched data: leave as None to skip it, or set it
# to a path prefix ("_train.csv" and "_test.csv" are appended to it below).
savepath = None

# Fetch both datasets; `raise_if_empty()` is called on each result in turn.
for _dataset in (train, test):
    _dataset.fetch_data().raise_if_empty()
logging.info("Succesfully loaded Features data from MongoDB")

if savepath is not None:
    train.data.to_csv(f"{savepath}_train.csv")
    test.data.to_csv(f"{savepath}_test.csv")
    logging.info(f"Saved Features extract to {savepath}")

In [None]:
# Collect the distinct `siren` values present in the training data and hand them
# to `test.remove_siren` (presumably so that no company appears in both sets -
# confirm against SFDataset).
train_sirens = train.data["siren"]
train_siren_set = train_sirens.unique().tolist()
test.remove_siren(train_siren_set)

In [None]:
# Apply the same preparation to both datasets, train first and then test:
# missing-data replacement, NA removal (with the IGNORE_NA setting), and the
# transformation pipeline taken from the loaded configuration.
for _dataset in (train, test):
    _dataset.replace_missing_data().remove_na(ignore=IGNORE_NA)
    _dataset.data = run_pipeline(_dataset.data, conf.TRANSFO_PIPELINE)

In [None]:
# Preview only the first rows of the prepared training data instead of echoing
# the whole frame as the cell output.
train.data.head()

In [None]:
# From this cell on, `train` is the underlying DataFrame rather than the dataset
# object it was taken from. The guard makes the cell safe to re-run: without it,
# a second execution would look up `.data` on a value that is already a DataFrame.
if not isinstance(train, pd.DataFrame):
    train = train.data

In [None]:
# From this cell on, `test` is the underlying DataFrame rather than the dataset
# object it was taken from. The guard makes the cell safe to re-run: without it,
# a second execution would look up `.data` on a value that is already a DataFrame.
if not isinstance(test, pd.DataFrame):
    test = test.data

In [None]:
# Per-column count of missing values, restricted to the training rows whose
# `outcome` equals False.
negative_rows = train['outcome'].eq(False)
train.loc[negative_rows].isnull().sum()

In [None]:
# Show the training frame without the columns that the model cells below also
# exclude before fitting.
train.drop(
    columns=[
        'paydex_group',
        'siret',
        'periode',
        'outcome',
        'siren',
        'code_naf',
        'time_til_outcome',
    ]
)

In [None]:
# Logistic regression classifier created with no arguments, i.e. with
# scikit-learn's default hyperparameters.
logreg = LogisticRegression()

In [None]:
# Fit the logistic regression: the target is the `outcome` column, the inputs
# are every training column except the ones listed here.
logreg_excluded = ['paydex_group','siret','periode','outcome','siren','code_naf','time_til_outcome']
logreg.fit(train.drop(columns=logreg_excluded), train['outcome'])

In [None]:
# Predict on the test set with the fitted logistic regression.
# The predictions are kept in their own variable rather than written into
# test['outcome']: assigning them to that column would overwrite the ground-truth
# labels, after which no prediction could be compared against the true outcomes.
logreg_pred = logreg.predict(
    test.drop(['paydex_group','siret','periode','outcome','siren','code_naf','time_til_outcome'], axis=1)
)

In [None]:
# Display the training rows whose `outcome` equals True.
positive_rows = train['outcome'].eq(True)
train.loc[positive_rows]

In [None]:
# Dimensions of the training frame, displayed as the cell output.
train.shape

In [None]:
# Decision tree limited to depth 5. `random_state` is fixed so that repeated runs
# of the notebook build the same tree, matching the seeded tree created in the
# text-export cell further down.
dectree = tree.DecisionTreeClassifier(random_state=0, max_depth=5)

In [None]:
# Fit the decision tree: the target is the `outcome` column, the inputs are every
# training column except the ones listed here.
dectree_excluded = ['paydex_group','siret','periode','outcome','siren','code_naf','time_til_outcome']
dectree.fit(train.drop(columns=dectree_excluded), train['outcome'])

In [None]:
# Draw the fitted decision tree. The trailing semicolon keeps the notebook from
# echoing the value returned by `plot_tree` underneath the figure.
tree.plot_tree(dectree);

In [None]:
# Decision-tree predictions for the test set, computed on the same column
# selection that was used for fitting.
Y = dectree.predict(
    test.drop(
        columns=['paydex_group','siret','periode','outcome','siren','code_naf','time_til_outcome']
    )
)

In [None]:
# Train a seeded depth-5 decision tree and print its rules as text, labelled with
# the names of the input columns. The input frame is built once and reused for
# both the fit and the feature names.
tree_inputs = train.drop(
    columns=['paydex_group','siret','periode','outcome','siren','code_naf','time_til_outcome']
)
decision_tree = tree.DecisionTreeClassifier(random_state=0, max_depth=5).fit(
    tree_inputs, train['outcome']
)
r = tree.export_text(decision_tree, feature_names=list(tree_inputs.columns))
print(r)