In [3]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, KFold

In [4]:
X_full = pd.read_csv('classes_dataset.csv', dtype={col: np.float32 for col in ['lcc', 'lcom*']})

In [5]:
X_full.head()

Unnamed: 0.1,Unnamed: 0,dit,fanin,fanout,lcc,lcom*,loc,noc,rfc,innerClassesQty,totalFieldsQty,totalMethodsQty,wmc,godClass,refusedBequest
0,0,4,1,6,0.0,0.592593,39,0,12,0,3,9,10,0,0
1,1,4,2,7,0.0,0.0,18,0,6,0,0,4,6,0,0
2,2,1,0,0,0.0,1.0,56,0,0,3,4,4,4,0,0
3,3,1,1,0,0.0,0.0,4,0,0,0,0,2,2,0,0
4,4,3,3,5,0.0,0.0,16,0,7,0,0,4,4,0,0


In [6]:
X = X_full.drop(['godClass'], axis=1)
# retirar outro smell
X = X.drop(['refusedBequest'], axis=1)
y = X_full.godClass

In [9]:
X.head()

Unnamed: 0.1,Unnamed: 0,dit,fanin,fanout,lcc,lcom*,loc,noc,rfc,innerClassesQty,totalFieldsQty,totalMethodsQty,wmc
0,0,4,1,6,0.0,0.592593,39,0,12,0,3,9,10
1,1,4,2,7,0.0,0.0,18,0,6,0,0,4,6
2,2,1,0,0,0.0,1.0,56,0,0,3,4,4,4
3,3,1,1,0,0.0,0.0,4,0,0,0,0,2,2
4,4,3,3,5,0.0,0.0,16,0,7,0,0,4,4


In [14]:
#avaliar com calma
X.fillna(0, inplace=True)

In [15]:
np.any(np.isnan(X))

False

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50930 entries, 0 to 50929
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       50930 non-null  int64  
 1   dit              50930 non-null  int64  
 2   fanin            50930 non-null  int64  
 3   fanout           50930 non-null  int64  
 4   lcc              50930 non-null  float32
 5   lcom*            50930 non-null  float32
 6   loc              50930 non-null  int64  
 7   noc              50930 non-null  int64  
 8   rfc              50930 non-null  int64  
 9   innerClassesQty  50930 non-null  int64  
 10  totalFieldsQty   50930 non-null  int64  
 11  totalMethodsQty  50930 non-null  int64  
 12  wmc              50930 non-null  int64  
dtypes: float32(2), int64(11)
memory usage: 4.7 MB


In [74]:
## PIPELINE

In [18]:
SEED=0
decision_tree_regressor = DecisionTreeRegressor(random_state=SEED, criterion='mse')

In [20]:
pipe = Pipeline(steps=[
            ('minmaxscaler', MinMaxScaler()),
            ('stdscaler',  StandardScaler()),
            ('regressor', decision_tree_regressor)
    ])

In [21]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=3)

In [22]:
# https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/
parameters = dict(
    regressor__max_depth= [2,3,5,7,9,11,13],
    regressor__min_samples_split= [32,64,128,256],
    regressor__min_samples_leaf= [32,64,128,256])

In [25]:
randomized_search = RandomizedSearchCV(pipe,
                              parameters,
                              cv = KFold(n_splits=5),
                                        n_iter=20,
                                      random_state=11)

In [24]:
# Exporta precisão? Acurácia? Etc...
randomized_search.fit(X, y)

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                             ('stdscaler', StandardScaler()),
                                             ('regressor',
                                              DecisionTreeRegressor(random_state=0))]),
                   n_iter=20,
                   param_distributions={'regressor__max_depth': [2, 3, 5, 7, 9,
                                                                 11, 13],
                                        'regressor__min_samples_leaf': [32, 64,
                                                                        128,
                                                                        256],
                                        'regressor__min_samples_split': [32, 64,
                                                                         128,
                                           

In [26]:
resultados = pd.DataFrame(randomized_search.cv_results_)
resultados.head()

AttributeError: 'RandomizedSearchCV' object has no attribute 'cv_results_'

In [88]:
# qual acuracia e precisao?
randomized_search.best_params_

{'regressor__min_samples_split': 256,
 'regressor__min_samples_leaf': 32,
 'regressor__max_depth': 7}

In [90]:
## Teste

In [91]:
test_predictions = pipe.predict(X_val)

In [92]:
test_predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [95]:
output = pd.DataFrame({'godClass': test_predictions})

In [96]:
output.describe()

Unnamed: 0,godClass
count,10186.0
mean,0.021107
std,0.143749
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0
