In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# check version number
import imblearn
print(imblearn.__version__)

0.7.0


In [3]:
from sklearn.datasets import fetch_openml

df, y = fetch_openml('adult', version=2, as_frame=True, return_X_y=True)
# we are dropping the following features:
# - "fnlwgt": this feature was created while studying the "adult" dataset.
#   Thus, we will not use this feature which is not acquired during the survey.
# - "education-num": it is encoding the same information than "education".
#   Thus, we are removing one of these 2 features.
df = df.drop(columns=['fnlwgt', 'education-num'])

In [5]:
df.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25.0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States
1,38.0,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States
2,28.0,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States
3,44.0,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States
4,18.0,,Some-college,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States


In [6]:
classes_count = y.value_counts()
classes_count

<=50K    37155
>50K     11687
Name: class, dtype: int64

In [7]:
from imblearn.datasets import make_imbalance

ratio = 30
df_res, y_res = make_imbalance(
    df, y, sampling_strategy={
        classes_count.idxmin(): classes_count.max() // ratio
    }
)
y_res.value_counts()

<=50K    37155
>50K      1238
Name: class, dtype: int64

In [32]:
df_res.isnull().sum()

age                  0
workclass         2557
education            0
marital-status       0
occupation        2567
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     664
dtype: int64

In [45]:
X = df_res
y= y_res.values

In [46]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

print("X.shape: {} y.shape: {}".format(X.shape, y.shape))

X.shape: (38393, 12) y.shape: (38393,)


Podzielmy zbiór na train/test

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [76]:
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

(array([0, 1]), array([29725,   989], dtype=int64))
(array([0, 1]), array([7430,  249], dtype=int64))


# Zad

Robimy StratifiedKFold i znajdujemy optymalne parametry dla



* MultinomialNB (bez redukcji wymiarowości)
* LogisticRegression
* LinearSVC
* SVC
* KNeighborsClassifier
* DecisionTreeClassifier
* RandomForestClassifier
* BaggingClassifier
* ExtraTreesClassifier
* AdaBoostClassifier
* GradientBoostingClassifier
* VotingClassifier
* xgboost.XGBClassifier

## Dane są niezbalansowane, wykorzystaj 

```python
from imblearn.over_sampling import SMOTE, ADASYN
```

{'classifier__C': 0.001}

In [70]:
from sklearn import  metrics


models = []
models.append(('LR', grid_1.best_estimator_))



precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
b_accuracy_score = []
for name, model in models:
    print(name)
    print("precision_score: {}".format(metrics.precision_score(y_test, model.predict(X_test)) ))
    print("recall_score: {}".format( metrics.recall_score(y_test, model.predict(X_test)) ))
    print("f1_score: {}".format( metrics.f1_score(y_test, model.predict(X_test)) ))
    print("accuracy_score: {}".format( metrics.accuracy_score(y_test, model.predict(X_test)) ))
    precision_score.append(metrics.precision_score(y_test, model.predict(X_test)))
    recall_score.append(metrics.recall_score(y_test, model.predict(X_test)))
    f1_score.append( metrics.f1_score(y_test, model.predict(X_test)))
    accuracy_score.append(metrics.accuracy_score(y_test, model.predict(X_test)))
    
    
    b_accuracy_score.append(metrics.balanced_accuracy_score(y_test, model.predict(X_test)))

LR
precision_score: 0.07948243992606285
recall_score: 0.6907630522088354
f1_score: 0.14256112722751763
accuracy_score: 0.730563875504623


In [72]:
import pandas as pd
d = {'precision_score': precision_score, 
     'recall_score': recall_score, 
     'f1_score': f1_score,
     'accuracy_score' : accuracy_score,
     'balanced_accuracy_score' : b_accuracy_score
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['LR'])
df

Unnamed: 0,Method,precision_score,recall_score,f1_score,accuracy_score,balanced_accuracy_score
0,LR,0.079482,0.690763,0.142561,0.730564,0.71133
