In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
pd.set_option('display.max_columns',None)

In [2]:
df = pd.read_csv(r'C:\Users\marek\OneDrive\Pulpit\Inzynierka\bank-additional-full.csv', sep=';')
columns_to_drop = ['duration','euribor3m','emp.var.rate','cons.price.idx','cons.conf.idx','nr.employed']
for column in columns_to_drop:
    df.drop(column, axis=1, inplace=True)
columns_to_replace_unknown = df.columns.difference(['default','housing','loan'])
df[columns_to_replace_unknown] = df[columns_to_replace_unknown].copy().replace('unknown', np.nan)

In [3]:
categorical_for_dummies = ['job', 'marital']
[df.dropna(subset=x, inplace=True) for x in categorical_for_dummies]

df['month'].replace(['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'],
                     [1,2,3,4,5,6,7,8,9,10,11,12], inplace=True)
df['day_of_week'].replace(['mon','tue','wed','thu','fri'], [1,2,3,4,5], inplace=True)
df['education'].replace(['illiterate','basic.4y','basic.6y','basic.9y','high.school','professional.course',
                         'university.degree'], [0,1,2,3,4,5,6], inplace=True)
df['education'].fillna(df['education'].mean(), inplace=True)
quantile_995 = df['campaign'].quantile(0.995)
df = df[df['campaign'] <= quantile_995]
df['not_contacted_before'] = df['pdays'] == 999
df['pdays'] = df['pdays'].apply(lambda x: np.log10(x) if x > 0 else 0)
df['y'].replace(['no', 'yes'], [0, 1], inplace=True)

X = df.drop('y', axis=1)
Y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [4]:
def identity_transform(x):
    return x

transform = [
    ('identity', FunctionTransformer(identity_transform), ['pdays', 'not_contacted_before']),
    ('min_max_scaler', MinMaxScaler(), ['age', 'education', 'month', 'day_of_week', 'campaign', 'previous', 'education']),
    ('one_hot_encoder', OneHotEncoder(), ['default', 'housing', 'loan', 'contact', 'poutcome', 'job', 'marital'])
]
column_transformer = ColumnTransformer(transform)

In [5]:
voting_clf = VotingClassifier(
    estimators=[
        ('svc', SVC(class_weight='balanced', random_state=42, probability=True)),
        ('dtc',DecisionTreeClassifier(max_depth = 10,class_weight='balanced', random_state=42)),
        ('gnb',GaussianNB())
    ]
)
pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', voting_clf)
])
pipeline_part1 = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', SVC(probability=True, class_weight='balanced', random_state=42))
])
pipeline_part2 = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', DecisionTreeClassifier(max_depth = 10, class_weight='balanced', random_state=42))
])
pipeline_part3 = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', GaussianNB())
])

In [6]:
pip = pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Macierz pomyłek:")
print(conf_matrix)
print(classification_report(y_test, y_pred))

Macierz pomyłek:
[[2643  970]
 [ 172  276]]
              precision    recall  f1-score   support

           0       0.94      0.73      0.82      3613
           1       0.22      0.62      0.33       448

    accuracy                           0.72      4061
   macro avg       0.58      0.67      0.57      4061
weighted avg       0.86      0.72      0.77      4061



In [7]:
pip1 = pipeline_part1.fit(X_train, y_train)
pip2 = pipeline_part2.fit(X_train, y_train)
pip3 = pipeline_part3.fit(X_train, y_train)

joblib.dump(pip, 'main_model.pkl')
joblib.dump(pip1, 'pip1_model.pkl')
joblib.dump(pip2, 'pip2_model.pkl')
joblib.dump(pip3, 'pip3_model.pkl')

PicklingError: Can't pickle <function <lambda> at 0x00000239AA7DE020>: it's not found as __main__.<lambda>