In [34]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from import_data import import_df_local

In [35]:
# Read the analysed dataset through the project helper, then separate the
# target column 'y' (kept as Y) from the remaining feature columns (X).
df = import_df_local('data/01_analyze.csv')
Y = df['y']
X = df.drop(columns='y')

In [28]:
# Columns rescaled to [0, 1] by MinMaxScaler.
scaled_features = ['age', 'education', 'month', 'day_of_week', 'duration', 'campaign', 'previous']
# Columns expanded into indicator columns by OneHotEncoder.
one_hot_features = ['job', 'marital', 'housing', 'loan', 'contact', 'poutcome']

# (name, transformer, columns) triples consumed by ColumnTransformer; the
# output columns follow this order: scaled features first, then one-hot ones.
# NOTE(review): OneHotEncoder is left at its defaults, so a category that was
# not seen during fit raises at transform time - confirm this is intended.
transform = [
    ('minmax_scaler', MinMaxScaler(), scaled_features),
    ('onehot_encoder', OneHotEncoder(), one_hot_features),
]
column_transformer = ColumnTransformer(transformers=transform)

In [29]:
# Fit the preprocessor on the feature frame only. The transformer selects its
# columns by name, so the values are the same as when fitting on `df`, but the
# target column is no longer handed to the preprocessor.
transformed_data = column_transformer.fit_transform(X)

# ColumnTransformer may return a SciPy sparse matrix when the stacked output
# is sparse enough (see its `sparse_threshold` parameter); pandas cannot label
# that with `columns=`, so densify it first.
if hasattr(transformed_data, 'toarray'):
    transformed_data = transformed_data.toarray()

# Read the column groups back from the transformer specification instead of
# repeating the lists here, so the CSV header cannot drift from the
# preprocessing definition.
column_spec = {name: list(columns) for name, _, columns in column_transformer.transformers}
numeric_columns = column_spec['minmax_scaler']
encoded_columns = column_transformer.named_transformers_['onehot_encoder'].get_feature_names_out(
    column_spec['onehot_encoder'])

# Output order mirrors the transformer order: scaled columns, then one-hot columns.
transformed_df = pd.DataFrame(transformed_data, columns=numeric_columns + list(encoded_columns))

# Persist the processed features and the target as separate CSV files.
transformed_df.to_csv('data/02_processed.csv', index=False)
Y.to_csv('data/02a_Y.csv', index=False)

In [32]:
# Base learners, defined once so the ensemble and the standalone pipelines
# cannot drift apart in their hyperparameters.
base_estimators = [
    ('svc', SVC(class_weight='balanced', kernel='poly', probability=True, random_state=42)),
    ('dtc', DecisionTreeClassifier(max_depth=6, class_weight='balanced', random_state=42)),
    ('gnb', GaussianNB(var_smoothing=1e-7)),
]


def build_pipeline(classifier):
    """Return an unfitted Pipeline of the preprocessor followed by `classifier`.

    The preprocessor is cloned for every pipeline. Pipeline does not copy its
    steps, so passing `column_transformer` itself would make all pipelines
    share one mutable object, and fitting any of them would refit the
    preprocessor used by the others.
    """
    return Pipeline([
        ('preprocessor', clone(column_transformer)),
        ('classifier', classifier),
    ])


# NOTE(review): VotingClassifier defaults to voting='hard', so the ensemble
# does not use the SVC's probability estimates and exposes no predict_proba.
# Confirm whether voting='soft' was intended; behaviour is left unchanged here.
voting_clf = VotingClassifier(estimators=base_estimators)

# Main model: preprocessing + voting ensemble.
pipeline = build_pipeline(voting_clf)

# One standalone pipeline per base learner (svc, dtc, gnb - in that order),
# each with its own unfitted copy of the estimator.
pipeline_part1, pipeline_part2, pipeline_part3 = (
    build_pipeline(clone(estimator)) for _, estimator in base_estimators
)

In [31]:
# Create the output directory before training: joblib.dump does not create
# missing directories, and failing after the fits would waste the whole run.
Path('models').mkdir(parents=True, exist_ok=True)

# Pipeline.fit returns the pipeline itself, so each `pipN` is the fitted
# version of the corresponding pipeline object.
pip = pipeline.fit(X, Y)
pip1 = pipeline_part1.fit(X, Y)
pip2 = pipeline_part2.fit(X, Y)
pip3 = pipeline_part3.fit(X, Y)

# Persist the fitted ensemble and the three single-model pipelines.
joblib.dump(pip, 'models/main_model.pkl')
joblib.dump(pip1, 'models/pip1_model.pkl')
joblib.dump(pip2, 'models/pip2_model.pkl')
joblib.dump(pip3, 'models/pip3_model.pkl')

['models/pip3_model.pkl']