In [33]:
import numpy as np
import pandas as pd

In [34]:
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.kernel_approximation import RBFSampler
from scipy.stats import randint, uniform

In [3]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [4]:
data['job'].value_counts()

management       175541
blue-collar      170498
technician       138107
admin.            81492
services          64209
retired           35185
self-employed     19020
entrepreneur      17718
unemployed        17634
housemaid         15912
student           11767
unknown            2917
Name: job, dtype: int64

In [5]:
X = data.iloc[:, 1:-1]
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown
1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown
2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown
3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown
4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown


In [6]:
y = data['y']

In [7]:
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_include='object')

num_pipes = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipes = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('oneh', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

ctx = ColumnTransformer([
    ('num', num_pipes, num_cols),
    ('cat', cat_pipes, cat_cols),
])

In [8]:
X = ctx.fit_transform(X)

In [9]:
print(X)
print(X.shape)

[[ 0.10630996 -0.42208299  1.076594   ...  0.          0.
   1.        ]
 [-0.28977584 -0.24331605  0.22819418 ...  0.          0.
   1.        ]
 [-0.48781874 -0.21228747 -0.25660572 ...  0.          0.
   1.        ]
 ...
 [ 0.89848155 -0.34803751  0.10699421 ...  0.          0.
   1.        ]
 [-0.88390453 -0.52116289  1.19779398 ...  0.          0.
   1.        ]
 [ 0.10630996  0.12514835 -1.46860546 ...  0.          0.
   0.        ]]
(750000, 51)


In [12]:

models = {
    'xgb': XGBClassifier(),
    #'svc': SVC(kernel='rbf'),
    'rfc': RandomForestClassifier(n_estimators=300, max_depth=50),
    'nv': GaussianNB(),
    'logR': LogisticRegression()
}

max_score = 0
selected_model = False

for name,model in models.items():
    score = cross_val_score(estimator=model, X = X, y = y, cv = 3, scoring = 'accuracy', verbose=2)
    mean_score = score.mean()
    print(f'Model {name}: mean cv score: {mean_score}\n')
    if mean_score > max_score:
        max_score = mean_score
        selected_model = model

print(f'Selected model based on cv score: {selected_model}, max score: {max_score}\n')

[CV] END .................................................... total time=   3.8s
[CV] END .................................................... total time=   3.8s
[CV] END .................................................... total time=   4.8s
Model xgb: mean cv score: 0.9343493333333334

[CV] END .................................................... total time= 6.9min
[CV] END .................................................... total time= 7.7min
[CV] END .................................................... total time= 7.5min
Model rfc: mean cv score: 0.93216

[CV] END .................................................... total time=   0.8s
[CV] END .................................................... total time=   0.8s
[CV] END .................................................... total time=   0.9s
Model nv: mean cv score: 0.85678

[CV] END .................................................... total time=   5.6s
[CV] END .................................................... total time=  

In [18]:
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.01, 0.3),  # from 0.01 to 0.31
    'subsample': uniform(0.5, 0.5),       # from 0.5 to 1.0
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0, 5),
    'min_child_weight': randint(1, 10),
    'scale_pos_weight': [1, 2, 3],        # useful if classes are imbalanced
}

search = RandomizedSearchCV(
    estimator=XGBClassifier(),
    param_distributions=param_dist,
    n_iter=25,                 # Number of different combinations to try
    scoring='accuracy',        # Or 'roc_auc' for imbalanced binary classification
    cv=3,                      # 3-fold cross-validation
    verbose=2,
    n_jobs=-1,                 # Use all cores
    random_state=42
)

search.fit(X, y)

print("Best parameters:", search.best_params_)
print("Best score:", search.best_score_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best parameters: {'colsample_bytree': 0.6347061668992607, 'gamma': 1.220627611238871, 'learning_rate': 0.06048731265187917, 'max_depth': 10, 'min_child_weight': 9, 'n_estimators': 791, 'scale_pos_weight': 1, 'subsample': 0.5324461235544908}
Best score: 0.9364866666666667


In [19]:
xgb = XGBClassifier(colsample_bytree= 0.6347061668992607, gamma=1.220627611238871, learning_rate=0.06048731265187917, max_depth=10, min_child_weight=9, n_estimators=791, scale_pos_weight=1, subsample=0.5324461235544908)
xgb.fit(X,y)

In [25]:
data_test = pd.read_csv('test.csv')
X_test = data_test.iloc[:, 1:]
X_test = ctx.transform(X_test)
y_pred = xgb.predict_proba(X_test)

In [26]:
print(y_pred)

[[9.9863452e-01 1.3654663e-03]
 [8.8275272e-01 1.1724726e-01]
 [9.9980372e-01 1.9625899e-04]
 ...
 [1.8939614e-01 8.1060386e-01]
 [9.9908412e-01 9.1588340e-04]
 [8.8639498e-01 1.1360499e-01]]


In [32]:
np.savetxt('output.csv', np.column_stack((data_test['id'], y_pred[:,1])), delimiter=',', comments='', header='id,y', fmt=['%d','%.7f'])