<a href="https://colab.research.google.com/github/sebmatecho/Bitcoin_tracker/blob/master/notebooks/model_adjustment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Adjustment

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the raw dataset; the file uses ';' as its field separator.
dataframe = pd.read_csv(filepath_or_buffer='bank-full.csv', delimiter=';')

Preprocessing steps to carry out:
- job, marital, education, contact: to be one-hot encoded
- balance, duration: trim outliers
- pdays: to be converted into a binary flag (previously reached or not)
- day and month: to be represented by cosine and sine?
- poutcome?

In [3]:
# Trim extreme values (outliers): keep only rows whose balance is below 10000
# and whose duration is below 1800.
within_limits = dataframe['balance'].lt(10000) & dataframe['duration'].lt(1800)
dataframe = dataframe.loc[within_limits]

In [4]:
# Binary flag: 0 where pdays is the sentinel -1, 1 for every other value.
dataframe['pdays'] = dataframe['pdays'].ne(-1).astype('int64')

In [5]:
# Encode 'default' as 1 for 'yes' and 0 for anything else.
dataframe['default'] = dataframe['default'].eq('yes').astype('int64')

In [6]:
# Encode 'housing' as 1 for 'yes' and 0 for anything else.
dataframe['housing'] = dataframe['housing'].eq('yes').astype('int64')

In [7]:
# Encode 'loan' as 1 for 'yes' and 0 for anything else.
dataframe['loan'] = dataframe['loan'].eq('yes').astype('int64')

In [8]:
# Encode the target 'y' as 1 for 'yes' and 0 for anything else.
dataframe['y'] = dataframe['y'].eq('yes').astype('int64')

In [9]:
# Encode the month abbreviation as its calendar number (jan=1 ... dec=12).
# Series.map is used rather than Series.replace: replacing every string of an
# object column with integers relies on pandas' silent downcasting, which is
# deprecated and emits a FutureWarning. With map, the result is an integer
# column when every value is a known abbreviation; any value missing from the
# mapping becomes NaN instead of being left as a string.
dataframe['month'] = dataframe['month'].map({'jan': 1,
                                             'feb': 2,
                                             'mar': 3,
                                             'apr': 4,
                                             'may': 5,
                                             'jun': 6,
                                             'jul': 7,
                                             'aug': 8,
                                             'sep': 9,
                                             'oct': 10,
                                             'nov': 11,
                                             'dec': 12})

  dataframe['month'] = dataframe['month'].replace({'may':5,


In [10]:
import numpy as np

# Represent day (period 31) and month (period 12) as sine/cosine pairs so the
# model receives them as cyclic values. Columns are appended in the order
# day_sine, day_cosine, month_sine, month_cosine.
dataframe = dataframe.assign(
    day_sine=dataframe['day'].map(lambda day: np.sin(2*day*np.pi/31)),
    day_cosine=dataframe['day'].map(lambda day: np.cos(2*day*np.pi/31)),
    month_sine=dataframe['month'].map(lambda month: np.sin(2*month*np.pi/12)),
    month_cosine=dataframe['month'].map(lambda month: np.cos(2*month*np.pi/12)),
)

In [11]:
# The raw day/month columns are superseded by their sine/cosine encodings.
dataframe = dataframe.drop(columns=['day', 'month'])

In [12]:
# Separate the target from the predictors, then hold out 30% of the rows for
# testing with a fixed seed so the split is reproducible.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# target classes turn out to be imbalanced.
y = dataframe['y']
X = dataframe.drop(columns='y')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [13]:
def preprocessing_pipeline(dataframe):
  numeric_features = ['age','balance','duration', 'campaign','previous']
  time_features = ['day_sine', 'day_cosine', 'month_sine', 'month_cosine']
  categorical_features = ['job', 'marital', 'education', 'contact', 'poutcome']


  preprocessor = ColumnTransformer(transformers=[
      ('num', StandardScaler(), numeric_features),
      ('cat', OneHotEncoder(), categorical_features)
  ])

  dataframe_transformed = preprocessor.fit_transform(dataframe)

  ohe_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
  final_columns = numeric_features + list(ohe_feature_names)
  X_train_preprocessed_df = pd.DataFrame(dataframe_transformed, columns=final_columns)

  X_train_preprocessed_df = pd.concat([X_train_preprocessed_df,dataframe[time_features].reset_index(drop = True)], axis = 1)

  return X_train_preprocessed_df

In [14]:
X_train_processed = preprocessing_pipeline(X_train)
X_test_processed = preprocessing_pipeline(X_test)



In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

# Baseline classifier. Alternatives tried during exploration are kept below
# for reference:
# model = BernoulliNB()
# model = KNeighborsClassifier()
# model = lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42)
# model = RandomForestClassifier()
# model = LogisticRegression()
# model = svm.SVC()
#
# `use_label_encoder` is no longer passed: the installed xgboost does not
# recognise it and only reports it as an unused parameter.
model = XGBClassifier(objective="binary:logistic",
                      # enable_categorical=True,
                      eval_metric="logloss",
                      random_state=42)
model.fit(X_train_processed, y_train)

# Accuracy of the baseline predictions on the held-out set.
y_hat = model.predict(X_test_processed)
accuracy_score(y_test, y_hat)

Parameters: { "use_label_encoder" } are not used.



0.9650681321990514

In [None]:


# Define the parameter grid.
# It expands to 8748 candidates; with cv=5 that is 43740 fits, so this cell
# is expensive to run.
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.1, 0.01, 0.001],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "min_child_weight": [1, 3, 5],
    "gamma": [0, 1, 5],
    "reg_alpha": [0, 1, 10],
    "reg_lambda": [0, 1, 10],
}

# Set up GridSearchCV: candidates are ranked by cross-validated ROC AUC.
grid_search = GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    scoring = "roc_auc",
    cv = 5,
    verbose = 1,
    n_jobs = -1
)

# Fit the grid search
final = grid_search.fit(X_train_processed, y_train)

# Store the tuned predictions under their own name. Assigning them to y_test
# would overwrite the ground-truth labels, and every metric computed
# afterwards would compare predictions against predictions.
y_hat_tuned = final.predict(X_test_processed)
print(accuracy_score(y_test, y_hat_tuned))

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Fitting 5 folds for each of 8748 candidates, totalling 43740 fits


In [None]:
# Confusion matrix of y_hat against y_test.
confusion_matrix(y_true=y_test, y_pred=y_hat)