In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix

In [None]:
# Load the raw training table. The path is relative, so the CSV must sit in
# the notebook's working directory.
data = pd.read_csv('bank_customers_train.csv')

In [None]:
# Descriptive statistics of the freshly loaded table. The bare `summary`
# expression at the end makes the table the cell's output; a plain assignment
# would compute it without ever showing it.
summary = data.describe()
summary

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna()

In [None]:
# One-hot encode the categorical columns listed below; all other columns
# (including the target 'y') are passed through untouched.
# NOTE: this cell rebinds `data` to the encoded frame, in which the listed
# columns have been replaced by their dummy columns, so it is meant to be run
# once per kernel session.
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]
data = pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Pull the target column out and keep everything else as the feature matrix.
y = data['y']
X = data.drop('y', axis='columns')


In [None]:
# Encode the target as 1 ('yes') / 0 ('no').
# The mapping is applied to data['y'] rather than to `y` itself so the cell can
# be re-run safely: Series.map turns any value missing from the dict into NaN,
# so mapping an already-encoded `y` a second time would wipe out the target.
y = data['y'].map({'yes': 1, 'no': 0})

# Fail fast if a label was not covered by the mapping; otherwise the NaNs
# would only surface later, when the models are fitted.
assert y.notna().all(), "target column 'y' contains labels other than 'yes'/'no'"

In [None]:
# Standardise every feature column. The fitted scaler is kept in `scaler`.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split further down, so the held-out rows contribute to the
# scaling statistics. Confirm that this is acceptable for the analysis.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Descriptive statistics of the (unscaled) feature matrix. Ending the cell with
# the bare `summary` expression displays the table; the assignment alone would
# leave the result unseen.
summary = X.describe()
summary

In [None]:
# Draw one histogram (with a KDE overlay) per feature column, each on its own
# figure.
for column in X.columns:
    fig, ax = plt.subplots()
    sns.histplot(X[column], kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
# Pairwise scatter/distribution grid for three numeric columns, coloured by the
# target column 'y' of `data`.
pair_features = ['age', 'duration', 'campaign']
sns.pairplot(data=data, vars=pair_features, hue='y')
plt.show()

In [None]:
# Hold out 20% of the rows as a test set for the linear model. The features are
# the standardised matrix X_scaled; the fixed random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
# Fit an ordinary least-squares model on the training split. The target passed
# in is the 0/1-encoded `y`, so the fitted values are continuous scores, not
# class labels.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)


In [None]:
# Score the linear model on the held-out rows.
y_pred_linear = linear_model.predict(X_test)
linear_r2 = r2_score(y_test, y_pred_linear)
linear_mse = mean_squared_error(y_test, y_pred_linear)
# RMSE is the square root of the MSE computed above. Deriving it this way
# avoids the `squared=False` keyword of mean_squared_error, which recent
# scikit-learn releases deprecated and then removed.
linear_rmse = linear_mse ** 0.5

print("Linear Regression R-squared:", linear_r2)
print("Linear Regression MSE:", linear_mse)
print("Linear Regression RMSE:", linear_rmse)


In [None]:
# Tabulate the fitted linear-regression weight of each feature column.
coefficients = (
    pd.Series(linear_model.coef_, index=X.columns, name='Coefficient')
    .rename_axis('Variable')
    .reset_index()
)
print("Linear Regression Coefficients:")
print(coefficients)

In [None]:
# Hold out 20% of the rows for the logistic model. The target is the
# 0/1-encoded `y`; no separate binary target variable exists in this notebook.
# NOTE(review): this split uses the unscaled feature frame X, whereas the
# linear-regression split uses X_scaled. Confirm that is intended.
X_train_logistic, X_test_logistic, y_train_logistic, y_test_logistic = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit a logistic-regression classifier with scikit-learn's default settings on
# the logistic training split.
logistic_model = LogisticRegression()
logistic_model.fit(X_train_logistic, y_train_logistic)


In [None]:
# Hard class predictions (not probabilities) for the held-out rows.
y_pred_logistic = logistic_model.predict(X_test_logistic)

In [None]:
# Threshold-dependent metrics are computed from the hard class predictions.
accuracy = accuracy_score(y_test_logistic, y_pred_logistic)
precision = precision_score(y_test_logistic, y_pred_logistic)
recall = recall_score(y_test_logistic, y_pred_logistic)
f1 = f1_score(y_test_logistic, y_pred_logistic)
# ROC AUC measures ranking quality, so it is scored on the predicted
# probability of the positive class, the same scores the ROC curve is drawn
# from. Feeding it hard 0/1 labels would evaluate only a single threshold.
roc_auc = roc_auc_score(
    y_test_logistic,
    logistic_model.predict_proba(X_test_logistic)[:, 1],
)


In [None]:
# Report the logistic-regression scores, one "label value" line per metric.
print("\nLogistic Regression Metrics:")
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("ROC AUC Score:", roc_auc),
):
    print(label, value)

In [None]:
# ROC curve of the logistic model, built from the predicted probability of the
# positive class (second column of predict_proba) on the held-out rows.
fpr, tpr, _ = roc_curve(y_test_logistic, logistic_model.predict_proba(X_test_logistic)[:,1])
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()


In [None]:
# Confusion matrix of the hard predictions, drawn as an annotated heatmap with
# integer cell labels. Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test_logistic, y_pred_logistic)
ax = sns.heatmap(conf_matrix, annot=True, fmt='d')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Tabulate the fitted logistic-regression weight of each feature column.
# coef_ is two-dimensional; its first row holds the weights used here.
coefficients = (
    pd.Series(logistic_model.coef_[0], index=X.columns, name='Coefficient')
    .rename_axis('Variable')
    .reset_index()
)
print("\nLogistic Regression Coefficients:")
print(coefficients)