In [4]:
import pandas as pd

# URL of the bank.csv dataset in the course repository
bank_csv_url = 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv'

# Read the remote CSV into a DataFrame
data = pd.read_csv(bank_csv_url)

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Discard every row that contains at least one missing value
data = data.dropna()

# Predictor columns fed to the model
features = [
    'age', 'job', 'marital', 'education',
    'loan', 'campaign', 'nr.employed',
]

# Binary target: 'yes' -> 1, 'no' -> 0
y = data['y'].map({'yes': 1, 'no': 0})

# One-hot encode the categorical predictors (first level of each is dropped)
X = pd.get_dummies(data[features], drop_first=True)

# Hold out 20% for testing; stratify on y so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
    stratify=y,
)

# Class-weighted decision tree; fit() returns the fitted estimator itself
clf = DecisionTreeClassifier(class_weight='balanced', random_state=0).fit(X_train, y_train)

# Score the tree on the held-out split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred, target_names=['No', 'Yes'])
print("\nClassification Report:")
print(report)


Model Accuracy: 0.79

Classification Report:
              precision    recall  f1-score   support

          No       0.91      0.85      0.88      6572
         Yes       0.23      0.35      0.28       842

    accuracy                           0.79      7414
   macro avg       0.57      0.60      0.58      7414
weighted avg       0.83      0.79      0.81      7414



In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Discard every row that contains at least one missing value
data = data.dropna()

# Predictor columns fed to the model
features = [
    'age', 'job', 'marital', 'education',
    'loan', 'campaign', 'nr.employed',
]

# Binary target: 'yes' -> 1, 'no' -> 0
y = data['y'].map({'yes': 1, 'no': 0})

# One-hot encode the categorical predictors (first level of each is dropped)
X = pd.get_dummies(data[features], drop_first=True)

# Hold out 20% for testing; stratify on y so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
    stratify=y,
)

# Class-weighted random forest: 100 trees, each capped at depth 10.
# fit() returns the fitted estimator itself.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=0,
).fit(X_train, y_train)

# Score the forest on the held-out split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred, target_names=['No', 'Yes'])
print("\nClassification Report:")
print(report)


Model Accuracy: 0.82

Classification Report:
              precision    recall  f1-score   support

          No       0.94      0.85      0.89      6572
         Yes       0.33      0.59      0.42       842

    accuracy                           0.82      7414
   macro avg       0.63      0.72      0.66      7414
weighted avg       0.87      0.82      0.84      7414



In [23]:


# Load the holdout dataset that predictions are generated for
test_data = pd.read_csv("https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test_mini.csv")

# One-hot encode the holdout features.
# drop_first is deliberately NOT used here: which level gets dropped depends on
# the categories present in *this* file, which may differ from the training data.
X_test_holdout = pd.get_dummies(test_data[features])

# Align to the exact columns (and column order) the model was trained on.
# Indicator columns missing from the holdout are filled with 0; columns the
# model never saw (including the baseline levels dropped at training time)
# are discarded.
X_test_holdout = X_test_holdout.reindex(columns=X.columns, fill_value=0)

# Make predictions on the holdout dataset
predictions = clf.predict(X_test_holdout)

# Collect predictions in a single-column frame named 'y'
output = pd.DataFrame({'y': predictions})

# Save to CSV without the index column
output.to_csv("team08-module2-predictions.csv", index=False)

print("Predictions saved to 'team08-module2-predictions.csv'")


Predictions saved to 'team08-module2-predictions.csv'
