In [7]:
#sheet4 Q4:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from operator import itemgetter


RANDOM_STATE = 42
TEST_SIZE = 0.2

cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
feature_names = cancer.feature_names


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

results = {}


print("Train and Evaluate Models")


dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_model.fit(X_train, y_train)
results['Decision Tree'] = {
    'Train Acc': dt_model.score(X_train, y_train),
    'Test Acc': dt_model.score(X_test, y_test)
}


rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)
results['Random Forest'] = {
    'Train Acc': rf_model.score(X_train, y_train),
    'Test Acc': rf_model.score(X_test, y_test)
}


gb_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
gb_model.fit(X_train, y_train)
results['Gradient Boosting'] = {
    'Train Acc': gb_model.score(X_train, y_train),
    'Test Acc': gb_model.score(X_test, y_test)
}


print("\n Model Performance Comparison:")
print("| Model                    | Train Accuracy | Test Accuracy |")
for model_name, accs in results.items():
    print(f"| {model_name.ljust(24)} | {accs['Train Acc']:.4f}       | {accs['Test Acc']:.4f}        |")

print("\n Top 5 Feature Importances")


importances_rf = rf_model.feature_importances_
feature_importance_rf = sorted(zip(feature_names, importances_rf), key=itemgetter(1), reverse=True)

print("\n Top 5 features for [Random Forest]:")
for name, importance in feature_importance_rf[:5]:
    print(f"- {name.ljust(25)}: {importance:.4f}")

importances_gb = gb_model.feature_importances_
feature_importance_gb = sorted(zip(feature_names, importances_gb), key=itemgetter(1), reverse=True)

print("\n Top 5 features for [Gradient Boosting]:")
for name, importance in feature_importance_gb[:5]:
    print(f"- {name.ljust(25)}: {importance:.4f}")


print("\n Final Comparison and Conclusion")


best_test_acc = max(results[m]['Test Acc'] for m in results)
best_model = [m for m, accs in results.items() if accs['Test Acc'] == best_test_acc][0]

print(f" The model with the best test accuracy is: [{best_model}] ({best_test_acc:.4f})")

top5_rf_names = [name for name, _ in feature_importance_rf[:5]]
top5_gb_names = [name for name, _ in feature_importance_gb[:5]]
intersection = set(top5_rf_names).intersection(top5_gb_names)

Train and Evaluate Models

 Model Performance Comparison:
| Model                    | Train Accuracy | Test Accuracy |
| Decision Tree            | 1.0000       | 0.9474        |
| Random Forest            | 1.0000       | 0.9649        |
| Gradient Boosting        | 1.0000       | 0.9561        |

 Top 5 Feature Importances

 Top 5 features for [Random Forest]:
- worst area               : 0.1539
- worst concave points     : 0.1447
- mean concave points      : 0.1062
- worst radius             : 0.0780
- mean concavity           : 0.0680

 Top 5 features for [Gradient Boosting]:
- mean concave points      : 0.4505
- worst concave points     : 0.2401
- worst radius             : 0.0756
- worst perimeter          : 0.0514
- worst texture            : 0.0399

 Final Comparison and Conclusion
 The model with the best test accuracy is: [Random Forest] (0.9649)


In [8]:
#assignment sheet4:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from operator import itemgetter

RANDOM_STATE = 42
TEST_SIZE = 0.2

cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
feature_names = cancer.feature_names

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
results = {}

print("Decision Tree (Full and Pruned) & Overfitting Comparison")

dt_full = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_full.fit(X_train, y_train)
results['Decision Tree (Full)'] = {
    'Train Acc': dt_full.score(X_train, y_train),
    'Test Acc': dt_full.score(X_test, y_test)
}

dt_pruned = DecisionTreeClassifier(max_depth=3, random_state=RANDOM_STATE)
dt_pruned.fit(X_train, y_train)
results['Decision Tree (Pruned)'] = {
    'Train Acc': dt_pruned.score(X_train, y_train),
    'Test Acc': dt_pruned.score(X_test, y_test)
}

print("| Model                    | Train Accuracy | Test Accuracy |")
print(f"| {'Decision Tree (Full)'.ljust(24)} | {results['Decision Tree (Full)']['Train Acc']:.4f}       | {results['Decision Tree (Full)']['Test Acc']:.4f}        |")
print(f"| {'Decision Tree (Pruned)'.ljust(24)} | {results['Decision Tree (Pruned)']['Train Acc']:.4f}       | {results['Decision Tree (Pruned)']['Test Acc']:.4f}        |")

print("Overfitting Comment:")
print(f" DT (Full): High overfitting (Train Acc: {results['Decision Tree (Full)']['Train Acc']:.4f} vs. Test Acc: {results['Decision Tree (Full)']['Test Acc']:.4f})")
print(f" DT (Pruned): Reduced overfitting, better generalization (Train Acc: {results['Decision Tree (Pruned)']['Train Acc']:.4f} vs. Test Acc: {results['Decision Tree (Pruned)']['Test Acc']:.4f})")

print("Train Random Forest and Compare with Decision Trees")

rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)
results['Random Forest (100)'] = {
    'Train Acc': rf_model.score(X_train, y_train),
    'Test Acc': rf_model.score(X_test, y_test)
}

print("| Model                    | Train Accuracy | Test Accuracy |")
print(f"| {'Random Forest (100)'.ljust(24)} | {results['Random Forest (100)']['Train Acc']:.4f}       | {results['Random Forest (100)']['Test Acc']:.4f}        |")
print(f"| {'Decision Tree (Pruned)'.ljust(24)} | {results['Decision Tree (Pruned)']['Train Acc']:.4f}       | {results['Decision Tree (Pruned)']['Test Acc']:.4f}        |")
print("Comparison:")

learning_rates = [0.01, 0.1]
n_estimators_list = [50, 100, 200]
gb_tuning_results = []

print("| Learning Rate | N_Estimators | Train Accuracy | Test Accuracy |")


for lr in learning_rates:
    for n_est in n_estimators_list:
        gb_model = GradientBoostingClassifier(n_estimators=n_est, learning_rate=lr, random_state=RANDOM_STATE)
        gb_model.fit(X_train, y_train)

        train_acc = gb_model.score(X_train, y_train)
        test_acc = gb_model.score(X_test, y_test)

        gb_tuning_results.append({
            'lr': lr,
            'n_est': n_est,
            'Train Acc': train_acc,
            'Test Acc': test_acc
        })

        print(f"| {lr:.2f}          | {n_est:<12} | {train_acc:.4f}         | {test_acc:.4f}        |")

gb_final = GradientBoostingClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)
rf_final = rf_model

print("Top 5 Feature Importances")

importances_rf = rf_final.feature_importances_
feature_importance_rf = sorted(zip(feature_names, importances_rf), key=itemgetter(1), reverse=True)

print("\n Top 5 features for Random Forest:")
for name, importance in feature_importance_rf[:5]:
    print(f"- {name.ljust(25)}: {importance:.4f}")

importances_gb = gb_final.feature_importances_
feature_importance_gb = sorted(zip(feature_names, importances_gb), key=itemgetter(1), reverse=True)

print("\n Top 5 features for Gradient Boosting (Default):")
for name, importance in feature_importance_gb[:5]:
    print(f"- {name.ljust(25)}: {importance:.4f}")

Decision Tree (Full and Pruned) & Overfitting Comparison
| Model                    | Train Accuracy | Test Accuracy |
| Decision Tree (Full)     | 1.0000       | 0.9474        |
| Decision Tree (Pruned)   | 0.9780       | 0.9474        |
Overfitting Comment:
 DT (Full): High overfitting (Train Acc: 1.0000 vs. Test Acc: 0.9474)
 DT (Pruned): Reduced overfitting, better generalization (Train Acc: 0.9780 vs. Test Acc: 0.9474)
Train Random Forest and Compare with Decision Trees
| Model                    | Train Accuracy | Test Accuracy |
| Random Forest (100)      | 1.0000       | 0.9649        |
| Decision Tree (Pruned)   | 0.9780       | 0.9474        |
Comparison:
| Learning Rate | N_Estimators | Train Accuracy | Test Accuracy |
| 0.01          | 50           | 0.9780         | 0.9561        |
| 0.01          | 100          | 0.9868         | 0.9561        |
| 0.01          | 200          | 0.9934         | 0.9561        |
| 0.10          | 50           | 1.0000         | 0.9561      

In [9]:
#assignment sheet3 Q1:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

cancer = load_breast_cancer(as_frame=True)
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic_regression', LogisticRegression(random_state=42, solver='liblinear'))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Model Performance Evaluation")
print(f"* Accuracy: {accuracy:.4f}")
print(f"* Precision: {precision:.4f}")
print(f"* Recall (Sensitivity): {recall:.4f}")
print(f"* F1-Score: {f1:.4f}")

Model Performance Evaluation
* Accuracy: 0.9737
* Precision: 0.9722
* Recall (Sensitivity): 0.9859
* F1-Score: 0.9790


In [10]:
#assignment sheet3 Q2:

coefficients = pipeline['logistic_regression'].coef_[0]
feature_names = X.columns

feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})


feature_importance['Absolute_Coefficient'] = np.abs(feature_importance['Coefficient'])
feature_importance = feature_importance.sort_values(by='Absolute_Coefficient', ascending=False)

print(feature_importance[['Feature', 'Coefficient']].head(10).to_string(index=False))

             Feature  Coefficient
       worst texture    -1.335651
        radius error    -1.283117
      worst symmetry    -1.196087
 mean concave points    -1.130510
          area error    -0.944861
     worst concavity    -0.942150
          worst area    -0.882949
        worst radius    -0.881042
      mean concavity    -0.818323
worst concave points    -0.766904
