In [1]:
# Step 1: Import required libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score

# Step 2: Load the Titanic dataset
df = sns.load_dataset('titanic')

# Step 3: Drop unnecessary or mostly-empty columns
df = df.drop(columns=['deck', 'embark_town', 'alive'])

# Step 4: Handle missing values
df['age'] = df['age'].fillna(df['age'].mean())   # ✅ Fix: no inplace=True
df = df.dropna(subset=['embarked'])              # Drop rows where 'embarked' is missing
df = df.dropna()                                 # Drop any remaining missing data

# Step 5: Convert categorical columns to numeric using LabelEncoder
label_encoder = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object' or str(df[col].dtype) == 'category':
        df[col] = label_encoder.fit_transform(df[col].astype(str))  # ✅ Fix: convert to str

# Step 6: Normalize numerical columns using MinMaxScaler
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Step 7: Split the data into features (X) and target (y)
X = df.drop('survived', axis=1)  # Features
y = df['survived']               # Target

# Step 8: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 9: Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 10: Predict and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n✅ Model Accuracy: {:.2f}%".format(accuracy * 100))



✅ Model Accuracy: 80.34%


In [16]:
# Step 1: Import necessary libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Step 2: Load dataset
df = sns.load_dataset('titanic')

# Step 3: Drop unnecessary or mostly empty columns
df = df.drop(columns=['deck', 'embark_town', 'alive'])

# Step 4: Handle missing values
df['age'] = df['age'].fillna(df['age'].mean())
df = df.dropna(subset=['embarked'])
df = df.dropna()

# Step 5: Encode categorical variables
label_encoder = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object' or str(df[col].dtype) == 'category':
        df[col] = label_encoder.fit_transform(df[col].astype(str))

# Step 6: Normalize numerical columns
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Step 7: Split into features and target
X = df.drop('survived', axis=1)
y = df['survived']

# Step 8: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 9: Train Decision Tree Classifier
tree_model = DecisionTreeClassifier(max_depth=4, random_state=42)  # Limit depth for clear visualization
tree_model.fit(X_train, y_train)

# Step 10: Evaluate model
y_pred = tree_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Decision Tree Accuracy: {accuracy:.2f}")

## Step 11: Visualize the tree
#plt.figure(figsize=(20, 10))
#plot_tree(tree_model, feature_names=X.columns, class_names=['Not Survived', 'Survived'],
 #         filled=True, rounded=True, fontsize=12)
#plt.title("Decision Tree for Titanic Survival")
#plt.show()

# Step 12: Predict on a new sample (e.g. hypothetical passenger)
# NOTE: Input values must match the order and scaling of your features
# Example: [pclass, sex, age, sibsp, parch, fare, embarked, class, who, adult_male, alone]
# We'll use the first row from the training set as an example of a "new passenger"
# Fix: Create a new DataFrame with correct feature names
sample = pd.DataFrame([X.iloc[0].values], columns=X.columns)
prediction = tree_model.predict(sample)#

print("\n Prediction for new sample:", "Survived" if prediction[0] == 1 else "Not Survived")




 Decision Tree Accuracy: 0.81

 Prediction for new sample: Not Survived


In [None]:
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# ✅ Updated preprocessing function
def preprocess(df, target_column):
    df = df.copy()  # prevent chained assignment
    df = df.dropna()

    # Label encode object and category columns
    for col in df.columns:
        if df[col].dtype == 'object' or str(df[col].dtype) == 'category':
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Separate features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Scale numerical features
    scaler = MinMaxScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    return train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Decision tree pruning strategies
pruning_params = [
    {'max_depth': 3},
    {'max_depth': 5},
    {'min_samples_split': 10},
    {'min_samples_leaf': 5},
    {'max_leaf_nodes': 10}
]

# Datasets: (name, df, target_column)
datasets = []

# 1. Titanic
titanic = sns.load_dataset('titanic')
titanic = titanic.drop(columns=['deck', 'embark_town', 'alive'])
datasets.append(('Titanic', titanic, 'survived'))

# 2. Iris
iris = load_iris(as_frame=True)
datasets.append(('Iris', iris.frame, 'target'))

# 3. Wine
wine = load_wine(as_frame=True)
datasets.append(('Wine', wine.frame, 'target'))

# 4. Breast Cancer
cancer = load_breast_cancer(as_frame=True)
datasets.append(('Breast Cancer', cancer.frame, 'target'))

# 5. Penguins
penguins = sns.load_dataset('penguins')
penguins = penguins.drop(columns=['island', 'species'])
penguins = penguins[penguins['sex'].notna()]  # drop missing targets
penguins['sex'] = penguins['sex'].map({'Male': 1, 'Female': 0})
datasets.append(('Penguins', penguins, 'sex'))

# Main loop to collect results
results = []

for dataset_name, df, target in datasets:
    try:
        X_train, X_test, y_train, y_test = preprocess(df, target)

        for params in pruning_params:
            clf = DecisionTreeClassifier(**params, random_state=42)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)

            results.append({
                'Dataset': dataset_name,
                'Pruning Method': str(params),
                'Accuracy': round(acc * 100, 2)
            })

    except Exception as e:
        print(f"❌ Error with dataset {dataset_name}: {e}")

# Create summary table
results_df = pd.DataFrame(results)

# Pivot for comparison
comparison_table = results_df.pivot(index='Dataset', columns='Pruning Method', values='Accuracy')
print("\n Decision Tree Pruning Comparison:\n")
print(comparison_table.round(2))

In [32]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Load the text dataset (selecting 4 categories for speed)
categories = ['sci.space', 'comp.graphics', 'rec.sport.baseball', 'talk.politics.misc']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# Define evaluation metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro'),
    'f1': make_scorer(f1_score, average='macro')
}

# -------------------------
#  Without Pruning
# -------------------------
tree_no_prune = DecisionTreeClassifier(random_state=42)
scores_no_prune = cross_validate(tree_no_prune, X, y, cv=10, scoring=scoring)

# -------------------------
#  With Pruning
# -------------------------
tree_pruned = DecisionTreeClassifier(max_depth=10, min_samples_split=10, random_state=42)
scores_pruned = cross_validate(tree_pruned, X, y, cv=10, scoring=scoring)

# -------------------------
# 📊 Compare Results
# -------------------------
def summarize_scores(scores):
    return {
        'Accuracy': round(scores['test_accuracy'].mean() * 100, 2),
        'Precision': round(scores['test_precision'].mean() * 100, 2),
        'Recall': round(scores['test_recall'].mean() * 100, 2),
        'F1 Score': round(scores['test_f1'].mean() * 100, 2)
    }

results = pd.DataFrame({
    'Without Pruning': summarize_scores(scores_no_prune),
    'With Pruning': summarize_scores(scores_pruned)
})

print(" Decision Tree Text Classification Results (10-fold CV):\n")
print(results)


 Decision Tree Text Classification Results (10-fold CV):

           Without Pruning  With Pruning
Accuracy             82.70         71.55
Precision            82.98         77.07
Recall               82.70         71.32
F1 Score             82.73         72.10


In [35]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import pandas as pd

# Load Iris dataset
iris = load_iris(as_frame=True)
X = iris.data
y = iris.target

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf.predict(X_test)
print(f"✅ Accuracy on Iris: {accuracy_score(y_test, y_pred):.2f}")

##Visualize One Tree from the Forest
#plt.figure(figsize=(20, 10))
#plot_tree(rf.estimators_[0],
 #         feature_names=iris.feature_names,
  #        class_names=iris.target_names,
   #       filled=True, rounded=True, max_depth=3)
#plt.title("🌳 Decision Tree from Iris Random Forest")
#plt.show()
##Predict New Sample
sample = pd.DataFrame([X_test.iloc[0].values], columns=X.columns)
prediction = rf.predict(sample)

print("🔮 Predicted species:", iris.target_names[prediction[0]])


✅ Accuracy on Iris: 1.00
🔮 Predicted species: versicolor


In [49]:
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the student performance dataset
data = fetch_openml(name="student-performance-uci", version=1, as_frame=True)
df = data.frame

#print("Columns in dataset:", df.columns.tolist())

# Step: Decide how to form the target
# If G3 exists, use it. If G1, G2 also exist, we can drop them or keep them depending on your problem.
# We'll make a binary pass/fail: pass if final grade >= 10 (you can adjust threshold)

if 'G3' in df.columns:
    df['pass'] = (df['G3'].astype(float) >= 10).astype(int)
else:
    raise ValueError("G3 column not found, cannot define pass/fail target")

# Drop all grade columns to prevent leakage (if present)
for grade_col in ['G1', 'G2', 'G3']:
    if grade_col in df.columns:
        df = df.drop(columns=[grade_col])

# Encode categorical columns
for col in df.select_dtypes(include=['object', 'category']).columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Features and target
X = df.drop('pass', axis=1)
y = df['pass']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict & evaluate
y_pred = rf.predict(X_test)
print(" Random Forest Accuracy:", accuracy_score(y_test, y_pred))

# Visualize one tree
#plt.figure(figsize=(20, 10))
#plot_tree(rf.estimators_[0],
 #         feature_names=X.columns,
  #        class_names=['Fail', 'Pass'],
   #      filled=True, rounded=True, max_depth=3)
#plt.title("Sample Tree from Random Forest (Student pass/fail prediction)")
#plt.show()

# Predict a new sample example
# You need to construct a new sample with the same features (encoded) as X
# For demonstration, use first test sample:
sample = pd.DataFrame([X_test.iloc[0].values], columns=X.columns)
pred = rf.predict(sample)
print(" Prediction for sample:", "Pass" if pred[0] == 1 else "Fail")


 Random Forest Accuracy: 0.8769230769230769
 Prediction for sample: Pass
