# Week 7 Solutions
## Machine Learning Problems - Comprehensive Analysis

## Question 1: California Housing Dataset - Model Comparison

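Load the California Housing dataset with `fetch_california_housing`, then split the data with `test_size=0.2` and `random_state=1`.
Train three models (Linear Regression, AdaBoost Regressor, and Decision Tree Regressor; the latter two with `random_state=1`) and compare their R2 score and RMSE on the test set.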

In [3]:
# Import required libraries for Question 1
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load the California Housing dataset.
#
# The preferred source is sklearn's own loader. If that call fails, fall back
# to a manually downloaded archive under ~/scikit_learn_data and rebuild the
# eight sklearn-style features from the raw CSV it contains.
import os
import tarfile

# Location of the manually downloaded archive used by the fallback path
data_home = os.path.expanduser('~/scikit_learn_data')
archive_path = os.path.join(data_home, 'cal_housing.tgz')

try:
    # Try to load from sklearn's cache first
    california = fetch_california_housing()
    print("✓ Loaded from sklearn cache")
except Exception:
    # `except Exception` (not a bare `except:`) so Ctrl-C and SystemExit still
    # propagate instead of silently triggering the fallback.
    print("sklearn cache not available, using downloaded data...")

    # Local import: only the fallback path needs the container type
    from sklearn.utils import Bunch

    if not os.path.exists(archive_path):
        raise FileNotFoundError(f"Archive not found at {archive_path}")

    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this with an archive you downloaded from a trusted source.
    with tarfile.open(archive_path, 'r:gz') as tar:
        tar.extractall(path=data_home)

    csv_path = os.path.join(data_home, 'housing.csv')
    if not os.path.exists(csv_path):
        raise FileNotFoundError("Could not find housing.csv after extraction")

    housing_raw = pd.read_csv(csv_path)
    print(f"Loaded CSV with shape: {housing_raw.shape}")

    # Handle missing values by dropping incomplete rows.
    # NOTE(review): this makes the fallback smaller than the raw CSV, so metrics
    # computed downstream may differ from those obtained on sklearn's own copy.
    df = housing_raw.dropna()

    # Per-household averages, derived to match sklearn's feature definitions
    df = df.assign(
        AveRooms=df['total_rooms'] / df['households'],
        AveBedrms=df['total_bedrooms'] / df['households'],
        AveOccup=df['population'] / df['households'],
    )

    # Feature matrix in sklearn's column order:
    # MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude
    feature_data = pd.DataFrame({
        'MedInc': df['median_income'],
        'HouseAge': df['housing_median_age'],
        'AveRooms': df['AveRooms'],
        'AveBedrms': df['AveBedrms'],
        'Population': df['population'],
        'AveOccup': df['AveOccup'],
        'Latitude': df['latitude'],
        'Longitude': df['longitude']
    })

    # Target scaled to units of 100,000 to match sklearn
    target_data = df['median_house_value'] / 100000.0

    # Same attribute-style container that sklearn's loaders return
    california = Bunch(
        data=feature_data.values,
        target=target_data.values,
        feature_names=list(feature_data.columns),
    )

    print(f"✓ Processed California Housing data to match sklearn format")

X = california.data
y = california.target

print(f"\n✓ Dataset ready!")
print(f"Dataset shape: X={X.shape}, y={y.shape}")
print(f"Features: {california.feature_names}")
print(f"Target range: {y.min():.2f} to {y.max():.2f}")


sklearn cache not available, using downloaded data...
Loaded CSV with shape: (20640, 10)
✓ Processed California Housing data to match sklearn format

✓ Dataset ready!
Dataset shape: X=(20433, 8), y=(20433,)
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Target range: 0.15 to 5.00


In [9]:
# Hold out 20% of the rows for testing; random_state=1 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Report how many rows ended up on each side of the split
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 16346
Test set size: 4087


In [10]:
# Model 1: Linear Regression, fitted on the training split
print("Training Model 1: Linear Regression")
model1 = LinearRegression().fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

# Test-set metrics: R2, and RMSE as the square root of the mean squared error
mse_model1 = mean_squared_error(y_test, y_pred1)
r2_model1 = r2_score(y_test, y_pred1)
rmse_model1 = np.sqrt(mse_model1)

print(
    "Model 1 - Linear Regression:",
    f"  R2 Score: {r2_model1:.6f}",
    f"  RMSE: {rmse_model1:.6f}",
    sep="\n",
)

Training Model 1: Linear Regression
Model 1 - Linear Regression:
  R2 Score: 0.599717
  RMSE: 0.729555


In [11]:
# Model 2: AdaBoost Regressor, seeded with random_state=1
print("\nTraining Model 2: AdaBoost Regressor")
model2 = AdaBoostRegressor(random_state=1).fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

# Test-set metrics: R2, and RMSE as the square root of the mean squared error
mse_model2 = mean_squared_error(y_test, y_pred2)
r2_model2 = r2_score(y_test, y_pred2)
rmse_model2 = np.sqrt(mse_model2)

print(
    "Model 2 - AdaBoost Regressor:",
    f"  R2 Score: {r2_model2:.6f}",
    f"  RMSE: {rmse_model2:.6f}",
    sep="\n",
)


Training Model 2: AdaBoost Regressor
Model 2 - AdaBoost Regressor:
  R2 Score: 0.427368
  RMSE: 0.872595


In [12]:
# Model 3: Decision Tree Regressor, seeded with random_state=1
print("\nTraining Model 3: Decision Tree Regressor")
model3 = DecisionTreeRegressor(random_state=1).fit(X_train, y_train)
y_pred3 = model3.predict(X_test)

# Test-set metrics: R2, and RMSE as the square root of the mean squared error
mse_model3 = mean_squared_error(y_test, y_pred3)
r2_model3 = r2_score(y_test, y_pred3)
rmse_model3 = np.sqrt(mse_model3)

print(
    "Model 3 - Decision Tree Regressor:",
    f"  R2 Score: {r2_model3:.6f}",
    f"  RMSE: {rmse_model3:.6f}",
    sep="\n",
)


Training Model 3: Decision Tree Regressor
Model 3 - Decision Tree Regressor:
  R2 Score: 0.581542
  RMSE: 0.745934


In [13]:
# Summary and Comparison
#
# Tabulates the test-set R2 and RMSE of the three regressors, ranks them by
# each metric, evaluates the four multiple-choice orderings (A-D), and records
# the labels of the orderings that hold in `correct_options`.
print("\n" + "="*60)
print("QUESTION 1 - SUMMARY AND COMPARISON")
print("="*60)

results_df = pd.DataFrame({
    'Model': ['Model 1: Linear Regression', 'Model 2: AdaBoost Regressor', 'Model 3: Decision Tree Regressor'],
    'R2 Score': [r2_model1, r2_model2, r2_model3],
    'RMSE': [rmse_model1, rmse_model2, rmse_model3]
})

print("\n", results_df.to_string(index=False))

# Sort by R2 Score (descending)
print("\n" + "-"*60)
print("R2 Score Ranking (Higher is Better):")
print("-"*60)
r2_sorted = results_df.sort_values('R2 Score', ascending=False)
for model_name, r2_value in zip(r2_sorted['Model'], r2_sorted['R2 Score']):
    print(f"{model_name}: {r2_value:.6f}")

# Sort by RMSE (ascending)
print("\n" + "-"*60)
print("RMSE Ranking (Lower is Better):")
print("-"*60)
rmse_sorted = results_df.sort_values('RMSE', ascending=True)
for model_name, rmse_value in zip(rmse_sorted['Model'], rmse_sorted['RMSE']):
    print(f"{model_name}: {rmse_value:.6f}")

# Check all options
print("\n" + "="*60)
print("CHECKING OPTIONS:")
print("="*60)

# Option A: r2_score (Model 2) > r2_score (Model 1) > r2_score (Model 3)
option_a = r2_model2 > r2_model1 > r2_model3
print(f"(A) r2_score (Model 2) > r2_score (Model 1) > r2_score (Model 3): {option_a}")
print(f"    {r2_model2:.6f} > {r2_model1:.6f} > {r2_model3:.6f}")

# Option B: r2_score (Model 3) > r2_score (Model 1) > r2_score (Model 2)
option_b = r2_model3 > r2_model1 > r2_model2
print(f"\n(B) r2_score (Model 3) > r2_score (Model 1) > r2_score (Model 2): {option_b}")
print(f"    {r2_model3:.6f} > {r2_model1:.6f} > {r2_model2:.6f}")

# Option C: RMSE (Model 2) > RMSE (Model 1) > RMSE (Model 3)
option_c = rmse_model2 > rmse_model1 > rmse_model3
print(f"\n(C) RMSE (Model 2) > RMSE (Model 1) > RMSE (Model 3): {option_c}")
print(f"    {rmse_model2:.6f} > {rmse_model1:.6f} > {rmse_model3:.6f}")

# Option D: RMSE (Model 3) > RMSE (Model 1) > RMSE (Model 2)
option_d = rmse_model3 > rmse_model1 > rmse_model2
print(f"\n(D) RMSE (Model 3) > RMSE (Model 1) > RMSE (Model 2): {option_d}")
print(f"    {rmse_model3:.6f} > {rmse_model1:.6f} > {rmse_model2:.6f}")

# Determine correct answer: keep the labels of every ordering that holds
print("\n" + "="*60)
option_flags = (('A', option_a), ('B', option_b), ('C', option_c), ('D', option_d))
correct_options = [label for label, holds in option_flags if holds]

if correct_options:
    print(f"CORRECT ANSWER(S) FOR QUESTION 1: {', '.join(correct_options)}")
else:
    # Say so explicitly rather than printing an empty answer when no ordering holds
    print("CORRECT ANSWER(S) FOR QUESTION 1: none of the options (A)-(D) holds for these metrics")
print("="*60)


QUESTION 1 - SUMMARY AND COMPARISON

                            Model  R2 Score     RMSE
      Model 1: Linear Regression  0.599717 0.729555
     Model 2: AdaBoost Regressor  0.427368 0.872595
Model 3: Decision Tree Regressor  0.581542 0.745934

------------------------------------------------------------
R2 Score Ranking (Higher is Better):
------------------------------------------------------------
Model 1: Linear Regression: 0.599717
Model 3: Decision Tree Regressor: 0.581542
Model 2: AdaBoost Regressor: 0.427368

------------------------------------------------------------
RMSE Ranking (Lower is Better):
------------------------------------------------------------
Model 1: Linear Regression: 0.729555
Model 3: Decision Tree Regressor: 0.745934
Model 2: AdaBoost Regressor: 0.872595

CHECKING OPTIONS:
(A) r2_score (Model 2) > r2_score (Model 1) > r2_score (Model 3): False
    0.427368 > 0.599717 > 0.581542

(B) r2_score (Model 3) > r2_score (Model 1) > r2_score (Model 2): False
   

## Questions 2 & 3: Iris Dataset - Logistic Regression Classification

Load Iris dataset, split with test_size=0.33 and random_state=1.
Train Logistic Regression model and analyze results.

In [15]:
# Import required libraries for Questions 2 & 3
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [16]:
# Fetch the bundled Iris data and pull out the feature matrix and label vector
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Quick overview: shapes, feature and class names, and samples per class
iris_class_counts = np.bincount(y_iris)
print(
    f"Iris dataset shape: X={X_iris.shape}, y={y_iris.shape}",
    f"Features: {iris.feature_names}",
    f"Target classes: {iris.target_names}",
    f"Class distribution: {iris_class_counts}",
    sep="\n",
)

Iris dataset shape: X=(150, 4), y=(150,)
Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target classes: ['setosa' 'versicolor' 'virginica']
Class distribution: [50 50 50]


In [17]:
# Reserve 33% of the Iris samples for testing; random_state=1 fixes the shuffle
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris,
    y_iris,
    test_size=0.33,
    random_state=1,
)

# Report how many samples ended up on each side of the split
print(f"Training set size: {len(X_train_iris)}")
print(f"Test set size: {len(X_test_iris)}")

Training set size: 100
Test set size: 50


In [18]:
# Fit a Logistic Regression classifier (random_state=1, up to 200 iterations)
log_reg = LogisticRegression(random_state=1, max_iter=200).fit(X_train_iris, y_train_iris)

# Predicted labels for the held-out samples, reused by the later analysis cells
y_pred_iris = log_reg.predict(X_test_iris)

# Mean accuracy on each split, as reported by the estimator's score()
train_accuracy_iris = log_reg.score(X_train_iris, y_train_iris)
test_accuracy_iris = log_reg.score(X_test_iris, y_test_iris)

print("Logistic Regression model trained successfully")
print(f"Training score: {train_accuracy_iris:.4f}")
print(f"Test score: {test_accuracy_iris:.4f}")

Logistic Regression model trained successfully
Training score: 0.9800
Test score: 0.9800


In [19]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test_iris, y_pred_iris)

print("\n" + "="*60, "CONFUSION MATRIX:", "="*60, sep="\n")
print(cm)

# Same matrix wrapped in a labelled DataFrame for readability
print("\nConfusion Matrix (formatted):")
actual_labels = [f"Actual {k}" for k in range(3)]
predicted_labels = [f"Predicted {k}" for k in range(3)]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
print(cm_df)


CONFUSION MATRIX:
[[17  0  0]
 [ 0 18  1]
 [ 0  0 14]]

Confusion Matrix (formatted):
          Predicted 0  Predicted 1  Predicted 2
Actual 0           17            0            0
Actual 1            0           18            1
Actual 2            0            0           14


In [20]:
# Per-class precision / recall / F1 on the test split, labelled with the species names
iris_report = classification_report(
    y_test_iris,
    y_pred_iris,
    target_names=iris.target_names,
)

print("\n" + "="*60, "CLASSIFICATION REPORT:", "="*60, sep="\n")
print(iris_report)


CLASSIFICATION REPORT:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        17
  versicolor       1.00      0.95      0.97        19
   virginica       0.93      1.00      0.97        14

    accuracy                           0.98        50
   macro avg       0.98      0.98      0.98        50
weighted avg       0.98      0.98      0.98        50



In [21]:
# Question 2: How many samples has the model misclassified?
print("\n" + "="*60, "QUESTION 2: NUMBER OF MISCLASSIFIED SAMPLES", "="*60, sep="\n")

# Direct count: test samples whose predicted label differs from the true label
n_test_iris = len(y_test_iris)
n_correct_iris = np.sum(y_pred_iris == y_test_iris)
misclassified = np.sum(y_pred_iris != y_test_iris)

# Cross-check: everything in the confusion matrix that is off the diagonal
misclassified_from_cm = np.sum(cm) - np.trace(cm)

print(f"Total test samples: {n_test_iris}")
print(f"Correctly classified: {n_correct_iris}")
print(f"Misclassified samples: {misclassified}")
print(f"Misclassified (from confusion matrix): {misclassified_from_cm}")
print(f"\nAccuracy: {n_correct_iris / n_test_iris * 100:.2f}%")

print("\n" + "="*60, f"ANSWER TO QUESTION 2: {misclassified}", "="*60, sep="\n")


QUESTION 2: NUMBER OF MISCLASSIFIED SAMPLES
Total test samples: 50
Correctly classified: 49
Misclassified samples: 1
Misclassified (from confusion matrix): 1

Accuracy: 98.00%

ANSWER TO QUESTION 2: 1


In [22]:
# Question 3: What is the recall for class 1?
from sklearn.metrics import recall_score

print("\n" + "="*60, "QUESTION 3: RECALL FOR CLASS 1", "="*60, sep="\n")

# Per-class recall computed with recall_score, one call per class label
recall_class_0, recall_class_1, recall_class_2 = (
    recall_score(y_test_iris, y_pred_iris, labels=[label], average=None)[0]
    for label in range(3)
)

print(
    f"Recall for class 0: {recall_class_0:.4f}",
    f"Recall for class 1: {recall_class_1:.4f}",
    f"Recall for class 2: {recall_class_2:.4f}",
    sep="\n",
)

# Cross-check for class 1 straight from the confusion matrix:
#   recall = TP / (TP + FN)
#   TP is the diagonal entry of row 1; FN is the rest of row 1
tp_class1 = cm[1, 1]
fn_class1 = np.sum(cm[1, :]) - tp_class1
actual_class1_total = tp_class1 + fn_class1
recall_class1_manual = tp_class1 / actual_class1_total

print(f"\nManual calculation for class 1:")
print(f"  True Positives (TP): {tp_class1}")
print(f"  False Negatives (FN): {fn_class1}")
print(f"  Recall = TP / (TP + FN) = {tp_class1} / {actual_class1_total} = {recall_class1_manual:.4f}")

print("\n" + "="*60, f"ANSWER TO QUESTION 3: {recall_class_1:.2f}", "="*60, sep="\n")


QUESTION 3: RECALL FOR CLASS 1
Recall for class 0: 1.0000
Recall for class 1: 0.9474
Recall for class 2: 1.0000

Manual calculation for class 1:
  True Positives (TP): 18
  False Negatives (FN): 1
  Recall = TP / (TP + FN) = 18 / 19 = 0.9474

ANSWER TO QUESTION 3: 0.95


## Questions 4 & 5: 20 Newsgroups Dataset - Text Classification

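Load the train subset of the 20 Newsgroups dataset, vectorize it with `TfidfVectorizer`, split the vectorized data with `test_size=0.3` and `random_state=1`, and train a `MultinomialNB` classifier.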

In [23]:
# Import required libraries for Questions 4 & 5
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [27]:
# Load the train subset of 20newsgroups with return_X_y=True.
#
# Falls back to local copies under ~/scikit_learn_data when the sklearn loader
# fails: first a pickled cache file, then the extracted per-category folders.
print("Loading 20newsgroups dataset...")

try:
    X_news, y_news = fetch_20newsgroups(subset='train', return_X_y=True, random_state=1)
    print("✓ Loaded from sklearn")
except Exception:
    # `except Exception` (not a bare `except:`) so Ctrl-C and SystemExit still
    # propagate instead of silently triggering the fallback.
    print("Standard download failed, loading from local cache...")
    import os
    import pickle

    cache_path = os.path.expanduser('~/scikit_learn_data/20news_home/20news-bydate_py3.pkz')

    if os.path.exists(cache_path):
        # SECURITY: pickle.load can execute arbitrary code; only load a cache
        # file you created or downloaded yourself from a trusted source.
        with open(cache_path, 'rb') as f:
            cache = pickle.load(f)

        train_data = cache['train']
        loaded_message = "✓ Loaded from local pickle cache"
    else:
        # Last resort: load directly from extracted files, using the public
        # sklearn.datasets.load_files rather than a private-module import.
        from sklearn.datasets import load_files
        train_path = os.path.expanduser('~/scikit_learn_data/20news-bydate-train')
        train_data = load_files(train_path, encoding='latin1', decode_error='ignore')
        loaded_message = "✓ Loaded directly from extracted files"

    # Both fallbacks expose the documents and labels as .data / .target
    X_news = train_data.data
    y_news = train_data.target
    print(loaded_message)

print(f"\nDataset loaded successfully")
print(f"Number of documents: {len(X_news)}")
print(f"Number of categories: {len(np.unique(y_news))}")
print(f"Type of X: {type(X_news)}")
print(f"Type of y: {type(y_news)}")
print(f"Shape of y: {y_news.shape}")


Loading 20newsgroups dataset...
Standard download failed, loading from local cache...
✓ Loaded from local pickle cache

Dataset loaded successfully
Number of documents: 11314
Number of categories: 20
Type of X: <class 'list'>
Type of y: <class 'numpy.ndarray'>
Shape of y: (11314,)


In [28]:
# Turn the raw documents into a TF-IDF matrix (vocabulary learned from X_news itself)
print("\nVectorizing text data using TfidfVectorizer...")
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X_news)

# Report the container type and the (documents, features) shape of the result
print(
    "\nVectorization complete",
    f"Type of vectorized data: {type(X_vectorized)}",
    f"Shape of vectorized data: {X_vectorized.shape}",
    sep="\n",
)


Vectorizing text data using TfidfVectorizer...

Vectorization complete
Type of vectorized data: <class 'scipy.sparse._csr.csr_matrix'>
Shape of vectorized data: (11314, 130107)


In [29]:
# Question 4: Shape of the fitted and transformed data
print("\n" + "="*60, "QUESTION 4: SHAPE OF FITTED AND TRANSFORMED DATA", "="*60, sep="\n")

shape_X = X_vectorized.shape
n_documents, n_features = shape_X
print(f"\nShape of vectorized X: {shape_X}")
print(f"Number of documents: {n_documents}")
print(f"Number of features (vocabulary size): {n_features}")

# Candidate shapes offered by the question, keyed by option letter
print("\n" + "-"*60, "CHECKING OPTIONS:", "-"*60, sep="\n")

options = {
    'A': (11310, 130507),
    'B': (21314, 190807),
    'C': (11514, 160107),
    'D': (11314, 130107)
}

# Show, for every option, whether it equals the observed shape
for letter, candidate_shape in options.items():
    print(f"({letter}) {candidate_shape}: {candidate_shape == shape_X}")

# Announce the first matching option, or the actual shape when nothing matches
print("\n" + "="*60)
first_match = next(
    ((letter, candidate_shape)
     for letter, candidate_shape in options.items()
     if candidate_shape == shape_X),
    None,
)
if first_match is None:
    print(f"ANSWER TO QUESTION 4: Actual shape is {shape_X}")
else:
    matched_letter, matched_shape = first_match
    print(f"ANSWER TO QUESTION 4: ({matched_letter}) {matched_shape}")
print("="*60)


QUESTION 4: SHAPE OF FITTED AND TRANSFORMED DATA

Shape of vectorized X: (11314, 130107)
Number of documents: 11314
Number of features (vocabulary size): 130107

------------------------------------------------------------
CHECKING OPTIONS:
------------------------------------------------------------
(A) (11310, 130507): False
(B) (21314, 190807): False
(C) (11514, 160107): False
(D) (11314, 130107): True

ANSWER TO QUESTION 4: (D) (11314, 130107)


In [30]:
# Hold out 30% of the vectorized documents for validation; random_state=1 fixes the shuffle
print("\nSplitting data into train and validation sets...")
X_train_news, X_val_news, y_train_news, y_val_news = train_test_split(
    X_vectorized,
    y_news,
    test_size=0.3,
    random_state=1,
)

# Row counts and full shapes of both partitions
train_shape_news = X_train_news.shape
val_shape_news = X_val_news.shape
print(
    f"Training set size: {train_shape_news[0]}",
    f"Validation set size: {val_shape_news[0]}",
    f"Training set shape: {train_shape_news}",
    f"Validation set shape: {val_shape_news}",
    sep="\n",
)


Splitting data into train and validation sets...
Training set size: 7919
Validation set size: 3395
Training set shape: (7919, 130107)
Validation set shape: (3395, 130107)


In [31]:
# Question 5: Train MultinomialNB and compute validation score
from sklearn.metrics import accuracy_score

print("\n" + "="*60, "QUESTION 5: MULTINOMIAL NB VALIDATION SCORE", "="*60, sep="\n")

print("\nTraining MultinomialNB model...")
mnb = MultinomialNB()
mnb.fit(X_train_news, y_train_news)

# Scores on the training and validation partitions via the estimator's score()
train_score = mnb.score(X_train_news, y_train_news)
val_score = mnb.score(X_val_news, y_val_news)

print(
    "\nModel training complete",
    f"Training score: {train_score:.6f}",
    f"Validation score: {val_score:.6f}",
    sep="\n",
)

# Independent check: accuracy recomputed from explicit validation predictions
y_pred_news = mnb.predict(X_val_news)
accuracy = accuracy_score(y_val_news, y_pred_news)
print(f"Validation accuracy (verification): {accuracy:.6f}")

print("\n" + "="*60)
print(f"ANSWER TO QUESTION 5: {val_score:.6f}")
print(f"Validation Score (more precision): {val_score}")
print("="*60)


QUESTION 5: MULTINOMIAL NB VALIDATION SCORE

Training MultinomialNB model...

Model training complete
Training score: 0.936482
Validation score: 0.840648
Validation accuracy (verification): 0.840648

ANSWER TO QUESTION 5: 0.840648
Validation Score (more precision): 0.8406480117820324


## Final Summary of All Answers

In [32]:
# Print final summary of all answers.
#
# Relies on the results computed by the earlier question cells
# (r2_model*/rmse_model*, correct_options, misclassified, recall_class_1,
# shape_X, options, val_score); run those cells first.
print("\n" + "#"*70)
print("#" + " "*68 + "#")
print("#" + " "*20 + "FINAL ANSWERS SUMMARY" + " "*27 + "#")
print("#" + " "*68 + "#")
print("#"*70)

print("\n" + "="*70)
print("QUESTION 1: California Housing - Model Comparison")
print("="*70)
print(f"Model 1 (Linear Regression)    - R2: {r2_model1:.6f}, RMSE: {rmse_model1:.6f}")
print(f"Model 2 (AdaBoost Regressor)   - R2: {r2_model2:.6f}, RMSE: {rmse_model2:.6f}")
print(f"Model 3 (Decision Tree)        - R2: {r2_model3:.6f}, RMSE: {rmse_model3:.6f}")
print("\nCorrect option(s):")
if correct_options:
    for opt in correct_options:
        print(f"  ✓ Option {opt}")
else:
    # Make the "nothing matched" case visible instead of printing an empty list
    print("  (none of the listed options matched)")

print("\n" + "="*70)
print("QUESTION 2: Iris Dataset - Misclassified Samples")
print("="*70)
# Singular/plural agreement so a count of 1 reads "1 sample"
sample_word = "sample" if misclassified == 1 else "samples"
print(f"Answer: {misclassified} {sample_word} misclassified")

print("\n" + "="*70)
print("QUESTION 3: Iris Dataset - Recall for Class 1")
print("="*70)
print(f"Answer: {recall_class_1:.2f}")

print("\n" + "="*70)
print("QUESTION 4: 20 Newsgroups - Shape of Vectorized Data")
print("="*70)
print(f"Answer: {shape_X}")
# Tick every multiple-choice option whose shape equals the observed one
for option, expected_shape in options.items():
    if shape_X == expected_shape:
        print(f"  ✓ Option {option}: {expected_shape}")

print("\n" + "="*70)
print("QUESTION 5: 20 Newsgroups - MultinomialNB Validation Score")
print("="*70)
print(f"Answer: {val_score:.6f}")
print(f"Full precision: {val_score}")

print("\n" + "#"*70)
print("#" + " "*22 + "END OF ANALYSIS" + " "*31 + "#")
print("#"*70)


######################################################################
#                                                                    #
#                    FINAL ANSWERS SUMMARY                           #
#                                                                    #
######################################################################

QUESTION 1: California Housing - Model Comparison
Model 1 (Linear Regression)    - R2: 0.599717, RMSE: 0.729555
Model 2 (AdaBoost Regressor)   - R2: 0.427368, RMSE: 0.872595
Model 3 (Decision Tree)        - R2: 0.581542, RMSE: 0.745934

Correct option(s):

QUESTION 2: Iris Dataset - Misclassified Samples
Answer: 1 samples misclassified

QUESTION 3: Iris Dataset - Recall for Class 1
Answer: 0.95

QUESTION 4: 20 Newsgroups - Shape of Vectorized Data
Answer: (11314, 130107)
  ✓ Option D: (11314, 130107)

QUESTION 5: 20 Newsgroups - MultinomialNB Validation Score
Answer: 0.840648
Full precision: 0.8406480117820324

########################