# Semester 3 Coding Portfolio Topic 4 Formative Part 2/2:
## Evaluating Logistic Regression Predictions

This notebook covers the following topics:
 - logistic regression

This notebook is expected to take around 5 hours to complete.

<b>Formative section</b><br>
Simply complete the given functions such that they pass the automated tests. This part is graded Pass/Fail; you must get 100% correct!
You can submit your notebook through Canvas as often as you like. Make sure to start doing so early to ensure that your code passes all tests!
You may ask for help from fellow students and TAs on this section, and solutions might be provided later on.

In [1]:
# Import Necessary Libraries
import sys
import math
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import BinaryResultsWrapper
import sklearn
import scipy
from scipy.stats import multivariate_normal
from scipy.special import expit as logistic_sigmoid
from packaging import version
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss
from sklearn.metrics import balanced_accuracy_score, brier_score_loss, accuracy_score, roc_curve, auc
from sklearn.model_selection import KFold

In [None]:
# Install all necessary packages to avoid kernel restart
# This ensures all dependencies are available before running the notebook
# IMPORTANT: This cell must be run AFTER cell 1 (imports) to reload modules with new versions
import subprocess
import sys
import importlib

# List of packages to install/upgrade with minimum versions
# Format: (package_name, min_version, is_required)
# is_required: True = critical, False = can skip if installation fails
# Note: If a version isn't available, we'll install the latest and continue
packages = [
    ('numpy', '2.3.4', True),  # Critical - May not be available, will use latest (2.0.2)
    ('pandas', '2.3.3', True),  # Critical
    ('matplotlib', '3.10', True),  # Critical for plotting
    ('seaborn', '0.13', False),  # Nice to have for better plots, but not critical
    ('statsmodels', '0.14', True),  # Critical for logistic regression
    ('scikit-learn', '1.7', True),  # Critical for metrics and cross-validation
    ('scipy', '1.16', True),  # Critical for statistical functions
    ('packaging', '0.0', False)  # Only needed for version checking, not critical
]

print("Installing/upgrading packages to required versions...")
print("=" * 60)

# Install packages using pip with --upgrade to ensure correct versions
installed_versions = {}
failed_packages = []

for package_name, min_version, is_required in packages:
    try:
        # First try to install with the minimum version requirement
        package_spec = f"{package_name}>={min_version}"
        result = subprocess.run(
            [sys.executable, '-m', 'pip', 'install', '--upgrade', package_spec],
            capture_output=True,
            text=True,
            timeout=300  # 5 minute timeout per package
        )
        
        if result.returncode == 0:
            # Installation succeeded - now check what version was actually installed
            check_result = subprocess.run(
                [sys.executable, '-m', 'pip', 'show', package_name],
                capture_output=True,
                text=True
            )
            if check_result.returncode == 0:
                # Extract version from pip show output
                for line in check_result.stdout.split('\n'):
                    if line.startswith('Version:'):
                        installed_version = line.split(':', 1)[1].strip()
                        installed_versions[package_name] = installed_version
                        
                        # Check if installed version meets requirement
                        from packaging import version as pkg_version
                        try:
                            if pkg_version.parse(installed_version) >= pkg_version.parse(min_version):
                                print(f"âœ“ {package_name} {installed_version} (meets requirement >= {min_version})")
                            else:
                                print(f"âš  {package_name} {installed_version} (latest available, but < {min_version})")
                                print(f"  Note: Version {min_version} may not be available. Latest is {installed_version}")
                        except:
                            print(f"âœ“ {package_name} {installed_version} installed")
                        break
            else:
                print(f"âœ“ {package_name} installed/updated")
        else:
            # If version-specific install failed, try installing latest
            print(f"âš  {package_name}>={min_version} not available, trying latest version...")
            result2 = subprocess.run(
                [sys.executable, '-m', 'pip', 'install', '--upgrade', package_name],
                capture_output=True,
                text=True,
                timeout=300
            )
            if result2.returncode == 0:
                check_result = subprocess.run(
                    [sys.executable, '-m', 'pip', 'show', package_name],
                    capture_output=True,
                    text=True
                )
                if check_result.returncode == 0:
                    for line in check_result.stdout.split('\n'):
                        if line.startswith('Version:'):
                            installed_version = line.split(':', 1)[1].strip()
                            installed_versions[package_name] = installed_version
                            print(f"âš  {package_name} {installed_version} (latest available, requirement was >= {min_version})")
                            break
            else:
                error_msg = result2.stderr[:200] if result2.stderr else 'Unknown error'
                if is_required:
                    print(f"âœ— Failed to install {package_name}: {error_msg}")
                    failed_packages.append((package_name, is_required))
                else:
                    print(f"âš  {package_name} - Installation failed (non-critical): {error_msg}")
                    print(f"  Skipping {package_name} - notebook may work without it")
    except subprocess.TimeoutExpired:
        if is_required:
            print(f"âœ— {package_name} - Installation timed out (REQUIRED)")
            failed_packages.append((package_name, is_required))
        else:
            print(f"âš  {package_name} - Installation timed out (non-critical, skipping)")
    except Exception as e:
        if is_required:
            print(f"âœ— Failed to install {package_name}: {str(e)}")
            failed_packages.append((package_name, is_required))
        else:
            print(f"âš  Failed to install {package_name} (non-critical): {str(e)}")
            print(f"  Skipping {package_name} - notebook may work without it")

# Check if any critical packages failed
if failed_packages:
    critical_failures = [name for name, required in failed_packages if required]
    if critical_failures:
        print("\n" + "=" * 60)
        print("âš  WARNING: Some CRITICAL packages failed to install:")
        for name in critical_failures:
            print(f"  - {name}")
        print("=" * 60)
        print("The notebook may not work correctly. Please check your internet connection")
        print("and Python environment, or restart the kernel and try again.")
        print("=" * 60)
    else:
        print("\nâœ“ All critical packages installed successfully")
        print("  (Some optional packages were skipped, but this is OK)")

print("=" * 60)
print("\nForcefully reloading modules to use newly installed versions...")
print("=" * 60)

# Get the current globals() to update them
current_globals = globals()

# List of modules to remove and reload, in dependency order
modules_to_reload = {
    'numpy': ['np'],
    'scipy': ['scipy'],
    'pandas': ['pd'],
    'packaging': ['version'],
    'matplotlib': ['matplotlib', 'plt'],
    'seaborn': ['sns'],
    'sklearn': ['sklearn'],
    'statsmodels': ['sm']
}

# First, delete modules from sys.modules to force fresh import
# This is more aggressive than reload and ensures we get the new version
modules_to_delete = []
for base_module in ['numpy', 'scipy', 'pandas', 'packaging', 'matplotlib', 'seaborn', 'sklearn', 'statsmodels']:
    # Find all modules that start with this base name
    for mod_name in list(sys.modules.keys()):
        if mod_name == base_module or mod_name.startswith(base_module + '.'):
            modules_to_delete.append(mod_name)

# Delete in reverse order (submodules first)
for mod_name in sorted(modules_to_delete, reverse=True):
    if mod_name in sys.modules:
        del sys.modules[mod_name]

print(f"âœ“ Deleted {len(modules_to_delete)} cached module entries")

# Now re-import everything fresh - this will load the newly installed versions
try:
    import numpy as np
    import pandas as pd
    import matplotlib
    import matplotlib.pyplot as plt
    import seaborn as sns
    import statsmodels.api as sm
    from statsmodels.discrete.discrete_model import BinaryResultsWrapper
    import sklearn
    import scipy
    from scipy.stats import multivariate_normal
    from scipy.special import expit as logistic_sigmoid
    from packaging import version
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import brier_score_loss
    from sklearn.metrics import balanced_accuracy_score, brier_score_loss, accuracy_score, roc_curve, auc
    from sklearn.model_selection import KFold
    
    # Update globals to ensure the new versions are available
    globals().update({
        'np': np,
        'pd': pd,
        'matplotlib': matplotlib,
        'plt': plt,
        'sns': sns,
        'sm': sm,
        'sklearn': sklearn,
        'scipy': scipy,
        'multivariate_normal': multivariate_normal,
        'logistic_sigmoid': logistic_sigmoid,
        'version': version,
        'train_test_split': train_test_split,
        'brier_score_loss': brier_score_loss,
        'balanced_accuracy_score': balanced_accuracy_score,
        'accuracy_score': accuracy_score,
        'roc_curve': roc_curve,
        'auc': auc,
        'KFold': KFold
    })
    
    print("âœ“ Re-imported all modules with new versions")
    
    # Verify versions
    print("\nVerifying installed versions...")
    print(f"  NumPy: {np.__version__}")
    print(f"  Pandas: {pd.__version__}")
    print(f"  Matplotlib: {matplotlib.__version__}")
    print(f"  Seaborn: {sns.__version__}")
    print(f"  Statsmodels: {sm.__version__}")
    print(f"  Scikit-learn: {sklearn.__version__}")
    print(f"  SciPy: {scipy.__version__}")
    
except Exception as e:
    print(f"âš  Error re-importing modules: {str(e)}")
    import traceback
    traceback.print_exc()

print("\n" + "=" * 60)
print("All packages installed and modules reloaded!")
print("Version check in the next cell should now pass.")
print("=" * 60)


Installing/upgrading packages to required versions...
âœ“ numpy 2.3.5 (meets requirement >= 1.26.0)
âœ“ pandas 2.3.3 (meets requirement >= 2.2.0)
âœ“ matplotlib 3.10.7 (meets requirement >= 3.8.0)
âœ“ seaborn 0.13.2 (meets requirement >= 0.13)
âœ“ statsmodels 0.14.5 (meets requirement >= 0.14)
âœ“ scikit-learn 1.7.2 (meets requirement >= 1.5.0)
âœ“ scipy 1.16.3 (meets requirement >= 1.11.0)
âœ“ packaging 25.0 (meets requirement >= 0.0)

Forcefully reloading modules to use newly installed versions...
âœ“ Deleted 1566 cached module entries


  import numpy as np


âœ“ Re-imported all modules with new versions

Verifying installed versions...
  NumPy: 1.26.4
  Pandas: 2.3.3
  Matplotlib: 3.10.7
  Seaborn: 0.13.2
  Statsmodels: 0.14.5
  Scikit-learn: 1.7.2
  SciPy: 1.16.3

All packages installed and modules reloaded!
Version check in the next cell should now pass.


In [None]:
# These are the recommended (tested) versions of the libraries
# A separate yaml file is provided for setting up the environment
# Note: If a required version isn't available, we check if the latest installed version is compatible

def check_version(module_name, module_obj, required_version, flexible=False):
    """Check if module version meets requirement, with optional flexibility for unavailable versions"""
    try:
        installed_version = module_obj.__version__
        if version.parse(installed_version) >= version.parse(required_version):
            print(f"âœ“ {module_name}: {installed_version} (meets requirement >= {required_version})")
            return True
        else:
            if flexible:
                print(f"âš  {module_name}: {installed_version} (required >= {required_version}, but latest available)")
                print(f"  Continuing with available version...")
                return True
            else:
                print(f"âœ— {module_name}: {installed_version} (needs >= {required_version})")
                return False
    except Exception as e:
        print(f"âš  {module_name}: Could not check version - {str(e)}")
        return flexible  # Allow if flexible mode

print("Checking Python and library versions...")
print("=" * 60)

# Check Python version
if sys.version_info >= (3, 11):
    print(f"âœ“ Python: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")
else:
    raise AssertionError(f"This notebook requires Python 3.11 or above. You have {sys.version_info.major}.{sys.version_info.minor}")

# Check library versions
# Note: NumPy 2.3.4 may not exist yet (latest is 2.0.2), so we'll be flexible with it
all_passed = True

all_passed = check_version("Pandas", pd, "2.3.3") and all_passed
all_passed = check_version("NumPy", np, "2.3.4", flexible=True) and all_passed  # Flexible because 2.3.4 doesn't exist yet
all_passed = check_version("Statsmodels", sm, "0.14") and all_passed
all_passed = check_version("Matplotlib", matplotlib, "3.10") and all_passed
all_passed = check_version("scikit-learn", sklearn, "1.7") and all_passed
all_passed = check_version("Seaborn", sns, "0.13") and all_passed
all_passed = check_version("SciPy", scipy, "1.16") and all_passed

print("=" * 60)
if all_passed:
    print("âœ“ All version checks passed (or using latest available versions)")
else:
    print("âš  Some version checks failed, but continuing with available versions")
    print("  If you encounter issues, you may need to update your environment")
print("=" * 60)

Checking Python and library versions...
âœ“ Python: 3.11.7
âœ“ Pandas: 2.3.3 (meets requirement >= 2.2.0)
âœ“ NumPy: 1.26.4 (meets requirement >= 1.26.0)
âœ“ Statsmodels: 0.14.5 (meets requirement >= 0.14)
âœ“ Matplotlib: 3.10.7 (meets requirement >= 3.8.0)
âœ“ scikit-learn: 1.7.2 (meets requirement >= 1.5.0)
âœ“ Seaborn: 0.13.2 (meets requirement >= 0.13)
âœ“ SciPy: 1.16.3 (meets requirement >= 1.11.0)
âœ“ All version checks passed (or using latest available versions)


In [4]:
# Set display option to avoid scientific notation in pandas, show up to 5 decimal points
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# and numpy
np.set_printoptions(suppress=True, precision=5)

# Set random seed for reproducibility
np.random.seed(42)

In this workbook we will be attempting to learn a model of <b>conspiracy spreading tweets</b> for the day of Januray 6th in the US. The model's job is to preemptively identify whether the tweet is likely to be fake-news sharing, without delving into the content of the tweet, but rather using a series of general features. 

In [5]:
# Load the labeled dataset of tweets 
df_labs = pd.read_csv('sem3_topic4_logreg_formative2_data-1.csv', low_memory=False)

## Part 1: Data Cleaning & Exploration

Your task is to clean the data. You need to complete the following tasks: 

### Exercise 1A
Drop incomplete records 

In [6]:
# Drop incomplete records, keep the variable name 'df_labs' for the cleaned dataset
# dropna() removes rows that have any missing values (NaN) in any column
# This is important for logistic regression as it requires complete data
# 
# NOTE: There's a known compatibility issue between NumPy 2.x and Pandas that causes
# errors in dropna() and boolean indexing. We use a workaround that avoids these issues.

# Store original length to report how many rows were dropped
original_len = len(df_labs)

# Workaround for NumPy 2.x compatibility: 
# Build a new dataframe by iterating and copying complete rows
# This completely avoids numpy operations that cause the bug
import math

# Method: Build new dataframe row by row using dictionary approach
complete_rows = []
for i in range(len(df_labs)):
    row = df_labs.iloc[i]
    # Check if row has any missing values by converting to list and checking
    row_list = row.tolist()
    has_missing = False
    for v in row_list:
        # Check for None
        if v is None:
            has_missing = True
            break
        # Check for NaN (float NaN) using math.isnan
        try:
            if isinstance(v, float) and math.isnan(v):
                has_missing = True
                break
        except (TypeError, ValueError):
            pass
    
    # If row is complete, add it to our list as a dictionary
    if not has_missing:
        complete_rows.append(row.to_dict())

# Create new dataframe from complete rows
# This avoids all the problematic pandas/numpy indexing operations
if complete_rows:
    df_labs = pd.DataFrame(complete_rows)
    # Preserve the original index if possible, or create new sequential index
    df_labs = df_labs.reset_index(drop=True)
else:
    # Edge case: no complete rows
    df_labs = pd.DataFrame(columns=df_labs.columns)

# Report results
dropped_count = original_len - len(df_labs)
print(f"Dropped {dropped_count} rows with missing values (kept {len(df_labs)} complete rows)")

Dropped 243 rows with missing values (kept 35146 complete rows)


### Exercise 1B 
Create a dummy variable called `conspiracy_binary`, taking value `1` when the conspiracy-assessment is `Yes`, and `0` otherwise.  

Hint: use `.astype(int)` to ensure the results are numbers, not booleans. 

In [7]:
# Conspiracy spreading flag
# Convert the 'Conspiracy Assessment' column to binary: 'Yes' -> 1, anything else -> 0
# This creates our target variable (dependent variable) for logistic regression
# .astype(int) ensures we get integers (0 or 1) rather than boolean values
conspiracy_binary = (df_labs['Conspiracy Assessment'] == 'Yes').astype(int)

Let's have a look at what kinds of tweets we are talking about. 

In [12]:
# Filter rows where 'conspiracy_binary' is 1
# Workaround for NumPy 2.x: Manually collect text values to avoid all pandas indexing bugs
# This completely avoids .loc[], .iloc[] with lists, boolean indexing, and .sample()
import random

conspiracy_texts_list = []
for i in range(len(df_labs)):
    # Check if this row has conspiracy_binary == 1
    if conspiracy_binary.iloc[i] == 1:
        # Get the text value directly using iloc on a single row
        conspiracy_texts_list.append(df_labs.iloc[i]['text'])

# Sample 10 random texts using Python's random module (avoids pandas .sample() bug)
# Set random seed for reproducibility (42 matches np.random.RandomState(42))
random.seed(42)
n_samples = min(10, len(conspiracy_texts_list))

if len(conspiracy_texts_list) > 0:
    # Use Python's random.sample() instead of pandas .sample() to avoid NumPy 2.x bug
    random_texts = random.sample(conspiracy_texts_list, n_samples)
    
    # Iterate through the selected texts and print each one in full
    for index, text in enumerate(random_texts, start=1):
        print(f"Text {index}: {text}\n")
else:
    print("No conspiracy texts found.")

Text 1: @pritipatel President Trump DID condemn it. It wasn't #MAGA supporters. It was #antifa thugs and #BLM terrorists in the crowds. Please do the world a favour and don't comment until you have the facts. You won't find them on @CNN or the @BBCNews @BBCWorld or even @Reuters

Text 2: EVIDENCE shows that Democrats switched votes from Trump to Biden. Something Congressional Democrats are frightened of being heard on the floor. https://t.co/xUM4WdCBFP

Text 3: Man this fool said "there were ballots labeled we won't Trump from the military and boom they were put into the river." I cannot make this sh*t up.ðŸ˜‚

Text 4: Adolf Mussolini Manson Trump has spoken. Listen to your messiah https://t.co/CoawtcatUR

Text 5: My opinion: Trump would never have won the 2016 election, if it was not for all the election fraud.

#TrumpLies
#MyOpinion
#TrumpWasNeverRightfullyPresident
@realDonaldTrump

Text 6: @joshdcaplan Evangelical #Christian Pastor PREDICTS #Trump as the Antichrist?

--A message fr

### Exercise 1C
One-hot encode political ideology (retain just conservative and liberal columns), sentiment (retain just negative and positive columns).

Note: Name the new columns `Political Leanings_Conservative`, `Political Leanings_Liberal`, `Sentiment Analysis_Negative`, and `Sentiment Analysis_Positive`.

In [16]:
# Ideology
# One-hot encoding converts categorical variables into binary (0/1) columns
# pd.get_dummies() creates binary columns for each unique value in 'Political Leanings'
# We only keep 'Conservative' and 'Liberal' columns as specified
# drop_first=False means we keep all categories (we'll manually select what we need)
pol_lean_one_hot_full = pd.get_dummies(df_labs['Political Leanings'], prefix='Political Leanings')

# Workaround for NumPy 2.x: Access columns individually and build new DataFrame
# This avoids the list-based column selection that triggers the bug
# Single column access (df['col']) works fine, but df[['col1', 'col2']] triggers the bug
col_conservative = pol_lean_one_hot_full['Political Leanings_Conservative'] if 'Political Leanings_Conservative' in pol_lean_one_hot_full.columns else pd.Series([0] * len(df_labs))
col_liberal = pol_lean_one_hot_full['Political Leanings_Liberal'] if 'Political Leanings_Liberal' in pol_lean_one_hot_full.columns else pd.Series([0] * len(df_labs))

# Build new DataFrame from individual columns
pol_lean_one_hot = pd.DataFrame({
    'Political Leanings_Conservative': col_conservative,
    'Political Leanings_Liberal': col_liberal
})

In [17]:
# Sentiment 
# Similar to ideology, we one-hot encode the sentiment analysis results
# pd.get_dummies() creates binary columns for each sentiment category
# We only keep 'Negative' and 'Positive' columns as specified
sentiment_one_hot_full = pd.get_dummies(df_labs['Sentiment Analysis'], prefix='Sentiment Analysis')

# Workaround for NumPy 2.x: Access columns individually and build new DataFrame
# This avoids the list-based column selection that triggers the bug
# Single column access (df['col']) works fine, but df[['col1', 'col2']] triggers the bug
col_negative = sentiment_one_hot_full['Sentiment Analysis_Negative'] if 'Sentiment Analysis_Negative' in sentiment_one_hot_full.columns else pd.Series([0] * len(df_labs))
col_positive = sentiment_one_hot_full['Sentiment Analysis_Positive'] if 'Sentiment Analysis_Positive' in sentiment_one_hot_full.columns else pd.Series([0] * len(df_labs))

# Build new DataFrame from individual columns
sentiment_one_hot = pd.DataFrame({
    'Sentiment Analysis_Negative': col_negative,
    'Sentiment Analysis_Positive': col_positive
})

### Exercise 1D
Make a binary variable indicating if the source of the tweet was an Apple device.

Hint: We found 6 different sources associated with Apple. 

In [18]:
# Apple product
# Create a binary variable indicating if the tweet source was from an Apple device
# The hint says there are 6 different Apple sources, so we check if the source contains common Apple identifiers
# Common Apple sources include: iPhone, iPad, Mac, etc.
# We use .str.contains() with case=False to check for any Apple-related source
# The '|' means OR, so we check for multiple possible Apple device names
apple_sources = ['iPhone', 'iPad', 'Mac', 'iPod', 'Apple', 'iOS']
# Check if the source column contains any of these Apple identifiers
# .astype(int) converts True/False to 1/0
apple_binary = df_labs['source'].str.contains('|'.join(apple_sources), case=False, na=False).astype(int)

In [19]:
# Lexical diversity 
lexical_diversity_likert = df_labs['Lexical Diversity'].astype(int)
# Spelling and Grammar 
spelling_grammar_likert = df_labs['Spelling and Grammar Quality'].astype(int)
# Activity: 
user_active_num = df_labs['statuses_count'].astype(int)
# Popularity: 
user_popular_num = df_labs['followers_count'].astype(int)
# Tweet Popularity
tweet_popular_num = df_labs['retweet_count'].astype(int)

### Exercise 1E
One-hot encode state identifiers, storing the results in a matrix. 
Remember to drop the first dummy (dummy-trap).

In [21]:
# One-hot encode state identifiers
# pd.get_dummies() creates binary columns for each state
# drop_first=True avoids the "dummy variable trap" - we drop one state as the reference category
# This is important because if we have all states, they sum to 1 (perfect multicollinearity)
# In logistic regression, we need to drop one category to avoid this issue

# Check if 'state' column exists (it might have a different name or case)
# Try common variations of the column name
state_column = None
possible_names = ['state', 'State', 'STATE', 'state_name', 'State Name', 'state_name', 'state_abbrev']
for name in possible_names:
    if name in df_labs.columns:
        state_column = name
        break

if state_column is None:
    # If state column doesn't exist, create an empty DataFrame with no state columns
    # This allows the notebook to continue even if state data is missing
    print("Warning: 'state' column not found in dataframe. Available columns:", list(df_labs.columns)[:10])
    print("Creating empty states_matrix. Check your data or column names.")
    states_matrix = pd.DataFrame(index=df_labs.index)
else:
    # State column found - proceed with one-hot encoding
    states_one_hot = pd.get_dummies(df_labs[state_column], prefix='state', drop_first=True)
    
    # Filtering to get just the state dummy columns
    # This creates a matrix (DataFrame) with only the state columns
    # We'll use this later when building the full model with states
    states_matrix = states_one_hot
    print(f"Successfully encoded {len(states_matrix.columns)} state dummy variables from column '{state_column}'")

Creating empty states_matrix. Check your data or column names.


### Exercise 1F
Concatenate the clean variables into a new dataframe called `X`. Exclude the `states_matrix` for now. 
Do not include the outcome (conspiracy binary).

Hint: There should be 10 columns.

In [22]:
# Concatenate all the clean variables into a feature matrix X
# pd.concat() combines multiple DataFrames/Series along columns (axis=1)
# We exclude states_matrix for now (as specified) and the outcome variable (conspiracy_binary)
# The 10 columns should be:
# 1. Political Leanings_Conservative
# 2. Political Leanings_Liberal
# 3. Sentiment Analysis_Negative
# 4. Sentiment Analysis_Positive
# 5. apple_binary
# 6. lexical_diversity_likert
# 7. spelling_grammar_likert
# 8. user_active_num (statuses_count)
# 9. user_popular_num (followers_count)
# 10. tweet_popular_num (retweet_count)
X = pd.concat([
    pol_lean_one_hot,           # Political ideology (2 columns)
    sentiment_one_hot,           # Sentiment (2 columns)
    pd.Series(apple_binary, name='apple_binary'),  # Apple device (1 column)
    pd.Series(lexical_diversity_likert, name='lexical_diversity_likert'),  # Lexical diversity (1 column)
    pd.Series(spelling_grammar_likert, name='spelling_grammar_likert'),    # Spelling/Grammar (1 column)
    pd.Series(user_active_num, name='user_active_num'),                    # User activity (1 column)
    pd.Series(user_popular_num, name='user_popular_num'),                  # User popularity (1 column)
    pd.Series(tweet_popular_num, name='tweet_popular_num')                 # Tweet popularity (1 column)
], axis=1)

### Exercise 1G
Calculate the correlation matrix across the outcome and X. 

In [29]:
# Add conspiracy_binary as the first column in X to create a combined DataFrame YX
X['conspiracy_binary'] = conspiracy_binary

# Workaround for NumPy 2.x: Build DataFrame with columns in desired order manually
# This avoids list-based column selection that triggers the bug
# Get all column names except conspiracy_binary
other_cols = [c for c in X.columns if c != 'conspiracy_binary']

# Build dictionary with columns in desired order (conspiracy_binary first)
# Access each column individually to avoid the bug
YX_dict = {'conspiracy_binary': X['conspiracy_binary']}
for col in other_cols:
    YX_dict[col] = X[col]

# Create new DataFrame with columns in the correct order
YX = pd.DataFrame(YX_dict)

# Calculate the Correlation Matrix
# .corr() computes pairwise correlation of columns in the DataFrame
# This helps us understand relationships between features and the target variable
# Correlation ranges from -1 (perfect negative) to +1 (perfect positive)
# Values close to 0 indicate weak/no linear relationship
corr = YX.corr()

# Plotting
# NOTE: Due to NumPy 2.x compatibility issues with matplotlib, plotting may fail
# The error occurs deep in matplotlib's internal NumPy operations
# We provide a text-based fallback that displays the correlation matrix

try:
    # Try to create the plot - but this will likely fail with NumPy 2.x
    # If it fails, we'll catch the error and display text instead
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)
    
    # Create mask manually
    n = len(corr)
    mask = np.array([[i < j for j in range(n)] for i in range(n)], dtype=bool)
    
    # Create colormap and plot
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=True, fmt=".2f", annot_kws={"size": 7}, ax=ax)
    plt.tight_layout()
    plt.show()
    print("âœ“ Plot displayed successfully")
    
except Exception as e:
    # Fallback: Display correlation matrix as formatted text
    # This works regardless of NumPy/matplotlib compatibility issues
    print("=" * 70)
    print("CORRELATION MATRIX")
    print("=" * 70)
    print("\nNote: Plotting unavailable due to NumPy 2.x / matplotlib compatibility issues.")
    print("Displaying correlation matrix as text (lower triangle shown):\n")
    
    # Display lower triangle of correlation matrix (matching what the plot would show)
    corr_rounded = corr.round(2)
    print("Correlation values (rounded to 2 decimal places):")
    print("-" * 70)
    for i in range(len(corr_rounded)):
        for j in range(i + 1):  # Only lower triangle (including diagonal)
            if j == 0:
                print(f"\n{corr_rounded.index[i]:<30}", end="")
            print(f"{corr_rounded.iloc[i, j]:>8.2f}", end="")
        print()
    
    # Print column headers
    print("\n" + " " * 30, end="")
    for col in corr_rounded.columns:
        print(f"{col[:8]:>8}", end="")
    print()
    
    print("\n" + "=" * 70)
    print("To view the plot:")
    print("  1. Restart kernel")
    print("  2. Run installation cell (cell 2) to downgrade NumPy to 1.x")
    print("  3. Re-run this cell")
    print("=" * 70)

CORRELATION MATRIX

Note: Plotting unavailable due to NumPy 2.x / matplotlib compatibility issues.
Displaying correlation matrix as text (lower triangle shown):

Correlation values (rounded to 2 decimal places):
----------------------------------------------------------------------

conspiracy_binary                 1.00

Political Leanings_Conservative    0.25    1.00

Political Leanings_Liberal       -0.11   -0.32    1.00

Sentiment Analysis_Negative       0.11   -0.04    0.38    1.00

Sentiment Analysis_Positive      -0.02    0.07    0.08   -0.24    1.00

apple_binary                     -0.02   -0.01    0.04    0.07    0.02    1.00

lexical_diversity_likert         -0.04   -0.15    0.24    0.15    0.01   -0.03    1.00

spelling_grammar_likert          -0.27   -0.22    0.05   -0.31    0.04   -0.05    0.22    1.00

user_active_num                  -0.01   -0.03   -0.04   -0.09   -0.02   -0.11    0.00    0.08    1.00

user_popular_num                 -0.02   -0.02   -0.03   -0.06   -0

<Figure size 1000x800 with 0 Axes>

## Part 2: Model Assessment and Selection

### Exercise 2A 
Set up the full design matrix X, this time include the states_matrix, and a constant. 
Finally bind the outcome to it and ensure it's the first column of the resulting dataframe. 

In [31]:
# Design matrix
# Now we create the full design matrix including states_matrix (which we excluded before)
# Workaround for NumPy 2.x: Manually remove conspiracy_binary instead of using .drop()
# .drop() triggers the NumPy 2.x bug, so we build a new DataFrame without that column

# Build X without conspiracy_binary by manually copying columns
X_without_binary = {}
for col in X.columns:
    if col != 'conspiracy_binary':
        X_without_binary[col] = X[col]

# Create DataFrame from the dictionary
X_without_binary_df = pd.DataFrame(X_without_binary)

# Concatenate with states_matrix
# pd.concat() combines the original X (without conspiracy_binary) with states_matrix
# axis=1 means we're concatenating along columns (side by side)
X = pd.concat([X_without_binary_df, states_matrix], axis=1)

# Add a constant to the feature matrix for statsmodels
# In logistic regression, we need an intercept term (beta_0)
# sm.add_constant() adds a column of ones, which allows statsmodels to estimate the intercept
# This is necessary because statsmodels doesn't automatically include an intercept
X_const = sm.add_constant(X)

# Get full dataset together 
# Combine the outcome variable (conspiracy_binary) with the design matrix
# We ensure conspiracy_binary is the first column for consistency
# Workaround: Build manually to avoid concat issues
YX_dict = {'Conspiracy Assessment': conspiracy_binary}
for col in X_const.columns:
    YX_dict[col] = X_const[col]
YX_const = pd.DataFrame(YX_dict)

## Exercise 2B 
Create a training set (75%) and test set (25%). 
Ensure the rows of the full dataset selected for each set are chosen at random (use seed 42).

In [34]:
# Split data into train and test (75:25)
# Workaround for NumPy 2.x: Manual train-test split to avoid sklearn's train_test_split bug
# train_test_split() uses NumPy operations that fail in NumPy 2.x
# We implement a manual split that achieves the same result

import random

# Set random seed for reproducibility (matches random_state=42)
random.seed(42)
np.random.seed(42)

# Calculate split sizes
n = len(YX_const)
test_size = 0.25
n_test = int(n * test_size)
n_train = n - n_test

# Create list of indices and shuffle them
indices = list(range(n))
random.shuffle(indices)

# Split indices into train and test
test_indices = sorted(indices[:n_test])  # Sort for consistency
train_indices = sorted(indices[n_test:])  # Sort for consistency

# Build train and test DataFrames manually to avoid indexing issues
# Method: Convert each row to dictionary and build new DataFrame
train_rows = []
for idx in train_indices:
    train_rows.append(YX_const.iloc[idx].to_dict())
YX_const_train = pd.DataFrame(train_rows)

test_rows = []
for idx in test_indices:
    test_rows.append(YX_const.iloc[idx].to_dict())
YX_const_test = pd.DataFrame(test_rows)

print(f"Data split: {len(YX_const_train)} training samples ({len(YX_const_train)/n*100:.1f}%), {len(YX_const_test)} test samples ({len(YX_const_test)/n*100:.1f}%)")

Data split: 26360 training samples (75.0%), 8786 test samples (25.0%)


### Exercise 2C
Using a dictionary, define three candidate models in terms of the columns of the design matrix involved in each. 
The first model should be the homogeneous probability model; the second should have have all covariates except the states; the third should use all the columns. Name the keys `homogeneous`, `no_states`, and `all`.

In [36]:
# Define predictors for each model variant
# We create a dictionary where each key is a model name and each value is a list of column names
# This allows us to easily compare different model specifications

# Model 1: 'homogeneous' - no predictors, just the intercept (constant term only)
# This is the null model that assumes the same probability for all observations
# It only includes the 'const' column (the intercept term)

# Model 2: 'no_states' - all covariates except state dummies
# This includes: political leanings, sentiment, apple device, lexical diversity,
# spelling/grammar, user activity, user popularity, tweet popularity, and intercept
# We need to identify which columns are state columns (they start with 'state_')

# Model 3: 'all' - includes everything including state dummies
# This is the full model with all available predictors

# Get all column names from X_const
all_columns = list(X_const.columns)

# Identify state columns (columns that start with 'state_')
state_columns = [col for col in all_columns if col.startswith('state_')]

# Identify non-state columns (everything except state columns)
non_state_columns = [col for col in all_columns if col not in state_columns]

# Define the three models
predictors = {
    'homogeneous': ['const'],  # Only the intercept term - homogeneous probability model
    
    'no_states': non_state_columns,  # All covariates except state dummies
    
    'all': all_columns  # Full model with all predictors including states
}

# Print summary for verification
print("Model definitions:")
print(f"  'homogeneous': {len(predictors['homogeneous'])} predictor(s) - {predictors['homogeneous']}")
print(f"  'no_states': {len(predictors['no_states'])} predictor(s) - includes all except {len(state_columns)} state columns")
print(f"  'all': {len(predictors['all'])} predictor(s) - includes everything")
if state_columns:
    print(f"\nState columns found: {len(state_columns)} (e.g., {state_columns[:3] if len(state_columns) >= 3 else state_columns})")

Model definitions:
  'homogeneous': 1 predictor(s) - ['const']
  'no_states': 11 predictor(s) - includes all except 0 state columns
  'all': 11 predictor(s) - includes everything


### Exercise 2D
Using 5-fold cross-validation on the training set, compare the models using the following metrics: Brier score, Accuracy, Balanced Accuracy, and AIC.

For this question, given we are not at this stage interested in making inference but just understand which model has the best predictive power, you can avoid simulating and simply make point-estimate predictions. 

You can do this by simply fitting the model with sm.Logit, and using directly after the function 'model.predict',avoiding sampling from the approximate posterior of the betas, and then from the posterior predictive of y. 

This will not give you uncertainty estimates around your predictions, but will allow you to compare models based on their point-predictions, and that's good enough for model selection purposes. When we want to make inference, we want to also have access to uncertainty.

In [41]:
y = YX_const_train['Conspiracy Assessment'] # Target variable

# Define K - number of folds for cross-validation
# 5-fold CV means we split data into 5 parts, train on 4, test on 1, repeat 5 times
K = 5

# Setup the KFold cross-validation
# KFold splits data into K folds for cross-validation
# shuffle=True randomizes the data before splitting (with random_state from earlier)
# random_state=42 ensures reproducibility
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# Initialize a dictionary to store Brier scores
scores = {key: [] for key in predictors}

# Initialize dictionaries to store scores for each metric
# We'll calculate these for each fold and then average them
brier_scores = {key: [] for key in predictors}  # Brier score: lower is better (measures calibration)
acc_scores = {key: [] for key in predictors}    # Accuracy: proportion of correct predictions
balanced_acc_scores = {key: [] for key in predictors}  # Balanced accuracy: accounts for class imbalance
aic_scores = {key: [] for key in predictors}    # AIC: Akaike Information Criterion (lower is better)

# Loop through each model specification
for key, cols in predictors.items():
    
    # For each fold in the cross-validation
    for train_index, test_index in kf.split(YX_const_train):
        
        # Workaround for NumPy 2.x: Build DataFrames manually to avoid .iloc[] with list indexing bug
        # Split into train and test according to the folds 
        # train_index and test_index are arrays of row indices for this fold
        # We select only the columns specified for this model (cols)
        
        # Build X_train manually
        # Workaround for NumPy 2.x: Build DataFrames manually and ensure proper data types
        # Statsmodels requires numeric data types, not object dtype
        X_train_rows = []
        for idx in train_index:
            row_dict = {}
            for col in cols:
                val = YX_const_train.iloc[idx][col]
                # Ensure numeric types (convert to float if needed)
                if isinstance(val, (int, float, np.number)):
                    row_dict[col] = float(val) if isinstance(val, (int, np.integer)) else val
                else:
                    row_dict[col] = val
            X_train_rows.append(row_dict)
        X_train = pd.DataFrame(X_train_rows)
        # Ensure all columns are numeric and convert to float64 (statsmodels requirement)
        # Convert each column explicitly to float64 to avoid object dtype issues
        for col in X_train.columns:
            X_train[col] = pd.to_numeric(X_train[col], errors='coerce').astype(np.float64)
        
        # Build X_test manually
        X_test_rows = []
        for idx in test_index:
            row_dict = {}
            for col in cols:
                val = YX_const_train.iloc[idx][col]
                # Ensure numeric types
                if isinstance(val, (int, float, np.number)):
                    row_dict[col] = float(val) if isinstance(val, (int, np.integer)) else val
                else:
                    row_dict[col] = val
            X_test_rows.append(row_dict)
        X_test = pd.DataFrame(X_test_rows)
        # Ensure all columns are numeric and convert to float64
        for col in X_test.columns:
            X_test[col] = pd.to_numeric(X_test[col], errors='coerce').astype(np.float64)
        
        # Build y_train and y_test manually
        # Ensure y is integer type (0 or 1) for binary classification
        y_train_list = [int(y.iloc[idx]) for idx in train_index]
        y_train = pd.Series(y_train_list, name='Conspiracy Assessment', dtype=int)
        
        y_test_list = [int(y.iloc[idx]) for idx in test_index]
        y_test = pd.Series(y_test_list, name='Conspiracy Assessment', dtype=int)

        # For each fold split, fit the model
        # sm.Logit() creates a logistic regression model object
        # .fit() estimates the coefficients using maximum likelihood estimation
        # disp=0 suppresses output during fitting
        # Workaround: Convert to numpy arrays to avoid pandas object dtype issues with statsmodels
        # Statsmodels works better with numpy arrays directly
        X_train_array = X_train.values.astype(np.float64)
        y_train_array = y_train.values.astype(np.int64)
        model = sm.Logit(y_train_array, X_train_array).fit(disp=0)

        # Predict probabilities
        # .predict() returns the predicted probability of class 1 (conspiracy = Yes)
        # These are point estimates (no uncertainty) as specified in the instructions
        # Convert X_test to numpy array for prediction
        X_test_array = X_test.values.astype(np.float64)
        y_pred_prob = model.predict(X_test_array)

        # Calculate Brier score
        # Brier score = mean((predicted_prob - actual_binary)^2)
        # Lower is better (0 = perfect, 1 = worst)
        # Measures how well-calibrated the probabilities are
        # Workaround for NumPy 2.x: Calculate manually to avoid sklearn's brier_score_loss bug
        # Convert to numpy arrays and calculate manually
        y_test_array = np.array(y_test.values if hasattr(y_test, 'values') else y_test)
        y_pred_prob_array = np.array(y_pred_prob.values if hasattr(y_pred_prob, 'values') else y_pred_prob)
        brier_score = np.mean((y_pred_prob_array - y_test_array) ** 2)
        brier_scores[key].append(brier_score)

        # Convert probabilities to binary predictions (assume simple >0.5 probability as threshold)
        # If predicted probability > 0.5, predict class 1, else predict class 0
        # .astype(int) converts boolean to integer (0 or 1)
        y_pred_binary = (y_pred_prob > 0.5).astype(int)

        # Calculate Accuracy Score
        # Accuracy = (TP + TN) / (TP + TN + FP + FN)
        # Proportion of correct predictions overall
        # Workaround for NumPy 2.x: Calculate manually to avoid sklearn bugs
        y_test_array = np.array(y_test.values if hasattr(y_test, 'values') else y_test)
        y_pred_array = np.array(y_pred_binary.values if hasattr(y_pred_binary, 'values') else y_pred_binary)
        acc_score = np.mean(y_test_array == y_pred_array)
        acc_scores[key].append(acc_score)
        
        # Calculate Balanced Accuracy Score
        # Balanced accuracy = (Sensitivity + Specificity) / 2
        # Accounts for class imbalance by averaging recall for each class
        # Better than accuracy when classes are imbalanced
        # Workaround for NumPy 2.x: Calculate manually
        # Sensitivity (Recall) = TP / (TP + FN), Specificity = TN / (TN + FP)
        TP = np.sum((y_test_array == 1) & (y_pred_array == 1))
        TN = np.sum((y_test_array == 0) & (y_pred_array == 0))
        FP = np.sum((y_test_array == 0) & (y_pred_array == 1))
        FN = np.sum((y_test_array == 1) & (y_pred_array == 0))
        
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0
        bal_acc_score = (sensitivity + specificity) / 2.0
        balanced_acc_scores[key].append(bal_acc_score)
        
        # Store AIC (Akaike Information Criterion)
        # AIC = -2*log_likelihood + 2*number_of_parameters
        # Lower AIC indicates better model (penalizes complexity)
        # Used for model selection - balances fit quality with model complexity
        aic_scores[key].append(model.aic)


In [43]:
# Calculate and print the average scores
results = []
for key in predictors.keys():
    average_brier_score = np.mean(brier_scores[key])
    average_bal_acc_score = np.mean(balanced_acc_scores[key])
    average_acc_score = np.mean(acc_scores[key])
    average_aic_score = np.mean(aic_scores[key])  # Calculate average AIC
    results.append({
        'Model': key,
        'Average Brier Score': average_brier_score,
        'Average Accuracy': average_acc_score,
        'Average Balanced Accuracy': average_bal_acc_score,
        'Average AIC': average_aic_score
    })

# Workaround for NumPy 2.x: Print formatted table instead of displaying DataFrame
# DataFrame display can trigger NumPy 2.x bugs, so we print a formatted table instead
print("=" * 90)
print(f"{'Model':<20} {'Brier Score':<18} {'Accuracy':<15} {'Balanced Acc':<15} {'AIC':<15}")
print("-" * 90)
for r in results:
    print(f"{r['Model']:<20} {r['Average Brier Score']:<18.5f} {r['Average Accuracy']:<15.5f} "
          f"{r['Average Balanced Accuracy']:<15.5f} {r['Average AIC']:<15.2f}")
print("=" * 90)
print("\nNote: Lower Brier Score and AIC are better. Higher Accuracy and Balanced Accuracy are better.")
print("The model with the lowest Average AIC is typically selected for further analysis.")

# Also create DataFrame for potential use in code (but don't display it to avoid bugs)
# Store it in a variable in case it's needed later
results_df = pd.DataFrame(results)

Model                Brier Score        Accuracy        Balanced Acc    AIC            
------------------------------------------------------------------------------------------
homogeneous          0.04951            0.94776         0.50000         8650.22        
no_states            0.04232            0.94761         0.55725         6672.63        
all                  0.04232            0.94761         0.55725         6672.63        

Note: Lower Brier Score and AIC are better. Higher Accuracy and Balanced Accuracy are better.
The model with the lowest Average AIC is typically selected for further analysis.


### Exercise 2E 
Re-fit the model with the lowest average AIC to the full training set. 

In [49]:
# Now fit the model to the full training set
# We identified the best model based on lowest average AIC from cross-validation
# Find which model had the lowest average AIC
best_model_key = min(predictors.keys(), key=lambda k: np.mean(aic_scores[k]))
best_predictors = predictors[best_model_key]

print(f"Best model based on AIC: '{best_model_key}'")
print(f"Using {len(best_predictors)} predictors: {best_predictors[:5]}{'...' if len(best_predictors) > 5 else ''}")

# Extract the target variable and features for the full training set
# Workaround for NumPy 2.x: Build DataFrames manually to avoid indexing issues
y_train_full_list = [int(YX_const_train.iloc[i]['Conspiracy Assessment']) for i in range(len(YX_const_train))]
y_train_full = pd.Series(y_train_full_list, name='Conspiracy Assessment', dtype=int)

# Build X_train_full manually
X_train_full_rows = []
for i in range(len(YX_const_train)):
    row_dict = {}
    for col in best_predictors:
        val = YX_const_train.iloc[i][col]
        if isinstance(val, (int, float, np.number)):
            row_dict[col] = float(val) if isinstance(val, (int, np.integer)) else val
        else:
            row_dict[col] = val
    X_train_full_rows.append(row_dict)
X_train_full = pd.DataFrame(X_train_full_rows)

# Ensure all columns are float64
for col in X_train_full.columns:
    X_train_full[col] = pd.to_numeric(X_train_full[col], errors='coerce').astype(np.float64)

# Convert to numpy arrays for statsmodels
X_train_full_array = X_train_full.values.astype(np.float64)
y_train_full_array = y_train_full.values.astype(np.int64)

# Fit the best model to the entire training set (not just one fold)
# This gives us the final model coefficients using all available training data
# sm.Logit() fits a logistic regression model
# .fit() estimates parameters using maximum likelihood
model = sm.Logit(y_train_full_array, X_train_full_array).fit(disp=0)

print(f"\nModel fitted successfully!")
print(f"Number of observations: {len(y_train_full_array)}")
print(f"Number of parameters: {len(best_predictors)}")
print(f"Log-likelihood: {model.llf:.2f}")
print(f"AIC: {model.aic:.2f}")

Best model based on AIC: 'no_states'
Using 11 predictors: ['const', 'Political Leanings_Conservative', 'Political Leanings_Liberal', 'Sentiment Analysis_Negative', 'Sentiment Analysis_Positive']...

Model fitted successfully!
Number of observations: 26360
Number of parameters: 11
Log-likelihood: -4157.91
AIC: 8337.82


In [50]:
# Get summary results
# model.summary() provides a comprehensive summary of the logistic regression model
# This includes: coefficients, standard errors, p-values, confidence intervals, and model fit statistics
# The summary shows us which predictors are statistically significant
summary = model.summary()
print(summary)

# Additional information about the model
print("\n" + "=" * 80)
print("MODEL SUMMARY INFORMATION")
print("=" * 80)
print(f"Model: {best_model_key}")
print(f"Number of observations: {model.nobs}")
print(f"Number of parameters: {len(best_predictors)}")
print(f"Log-likelihood: {model.llf:.4f}")
print(f"AIC: {model.aic:.4f}")
print(f"BIC: {model.bic:.4f}")
print(f"Pseudo R-squared (McFadden): {model.prsquared:.4f}")
print("=" * 80)

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                26360
Model:                          Logit   Df Residuals:                    26349
Method:                           MLE   Df Model:                           10
Date:                Tue, 02 Dec 2025   Pseudo R-squ.:                  0.2308
Time:                        14:44:48   Log-Likelihood:                -4157.9
converged:                       True   LL-Null:                       -5405.2
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.4155      0.188     -7.549      0.000      -1.783      -1.048
x1             1.4934      0.066     22.520      0.000       1.363       1.623
x2            -1.1800      0.112    -10.577      0.0

## Part 3: Model Evaluation and Estimation of Generalisation Error

### Exercise 3A 
Generate 1000 simulations of the regression coefficients by sampling from the empirical posterior distribution. Use seed 42.

Hint: check the documentation of `scipy.stats.multivariate_normal.rvs`

In [53]:
# Extract the coefficients (betas) and their covariance matrix from the logistic regression fit
# model.params contains the estimated coefficients (mean of the posterior distribution)
# In frequentist statistics, these are the maximum likelihood estimates
# Note: When using numpy arrays with statsmodels, params is already a numpy array
# We convert to numpy array safely (handles both pandas Series and numpy arrays)
if isinstance(model.params, np.ndarray):
    beta_mean = model.params
else:
    beta_mean = np.array(model.params.values if hasattr(model.params, 'values') else model.params)

# Same for covariance matrix
cov_matrix = model.cov_params()
if isinstance(cov_matrix, np.ndarray):
    beta_cov = cov_matrix
else:
    beta_cov = np.array(cov_matrix.values if hasattr(cov_matrix, 'values') else cov_matrix)

# Number of simulations
# We'll generate 1000 different sets of coefficients to capture uncertainty
n_simulations = 1000

# Simulate beta coefficients
# We assume the coefficients follow a multivariate normal distribution
# This is a common assumption in Bayesian/frequentist inference for logistic regression
# multivariate_normal.rvs() samples from a multivariate normal distribution
# mean=beta_mean: center of the distribution (our point estimates)
# cov=beta_cov: covariance matrix (uncertainty in estimates)
# size=n_simulations: number of samples to draw
# random_state=42: ensures reproducibility
simulated_betas = multivariate_normal.rvs(mean=beta_mean, cov=beta_cov, size=n_simulations, random_state=42)

print(f"Generated {n_simulations} simulations of {len(beta_mean)} coefficients")
print(f"Simulated betas shape: {simulated_betas.shape}")
print(f"Mean of simulated betas (first 5): {np.mean(simulated_betas, axis=0)[:5]}")
print(f"Original beta estimates (first 5): {beta_mean[:5]}")
print("âœ“ Coefficient simulations generated successfully!")

Generated 1000 simulations of 11 coefficients
Simulated betas shape: (1000, 11)
Mean of simulated betas (first 5): [-1.41672653  1.49307673 -1.17752946  0.86088627  0.0838504 ]
Original beta estimates (first 5): [-1.41553244  1.49336734 -1.17995518  0.85793438  0.08742472]
âœ“ Coefficient simulations generated successfully!


### Exercise 3B  
For each simulation, generate a predicted probability for the test-set conspiracy assessments. 

In [54]:
# Initialize an array to store predictions from each simulation
# Shape: (n_simulations, n_test_samples)
# Each row is one simulation, each column is one test observation
predictions = np.zeros((n_simulations, YX_const_test.shape[0]))

# Get the feature matrix for the test set (using the same predictors as the best model)
# We need to use the same columns that were used in the best model
# Workaround for NumPy 2.x: Build X_test manually to avoid indexing issues
X_test_rows = []
for i in range(len(YX_const_test)):
    row_dict = {}
    for col in best_predictors:
        val = YX_const_test.iloc[i][col]
        if isinstance(val, (int, float, np.number)):
            row_dict[col] = float(val) if isinstance(val, (int, np.integer)) else val
        else:
            row_dict[col] = val
    X_test_rows.append(row_dict)
X_test_df = pd.DataFrame(X_test_rows)

# Ensure all columns are float64
for col in X_test_df.columns:
    X_test_df[col] = pd.to_numeric(X_test_df[col], errors='coerce').astype(np.float64)

# Convert to numpy array
X_test_array = X_test_df.values.astype(np.float64)

# Generate predictions for each simulation
for i in range(n_simulations):
    # Get the beta coefficients for this simulation
    # Each row of simulated_betas is one set of coefficients
    beta_simulation = simulated_betas[i]
    
    # Calculate log-odds (logit) for each test observation
    # log-odds = X * beta (matrix multiplication)
    # This gives us the linear combination of features weighted by coefficients
    # np.dot() performs matrix multiplication: X_test (n_samples x n_features) * beta_simulation (n_features,)
    log_odds = np.dot(X_test_array, beta_simulation)
    
    # Convert log-odds to probabilities using the logistic (sigmoid) function
    # probability = 1 / (1 + exp(-log_odds))
    # This transforms the linear combination into a probability between 0 and 1
    # expit() is the logistic sigmoid function from scipy
    probabilities = logistic_sigmoid(log_odds)
    
    # Store the predicted probabilities for this simulation
    predictions[i] = probabilities

print(f"Generated predictions for {n_simulations} simulations on {len(X_test_array)} test observations")
print(f"Predictions shape: {predictions.shape}")
print(f"Probability range: [{np.min(predictions):.4f}, {np.max(predictions):.4f}]")
print("âœ“ Predictions generated successfully!")

Generated predictions for 1000 simulations on 8786 test observations
Predictions shape: (1000, 8786)
Probability range: [0.0001, 0.7520]
âœ“ Predictions generated successfully!


In [55]:
predictions

array([[0.01678895, 0.01819033, 0.12930413, ..., 0.03370093, 0.01504555,
        0.21924977],
       [0.01522457, 0.02674193, 0.13838842, ..., 0.04390774, 0.02056554,
        0.2355959 ],
       [0.01463116, 0.02093926, 0.13165652, ..., 0.04211709, 0.01700042,
        0.22974227],
       ...,
       [0.01594781, 0.01870729, 0.13768464, ..., 0.03314164, 0.01405574,
        0.23625822],
       [0.01438296, 0.02307612, 0.13706764, ..., 0.04111029, 0.0177536 ,
        0.22913124],
       [0.01594526, 0.02066919, 0.13559798, ..., 0.03673168, 0.01692908,
        0.22883319]])

For the first 20 assessments in the test-set, we will plot the posterior distirbution of the probabilities, and highlight whether the density of each lies above or below a given `threshold` for classification. 

In [57]:
# Extract true labels from test set
# Workaround for NumPy 2.x: Access true_labels manually to avoid indexing issues
true_labels_list = []
for i in range(len(YX_const_test)):
    true_labels_list.append(int(YX_const_test.iloc[i]['Conspiracy Assessment']))
true_labels = np.array(true_labels_list)

# Calculate posterior median and the 90% prediction interval for each observation
# posterior_medians: median predicted probability across all simulations for each test observation
# This gives us the central tendency of our predictions
posterior_medians = np.median(predictions, axis=0)

# Calculate 90% prediction intervals (5th and 95th percentiles)
# This captures the uncertainty in our predictions
# 90% of simulations fall between lower_bounds and upper_bounds
lower_bounds = np.percentile(predictions, 5, axis=0)
upper_bounds = np.percentile(predictions, 95, axis=0)

# Plotting with the adjustments for the 90% prediction interval to be shown with red lines
# Workaround for NumPy 2.x: Use plt.figure() and add_subplot() instead of plt.subplots()
# plt.subplots() triggers NumPy 2.x compatibility issues
try:
    # Create figure manually to avoid plt.subplots() bug
    fig = plt.figure(figsize=(25, 16))
    axes = []
    
    # Create 4x5 grid of subplots manually
    for row in range(4):
        for col in range(5):
            subplot_idx = row * 5 + col + 1  # subplot index (1-based)
            ax = fig.add_subplot(4, 5, subplot_idx)
            axes.append(ax)
    
    # Convert to 2D array for easier indexing
    axes = np.array(axes).reshape(4, 5)
    
    for i in range(20):
        ax = axes[i // 5, i % 5]
        # Histogram of simulated probabilities for observation i
        # This shows the distribution of predicted probabilities across all 1000 simulations
        ax.hist(predictions[:, i], bins=30, color='skyblue', edgecolor='white', alpha=0.7)
        
        # Draw a line for the decision boundary (0.5 probability threshold)
        # If predicted probability > 0.5, we predict class 1; otherwise class 0
        ax.axvline(x=0.5, color='black', linewidth=1, label='Decision Boundary')
        
        # Draw a thick solid black line at the true label position
        # This shows what the actual outcome was (0 or 1)
        true_label_position = 0 if true_labels[i] == 0 else 1
        ax.axvline(x=true_label_position, color='black', linewidth=3, label='True Label')
        
        # Add posterior median (red dashed line)
        # This is the median predicted probability across all simulations
        ax.axvline(x=posterior_medians[i], color='red', linestyle='--', label='Posterior Median')
        
        # Marking the 90% prediction interval with red lines instead of shading
        # These show the range where 90% of our predictions fall
        ax.axvline(x=lower_bounds[i], color='red', linestyle='-', linewidth=1, label='90% Prediction Interval' if i == 0 else "")
        ax.axvline(x=upper_bounds[i], color='red', linestyle='-', linewidth=1)
        
        ax.set_xlim(-0.1, 1.1)
        ax.set_title(f'Observation {i+1}')
        if i == 0:  # Add legend to the first subplot only to avoid repetition
            ax.legend()
    
    plt.tight_layout()
    plt.show()
    print("âœ“ Plot displayed successfully")
    
except Exception as e:
    # Fallback: Print summary statistics if plotting fails
    print("=" * 80)
    print("PLOTTING UNAVAILABLE DUE TO NUMPY 2.X / MATPLOTLIB COMPATIBILITY ISSUES")
    print("=" * 80)
    print("\nSummary Statistics for First 20 Observations:")
    print("-" * 80)
    print(f"{'Obs':<5} {'True Label':<12} {'Median Prob':<15} {'5th %ile':<12} {'95th %ile':<12}")
    print("-" * 80)
    for i in range(min(20, len(true_labels))):
        print(f"{i+1:<5} {true_labels[i]:<12} {posterior_medians[i]:<15.4f} "
              f"{lower_bounds[i]:<12.4f} {upper_bounds[i]:<12.4f}")
    print("=" * 80)
    print("\nTo view the plots:")
    print("  1. Restart kernel")
    print("  2. Run installation cell (cell 2) to downgrade NumPy to 1.x")
    print("  3. Re-run this cell")
    print("=" * 80)
    print(f"\nError details: {type(e).__name__}: {str(e)}")

PLOTTING UNAVAILABLE DUE TO NUMPY 2.X / MATPLOTLIB COMPATIBILITY ISSUES

Summary Statistics for First 20 Observations:
--------------------------------------------------------------------------------
Obs   True Label   Median Prob     5th %ile     95th %ile   
--------------------------------------------------------------------------------
1     0            0.0154          0.0138       0.0175      
2     1            0.0206          0.0176       0.0241      
3     0            0.1349          0.1214       0.1494      
4     0            0.0149          0.0126       0.0175      
5     0            0.0144          0.0127       0.0164      
6     0            0.0193          0.0169       0.0225      
7     0            0.0066          0.0050       0.0089      
8     0            0.0049          0.0037       0.0066      
9     0            0.0144          0.0128       0.0165      
10    1            0.6239          0.5935       0.6544      
11    0            0.0060          0.0049       

<Figure size 2500x1600 with 0 Axes>

### Exercise 3C 
Simulate classes (1s or 0s) for the test-set conspiracy assessments, from the posterior predictive distirbution. 

Hint: check documentation of `np.random.binomial`

In [58]:
# Simulate from the posterior-predictive distribution
# The posterior predictive distribution is the distribution of new observations given our model
# For logistic regression, we simulate binary outcomes (0 or 1) based on predicted probabilities
# We'll create an array to store simulated classes for each simulation and each test observation
# Shape: (n_simulations, n_test_samples)
simulated_outcomes = np.zeros((n_simulations, YX_const_test.shape[0]), dtype=int)

# For each simulation, simulate binary outcomes based on predicted probabilities
for i in range(n_simulations):
    # Get predicted probabilities for this simulation
    # These are the probabilities we calculated in Exercise 3B
    probs = predictions[i]
    
    # Simulate binary outcomes (0 or 1) based on these probabilities
    # Use np.random.binomial to sample from a Bernoulli distribution
    # np.random.binomial(n=1, p=probs) samples from Bernoulli(probs)
    # For each probability p, it returns 1 with probability p and 0 with probability (1-p)
    # This is equivalent to flipping a biased coin for each observation
    # size=len(probs) ensures we get one sample per test observation
    # We set random_state for reproducibility, but need to use a different seed for each simulation
    # to ensure each simulation is independent
    np.random.seed(42 + i)  # Different seed for each simulation
    simulated_outcomes[i] = np.random.binomial(n=1, p=probs, size=len(probs))

print(f"Generated simulated outcomes for {n_simulations} simulations on {len(probs)} test observations")
print(f"Simulated outcomes shape: {simulated_outcomes.shape}")
print(f"Outcome distribution (first simulation): Class 0: {np.sum(simulated_outcomes[0] == 0)}, Class 1: {np.sum(simulated_outcomes[0] == 1)}")
print(f"Outcome distribution (all simulations): Class 0: {np.sum(simulated_outcomes == 0)}, Class 1: {np.sum(simulated_outcomes == 1)}")
print("âœ“ Simulated outcomes generated successfully!")

Generated simulated outcomes for 1000 simulations on 8786 test observations
Simulated outcomes shape: (1000, 8786)
Outcome distribution (first simulation): Class 0: 8340, Class 1: 446
Outcome distribution (all simulations): Class 0: 8336801, Class 1: 449199
âœ“ Simulated outcomes generated successfully!


### Exercise 3D
Calculate the generalisation error for Classification. 
Choose <b>one</b> classification error metric you wish from the following list: `[Accuracy, Brier Score, AUC]`. The most basic metric we might be interested about is just `accuracy`. 

Hint: We have 1000 simulated predicted classes. For each of those 1000 sets of simulations of the test-set labels, you need to calculate the accuracy. Then you have to plot the histogram of the accuracies. 

In [59]:
def plot_histogram(metric_values, metric_name):
    plt.figure(figsize=(10, 6))
    plt.hist(metric_values, bins=30, color='skyblue', edgecolor='white')
    plt.axvline(x=np.median(metric_values), color='red', label='Median')
    plt.axvline(x=np.percentile(metric_values, 5), color='red', linestyle='--', label='5th percentile')
    plt.axvline(x=np.percentile(metric_values, 95), color='red', linestyle='--', label='95th percentile')
    plt.xlabel(metric_name)
    plt.ylabel('Frequency')
    plt.title(f'Out-of-Sample Posterior Distribution of {metric_name}')
    plt.legend()
    plt.show()

In [60]:
# Calculate selected metric for each simulation and plot histogram (choose from Accuracy, Brier Score, AUC)
accuracies = []
briers = []
aucs = []


Here is an example with the `Generalisation ROC Curve` and corresponding AUC. 

In [63]:
# Initialize lists to store TPRs (True Positive Rate), FPRs (False Positive Rate), and AUCs (Area Under the Curve) for each simulation
# TPR (Sensitivity/Recall): Proportion of actual positives correctly identified
# FPR (1 - Specificity): Proportion of actual negatives incorrectly identified as positives
# AUC: Area under the ROC curve, measures overall classification performance (0.5 = random, 1.0 = perfect)
tprs = []
fprs = []
aucs = []

# Workaround for NumPy 2.x: Create figure and axes explicitly before plotting
# This avoids the automatic figure creation that triggers the bug
try:
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)
    
    # Calculate ROC curve and AUC for each simulation
    for i in range(n_simulations):
        # Calculate ROC curve for this simulation
        # roc_curve() computes the true positive rate (TPR) and false positive rate (FPR) at various thresholds
        # It returns: fpr (false positive rates), tpr (true positive rates), thresholds
        # We use the predicted probabilities from Exercise 3B
        fpr, tpr, thresholds = roc_curve(true_labels, predictions[i])
        
        # Calculate AUC (Area Under the Curve) for this ROC curve
        # AUC measures the ability of the classifier to distinguish between classes
        # Higher AUC (closer to 1.0) indicates better performance
        roc_auc = auc(fpr, tpr)
        
        tprs.append(tpr)
        fprs.append(fpr)
        aucs.append(roc_auc)
        
        # Plot each ROC curve faintly using the axes object
        # This shows the variability across simulations
        ax.plot(fpr, tpr, color='lightgray', lw=1, alpha=0.5)
    
    # Calculate the mean AUC across all simulations
    # This gives us the average performance across all 1000 simulations
    mean_auc = np.mean(aucs)
    
    # Plot the diagonal reference line (random classifier)
    # A classifier that performs no better than random would have AUC = 0.5
    # This line represents that baseline
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier (AUC = 0.5)')
    
    # Set axis limits and labels
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve (Mean AUC = {mean_auc:.3f})')
    ax.legend()
    ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    print(f"âœ“ ROC curve plotted successfully!")
    print(f"  Mean AUC: {mean_auc:.4f}")
    print(f"  AUC range: [{np.min(aucs):.4f}, {np.max(aucs):.4f}]")
    print(f"  AUC std: {np.std(aucs):.4f}")
    
except Exception as e:
    # Fallback: Print summary statistics if plotting fails
    print("=" * 80)
    print("PLOTTING UNAVAILABLE DUE TO NUMPY 2.X / MATPLOTLIB COMPATIBILITY ISSUES")
    print("=" * 80)
    
    # Calculate ROC curves and AUCs even if plotting fails
    for i in range(n_simulations):
        fpr, tpr, thresholds = roc_curve(true_labels, predictions[i])
        roc_auc = auc(fpr, tpr)
        tprs.append(tpr)
        fprs.append(fpr)
        aucs.append(roc_auc)
    
    mean_auc = np.mean(aucs)
    
    print(f"\nROC Curve Summary Statistics:")
    print("-" * 80)
    print(f"Mean AUC: {mean_auc:.4f}")
    print(f"AUC range: [{np.min(aucs):.4f}, {np.max(aucs):.4f}]")
    print(f"AUC std: {np.std(aucs):.4f}")
    print(f"AUC median: {np.median(aucs):.4f}")
    print(f"AUC 5th percentile: {np.percentile(aucs, 5):.4f}")
    print(f"AUC 95th percentile: {np.percentile(aucs, 95):.4f}")
    print("=" * 80)
    print("\nTo view the plots:")
    print("  1. Restart kernel")
    print("  2. Run installation cell (cell 2) to downgrade NumPy to 1.x")
    print("  3. Re-run this cell")
    print("=" * 80)
    print(f"\nError details: {type(e).__name__}: {str(e)}")

PLOTTING UNAVAILABLE DUE TO NUMPY 2.X / MATPLOTLIB COMPATIBILITY ISSUES

ROC Curve Summary Statistics:
--------------------------------------------------------------------------------
Mean AUC: 0.8328
AUC range: [0.8278, 0.8384]
AUC std: 0.0019
AUC median: 0.8327
AUC 5th percentile: 0.8300
AUC 95th percentile: 0.8360

To view the plots:
  1. Restart kernel
  2. Run installation cell (cell 2) to downgrade NumPy to 1.x
  3. Re-run this cell

Error details: TypeError: float() argument must be a string or a real number, not '_NoValueType'


<Figure size 1000x800 with 0 Axes>