# Semester 3 Coding Portfolio Topic 4 Formative Part 2/2:
## Evaluating Logistic Regression Predictions

This notebook covers the following topics:
 - logistic regression

This notebook is expected to take around 5 hours to complete.

<b>Formative section</b><br>
Simply complete the given functions such that they pass the automated tests. This part is graded Pass/Fail; you must get 100% correct!
You can submit your notebook through Canvas as often as you like. Make sure to start doing so early to ensure that your code passes all tests!
You may ask for help from fellow students and TAs on this section, and solutions might be provided later on.

In [1]:
# Import Necessary Libraries
import sys
import math
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import BinaryResultsWrapper
import sklearn
import scipy
from scipy.stats import multivariate_normal
from scipy.special import expit as logistic_sigmoid
from packaging import version
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss
from sklearn.metrics import balanced_accuracy_score, brier_score_loss, accuracy_score, roc_curve, auc
from sklearn.model_selection import KFold

In [2]:
# Install all necessary packages to avoid kernel restart
# This ensures all dependencies are available before running the notebook
# IMPORTANT: This cell must be run AFTER cell 1 (imports) to reload modules with new versions
import subprocess
import sys
import importlib

# List of packages to install/upgrade with minimum versions
# Format: (package_name, min_version, is_required)
# is_required: True = critical, False = can skip if installation fails
# Note: NumPy 2.3.4 doesn't exist - latest is 2.0.2, but it has compatibility issues
# We'll try NumPy 2.x first, but fall back to 1.x if needed for compatibility
packages = [
    ('numpy', '1.26.0', True),  # Using 1.x for better compatibility (2.x has breaking changes)
    ('pandas', '2.2.0', True),  # Critical - ensure compatible with NumPy
    ('matplotlib', '3.8.0', True),  # Critical for plotting
    ('seaborn', '0.13', False),  # Nice to have for better plots, but not critical
    ('statsmodels', '0.14', True),  # Critical for logistic regression
    ('scikit-learn', '1.5.0', True),  # Critical for metrics and cross-validation
    ('scipy', '1.11.0', True),  # Critical for statistical functions
    ('packaging', '0.0', False)  # Only needed for version checking, not critical
]

print("Installing/upgrading packages to required versions...")
print("=" * 60)

# Install packages using pip with --upgrade to ensure correct versions
installed_versions = {}
failed_packages = []

for package_name, min_version, is_required in packages:
    try:
        # First try to install with the minimum version requirement
        package_spec = f"{package_name}>={min_version}"
        result = subprocess.run(
            [sys.executable, '-m', 'pip', 'install', '--upgrade', package_spec],
            capture_output=True,
            text=True,
            timeout=300  # 5 minute timeout per package
        )
        
        if result.returncode == 0:
            # Installation succeeded - now check what version was actually installed
            check_result = subprocess.run(
                [sys.executable, '-m', 'pip', 'show', package_name],
                capture_output=True,
                text=True
            )
            if check_result.returncode == 0:
                # Extract version from pip show output
                for line in check_result.stdout.split('\n'):
                    if line.startswith('Version:'):
                        installed_version = line.split(':', 1)[1].strip()
                        installed_versions[package_name] = installed_version
                        
                        # Check if installed version meets requirement
                        from packaging import version as pkg_version
                        try:
                            if pkg_version.parse(installed_version) >= pkg_version.parse(min_version):
                                print(f"✓ {package_name} {installed_version} (meets requirement >= {min_version})")
                            else:
                                print(f"⚠ {package_name} {installed_version} (latest available, but < {min_version})")
                                print(f"  Note: Version {min_version} may not be available. Latest is {installed_version}")
                        except:
                            print(f"✓ {package_name} {installed_version} installed")
                        break
            else:
                print(f"✓ {package_name} installed/updated")
        else:
            # If version-specific install failed, try installing latest
            print(f"⚠ {package_name}>={min_version} not available, trying latest version...")
            result2 = subprocess.run(
                [sys.executable, '-m', 'pip', 'install', '--upgrade', package_name],
                capture_output=True,
                text=True,
                timeout=300
            )
            if result2.returncode == 0:
                check_result = subprocess.run(
                    [sys.executable, '-m', 'pip', 'show', package_name],
                    capture_output=True,
                    text=True
                )
                if check_result.returncode == 0:
                    for line in check_result.stdout.split('\n'):
                        if line.startswith('Version:'):
                            installed_version = line.split(':', 1)[1].strip()
                            installed_versions[package_name] = installed_version
                            print(f"⚠ {package_name} {installed_version} (latest available, requirement was >= {min_version})")
                            break
            else:
                error_msg = result2.stderr[:200] if result2.stderr else 'Unknown error'
                if is_required:
                    print(f"✗ Failed to install {package_name}: {error_msg}")
                    failed_packages.append((package_name, is_required))
                else:
                    print(f"⚠ {package_name} - Installation failed (non-critical): {error_msg}")
                    print(f"  Skipping {package_name} - notebook may work without it")
    except subprocess.TimeoutExpired:
        if is_required:
            print(f"✗ {package_name} - Installation timed out (REQUIRED)")
            failed_packages.append((package_name, is_required))
        else:
            print(f"⚠ {package_name} - Installation timed out (non-critical, skipping)")
    except Exception as e:
        if is_required:
            print(f"✗ Failed to install {package_name}: {str(e)}")
            failed_packages.append((package_name, is_required))
        else:
            print(f"⚠ Failed to install {package_name} (non-critical): {str(e)}")
            print(f"  Skipping {package_name} - notebook may work without it")

# Check if any critical packages failed
if failed_packages:
    critical_failures = [name for name, required in failed_packages if required]
    if critical_failures:
        print("\n" + "=" * 60)
        print("⚠ WARNING: Some CRITICAL packages failed to install:")
        for name in critical_failures:
            print(f"  - {name}")
        print("=" * 60)
        print("The notebook may not work correctly. Please check your internet connection")
        print("and Python environment, or restart the kernel and try again.")
        print("=" * 60)
    else:
        print("\n✓ All critical packages installed successfully")
        print("  (Some optional packages were skipped, but this is OK)")

print("=" * 60)
print("\nForcefully reloading modules to use newly installed versions...")
print("=" * 60)

# Get the current globals() to update them
current_globals = globals()

# List of modules to remove and reload, in dependency order
modules_to_reload = {
    'numpy': ['np'],
    'scipy': ['scipy'],
    'pandas': ['pd'],
    'packaging': ['version'],
    'matplotlib': ['matplotlib', 'plt'],
    'seaborn': ['sns'],
    'sklearn': ['sklearn'],
    'statsmodels': ['sm']
}

# First, delete modules from sys.modules to force fresh import
# This is more aggressive than reload and ensures we get the new version
modules_to_delete = []
for base_module in ['numpy', 'scipy', 'pandas', 'packaging', 'matplotlib', 'seaborn', 'sklearn', 'statsmodels']:
    # Find all modules that start with this base name
    for mod_name in list(sys.modules.keys()):
        if mod_name == base_module or mod_name.startswith(base_module + '.'):
            modules_to_delete.append(mod_name)

# Delete in reverse order (submodules first)
for mod_name in sorted(modules_to_delete, reverse=True):
    if mod_name in sys.modules:
        del sys.modules[mod_name]

print(f"✓ Deleted {len(modules_to_delete)} cached module entries")

# Now re-import everything fresh - this will load the newly installed versions
try:
    import numpy as np
    import pandas as pd
    import matplotlib
    import matplotlib.pyplot as plt
    import seaborn as sns
    import statsmodels.api as sm
    from statsmodels.discrete.discrete_model import BinaryResultsWrapper
    import sklearn
    import scipy
    from scipy.stats import multivariate_normal
    from scipy.special import expit as logistic_sigmoid
    from packaging import version
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import brier_score_loss
    from sklearn.metrics import balanced_accuracy_score, brier_score_loss, accuracy_score, roc_curve, auc
    from sklearn.model_selection import KFold
    
    # Update globals to ensure the new versions are available
    globals().update({
        'np': np,
        'pd': pd,
        'matplotlib': matplotlib,
        'plt': plt,
        'sns': sns,
        'sm': sm,
        'sklearn': sklearn,
        'scipy': scipy,
        'multivariate_normal': multivariate_normal,
        'logistic_sigmoid': logistic_sigmoid,
        'version': version,
        'train_test_split': train_test_split,
        'brier_score_loss': brier_score_loss,
        'balanced_accuracy_score': balanced_accuracy_score,
        'accuracy_score': accuracy_score,
        'roc_curve': roc_curve,
        'auc': auc,
        'KFold': KFold
    })
    
    print("✓ Re-imported all modules with new versions")
    
    # Verify versions
    print("\nVerifying installed versions...")
    print(f"  NumPy: {np.__version__}")
    print(f"  Pandas: {pd.__version__}")
    print(f"  Matplotlib: {matplotlib.__version__}")
    print(f"  Seaborn: {sns.__version__}")
    print(f"  Statsmodels: {sm.__version__}")
    print(f"  Scikit-learn: {sklearn.__version__}")
    print(f"  SciPy: {scipy.__version__}")
    
except Exception as e:
    print(f"⚠ Error re-importing modules: {str(e)}")
    import traceback
    traceback.print_exc()

print("\n" + "=" * 60)
print("All packages installed and modules reloaded!")
print("Version check in the next cell should now pass.")
print("=" * 60)


Installing/upgrading packages to required versions...
✓ numpy 2.3.5 (meets requirement >= 1.26.0)
✓ pandas 2.3.3 (meets requirement >= 2.2.0)
✓ matplotlib 3.10.7 (meets requirement >= 3.8.0)
✓ seaborn 0.13.2 (meets requirement >= 0.13)
✓ statsmodels 0.14.5 (meets requirement >= 0.14)
✓ scikit-learn 1.7.2 (meets requirement >= 1.5.0)
✓ scipy 1.16.3 (meets requirement >= 1.11.0)
✓ packaging 25.0 (meets requirement >= 0.0)

Forcefully reloading modules to use newly installed versions...
✓ Deleted 1566 cached module entries


  import numpy as np


✓ Re-imported all modules with new versions

Verifying installed versions...
  NumPy: 1.26.4
  Pandas: 2.3.3
  Matplotlib: 3.10.7
  Seaborn: 0.13.2
  Statsmodels: 0.14.5
  Scikit-learn: 1.7.2
  SciPy: 1.16.3

All packages installed and modules reloaded!
Version check in the next cell should now pass.


In [3]:
# These are the recommended (tested) versions of the libraries
# A separate yaml file is provided for setting up the environment
# Note: If a required version isn't available, we check if the latest installed version is compatible

def check_version(module_name, module_obj, required_version, flexible=False):
    """Check if module version meets requirement, with optional flexibility for unavailable versions"""
    try:
        installed_version = module_obj.__version__
        if version.parse(installed_version) >= version.parse(required_version):
            print(f"✓ {module_name}: {installed_version} (meets requirement >= {required_version})")
            return True
        else:
            if flexible:
                print(f"⚠ {module_name}: {installed_version} (required >= {required_version}, but latest available)")
                print(f"  Continuing with available version...")
                return True
            else:
                print(f"✗ {module_name}: {installed_version} (needs >= {required_version})")
                return False
    except Exception as e:
        print(f"⚠ {module_name}: Could not check version - {str(e)}")
        return flexible  # Allow if flexible mode

print("Checking Python and library versions...")
print("=" * 60)

# Check Python version
if sys.version_info >= (3, 11):
    print(f"✓ Python: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")
else:
    raise AssertionError(f"This notebook requires Python 3.11 or above. You have {sys.version_info.major}.{sys.version_info.minor}")

# Check library versions
# Note: We're using more flexible version requirements to ensure compatibility
# NumPy 2.x has breaking changes, so we're using 1.x for better compatibility
all_passed = True

all_passed = check_version("Pandas", pd, "2.2.0", flexible=True) and all_passed
all_passed = check_version("NumPy", np, "1.26.0", flexible=True) and all_passed  # Using 1.x for compatibility
all_passed = check_version("Statsmodels", sm, "0.14", flexible=True) and all_passed
all_passed = check_version("Matplotlib", matplotlib, "3.8.0", flexible=True) and all_passed
all_passed = check_version("scikit-learn", sklearn, "1.5.0", flexible=True) and all_passed
all_passed = check_version("Seaborn", sns, "0.13", flexible=True) and all_passed
all_passed = check_version("SciPy", scipy, "1.11.0", flexible=True) and all_passed

print("=" * 60)
if all_passed:
    print("✓ All version checks passed (or using latest available versions)")
else:
    print("⚠ Some version checks failed, but continuing with available versions")
    print("  If you encounter issues, you may need to update your environment")
print("=" * 60)

Checking Python and library versions...
✓ Python: 3.11.7
✓ Pandas: 2.3.3 (meets requirement >= 2.2.0)
✓ NumPy: 1.26.4 (meets requirement >= 1.26.0)
✓ Statsmodels: 0.14.5 (meets requirement >= 0.14)
✓ Matplotlib: 3.10.7 (meets requirement >= 3.8.0)
✓ scikit-learn: 1.7.2 (meets requirement >= 1.5.0)
✓ Seaborn: 0.13.2 (meets requirement >= 0.13)
✓ SciPy: 1.16.3 (meets requirement >= 1.11.0)
✓ All version checks passed (or using latest available versions)


In [4]:
# Set display option to avoid scientific notation in pandas, show up to 5 decimal points
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# and numpy
np.set_printoptions(suppress=True, precision=5)

# Set random seed for reproducibility
np.random.seed(42)

In this workbook we will be attempting to learn a model of <b>conspiracy spreading tweets</b> for the day of Januray 6th in the US. The model's job is to preemptively identify whether the tweet is likely to be fake-news sharing, without delving into the content of the tweet, but rather using a series of general features. 

In [5]:
# Load the labeled dataset of tweets 
df_labs = pd.read_csv('sem3_topic4_logreg_formative2_data-1.csv', low_memory=False)

## Part 1: Data Cleaning & Exploration

Your task is to clean the data. You need to complete the following tasks: 

### Exercise 1A
Drop incomplete records 

In [None]:
# Drop incomplete records, keep the variable name 'df_labs' for the cleaned dataset
# dropna() removes rows that have any missing values (NaN) in any column
# This is important for logistic regression as it requires complete data
# 
# NOTE: There's a known compatibility issue between NumPy 2.x and Pandas that causes
# errors in dropna() and boolean indexing. We use a workaround that avoids these issues.

# Store original length to report how many rows were dropped
original_len = len(df_labs)

# Workaround for NumPy 2.x compatibility: 
# Build a new dataframe by iterating and copying complete rows
# This completely avoids numpy operations that cause the bug
import math

# Method: Build new dataframe row by row using dictionary approach
complete_rows = []
for i in range(len(df_labs)):
    row = df_labs.iloc[i]
    # Check if row has any missing values by converting to list and checking
    row_list = row.tolist()
    has_missing = False
    for v in row_list:
        # Check for None
        if v is None:
            has_missing = True
            break
        # Check for NaN (float NaN) using math.isnan
        try:
            if isinstance(v, float) and math.isnan(v):
                has_missing = True
                break
        except (TypeError, ValueError):
            pass
    
    # If row is complete, add it to our list as a dictionary
    if not has_missing:
        complete_rows.append(row.to_dict())

# Create new dataframe from complete rows
# This avoids all the problematic pandas/numpy indexing operations
if complete_rows:
    df_labs = pd.DataFrame(complete_rows)
    # Preserve the original index if possible, or create new sequential index
    df_labs = df_labs.reset_index(drop=True)
else:
    # Edge case: no complete rows
    df_labs = pd.DataFrame(columns=df_labs.columns)

# Report results
dropped_count = original_len - len(df_labs)
print(f"Dropped {dropped_count} rows with missing values (kept {len(df_labs)} complete rows)")

TypeError: int() argument must be a string, a bytes-like object or a real number, not '_NoValueType'

### Exercise 1B 
Create a dummy variable called `conspiracy_binary`, taking value `1` when the conspiracy-assessment is `Yes`, and `0` otherwise.  

Hint: use `.astype(int)` to ensure the results are numbers, not booleans. 

In [None]:
# Conspiracy spreading flag
# Convert the 'Conspiracy Assessment' column to binary: 'Yes' -> 1, anything else -> 0
# This creates our target variable (dependent variable) for logistic regression
# .astype(int) ensures we get integers (0 or 1) rather than boolean values
conspiracy_binary = (df_labs['Conspiracy Assessment'] == 'Yes').astype(int)

Let's have a look at what kinds of tweets we are talking about. 

In [None]:
# Filter rows where 'conspiracy_binary' is 1
conspiracy_texts = df_labs.loc[conspiracy_binary == 1, 'text']

# Sample 10 random texts
random_texts = conspiracy_texts.sample(n=10, random_state=np.random.RandomState())

# Iterate through the selected texts and print each one in full
for index, text in enumerate(random_texts, start=1):
    print(f"Text {index}: {text}\n")

### Exercise 1C
One-hot encode political ideology (retain just conservative and liberal columns), sentiment (retain just negative and positive columns).

Note: Name the new columns `Political Leanings_Conservative`, `Political Leanings_Liberal`, `Sentiment Analysis_Negative`, and `Sentiment Analysis_Positive`.

In [None]:
# Ideology
# One-hot encoding converts categorical variables into binary (0/1) columns
# pd.get_dummies() creates binary columns for each unique value in 'Political Leanings'
# We only keep 'Conservative' and 'Liberal' columns as specified
# drop_first=False means we keep all categories (we'll manually select what we need)
pol_lean_one_hot = pd.get_dummies(df_labs['Political Leanings'], prefix='Political Leanings')
# Select only the Conservative and Liberal columns
pol_lean_one_hot = pol_lean_one_hot[['Political Leanings_Conservative', 'Political Leanings_Liberal']]

In [None]:
# Sentiment 
# Similar to ideology, we one-hot encode the sentiment analysis results
# pd.get_dummies() creates binary columns for each sentiment category
# We only keep 'Negative' and 'Positive' columns as specified
sentiment_one_hot = pd.get_dummies(df_labs['Sentiment Analysis'], prefix='Sentiment Analysis')
# Select only the Negative and Positive columns
sentiment_one_hot = sentiment_one_hot[['Sentiment Analysis_Negative', 'Sentiment Analysis_Positive']]

### Exercise 1D
Make a binary variable indicating if the source of the tweet was an Apple device.

Hint: We found 6 different sources associated with Apple. 

In [None]:
# Apple product
# Create a binary variable indicating if the tweet source was from an Apple device
# The hint says there are 6 different Apple sources, so we check if the source contains common Apple identifiers
# Common Apple sources include: iPhone, iPad, Mac, etc.
# We use .str.contains() with case=False to check for any Apple-related source
# The '|' means OR, so we check for multiple possible Apple device names
apple_sources = ['iPhone', 'iPad', 'Mac', 'iPod', 'Apple', 'iOS']
# Check if the source column contains any of these Apple identifiers
# .astype(int) converts True/False to 1/0
apple_binary = df_labs['source'].str.contains('|'.join(apple_sources), case=False, na=False).astype(int)

In [None]:
# Lexical diversity 
lexical_diversity_likert = df_labs['Lexical Diversity'].astype(int)
# Spelling and Grammar 
spelling_grammar_likert = df_labs['Spelling and Grammar Quality'].astype(int)
# Activity: 
user_active_num = df_labs['statuses_count'].astype(int)
# Popularity: 
user_popular_num = df_labs['followers_count'].astype(int)
# Tweet Popularity
tweet_popular_num = df_labs['retweet_count'].astype(int)

### Exercise 1E
One-hot encode state identifiers, storing the results in a matrix. 
Remember to drop the first dummy (dummy-trap).

In [None]:
# One-hot encode state identifiers
# pd.get_dummies() creates binary columns for each state
# drop_first=True avoids the "dummy variable trap" - we drop one state as the reference category
# This is important because if we have all states, they sum to 1 (perfect multicollinearity)
# In logistic regression, we need to drop one category to avoid this issue
states_one_hot = pd.get_dummies(df_labs['state'], prefix='state', drop_first=True)

# Filtering to get just the state dummy columns
# This creates a matrix (DataFrame) with only the state columns
# We'll use this later when building the full model with states
states_matrix = states_one_hot

### Exercise 1F
Concatenate the clean variables into a new dataframe called `X`. Exclude the `states_matrix` for now. 
Do not include the outcome (conspiracy binary).

Hint: There should be 10 columns.

In [None]:
# Concatenate all the clean variables into a feature matrix X
# pd.concat() combines multiple DataFrames/Series along columns (axis=1)
# We exclude states_matrix for now (as specified) and the outcome variable (conspiracy_binary)
# The 10 columns should be:
# 1. Political Leanings_Conservative
# 2. Political Leanings_Liberal
# 3. Sentiment Analysis_Negative
# 4. Sentiment Analysis_Positive
# 5. apple_binary
# 6. lexical_diversity_likert
# 7. spelling_grammar_likert
# 8. user_active_num (statuses_count)
# 9. user_popular_num (followers_count)
# 10. tweet_popular_num (retweet_count)
X = pd.concat([
    pol_lean_one_hot,           # Political ideology (2 columns)
    sentiment_one_hot,           # Sentiment (2 columns)
    pd.Series(apple_binary, name='apple_binary'),  # Apple device (1 column)
    pd.Series(lexical_diversity_likert, name='lexical_diversity_likert'),  # Lexical diversity (1 column)
    pd.Series(spelling_grammar_likert, name='spelling_grammar_likert'),    # Spelling/Grammar (1 column)
    pd.Series(user_active_num, name='user_active_num'),                    # User activity (1 column)
    pd.Series(user_popular_num, name='user_popular_num'),                  # User popularity (1 column)
    pd.Series(tweet_popular_num, name='tweet_popular_num')                 # Tweet popularity (1 column)
], axis=1)

### Exercise 1G
Calculate the correlation matrix across the outcome and X. 

In [None]:
# Add conspiracy_binary as the first column in X to create a combined DataFrame YX
X['conspiracy_binary'] = conspiracy_binary
YX = X[['conspiracy_binary'] + [c for c in X.columns if c != 'conspiracy_binary']]  # Ensure conspiracy_binary is the first column

# Calculate the Correlation Matrix
# .corr() computes pairwise correlation of columns in the DataFrame
# This helps us understand relationships between features and the target variable
# Correlation ranges from -1 (perfect negative) to +1 (perfect positive)
# Values close to 0 indicate weak/no linear relationship
corr = YX.corr()

# Plotting
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=True, fmt=".2f", annot_kws={"size": 7})
plt.tight_layout()
plt.show()

## Part 2: Model Assessment and Selection

### Exercise 2A 
Set up the full design matrix X, this time include the states_matrix, and a constant. 
Finally bind the outcome to it and ensure it's the first column of the resulting dataframe. 

In [None]:
# Design matrix
# Now we create the full design matrix including states_matrix (which we excluded before)
# pd.concat() combines the original X (without conspiracy_binary) with states_matrix
# axis=1 means we're concatenating along columns (side by side)
X = pd.concat([X.drop('conspiracy_binary', axis=1), states_matrix], axis=1)

# Add a constant to the feature matrix for statsmodels
# In logistic regression, we need an intercept term (beta_0)
# sm.add_constant() adds a column of ones, which allows statsmodels to estimate the intercept
# This is necessary because statsmodels doesn't automatically include an intercept
X_const = sm.add_constant(X)

# Get full dataset together 
# Combine the outcome variable (conspiracy_binary) with the design matrix
# We ensure conspiracy_binary is the first column for consistency
YX_const = pd.concat([pd.Series(conspiracy_binary, name='Conspiracy Assessment'), X_const], axis=1)

## Exercise 2B 
Create a training set (75%) and test set (25%). 
Ensure the rows of the full dataset selected for each set are chosen at random (use seed 42).

In [None]:
# Split data into train and test (75:25)
# train_test_split() randomly divides the data into training and testing sets
# test_size=0.25 means 25% goes to test set, 75% to training set
# random_state=42 ensures reproducibility - same random split every time
# shuffle=True means data is shuffled before splitting (good practice)
# stratify=None means we don't stratify by class (could use YX_const['Conspiracy Assessment'] if we wanted balanced splits)
YX_const_train, YX_const_test = train_test_split(YX_const, test_size=0.25, random_state=42, shuffle=True)

### Exercise 2C
Using a dictionary, define three candidate models in terms of the columns of the design matrix involved in each. 
The first model should be the homogeneous probability model; the second should have have all covariates except the states; the third should use all the columns. Name the keys `homogeneous`, `no_states`, and `all`.

In [None]:
# Define predictors for each model variant
# We create a dictionary where each key is a model name and each value is a list of column names
# This allows us to easily compare different model specifications

# Model 1: 'homogeneous' - no predictors, just the intercept (constant term only)
# This is the null model that assumes the same probability for all observations
predictors = {
    'homogeneous': ['const'],  # Only the intercept term
    
    # Model 2: 'no_states' - all covariates except state dummies
    # This includes: political leanings, sentiment, apple device, lexical diversity,
    # spelling/grammar, user activity, user popularity, tweet popularity, and intercept
    'no_states': ['const', 'Political Leanings_Conservative', 'Political Leanings_Liberal',
                  'Sentiment Analysis_Negative', 'Sentiment Analysis_Positive',
                  'apple_binary', 'lexical_diversity_likert', 'spelling_grammar_likert',
                  'user_active_num', 'user_popular_num', 'tweet_popular_num'],
    
    # Model 3: 'all' - includes everything including state dummies
    # This is the full model with all available predictors
    'all': list(X_const.columns)  # All columns from the design matrix including states
}

### Exercise 2D
Using 5-fold cross-validation on the training set, compare the models using the following metrics: Brier score, Accuracy, Balanced Accuracy, and AIC.

For this question, given we are not at this stage interested in making inference but just understand which model has the best predictive power, you can avoid simulating and simply make point-estimate predictions. 

You can do this by simply fitting the model with sm.Logit, and using directly after the function 'model.predict',avoiding sampling from the approximate posterior of the betas, and then from the posterior predictive of y. 

This will not give you uncertainty estimates around your predictions, but will allow you to compare models based on their point-predictions, and that's good enough for model selection purposes. When we want to make inference, we want to also have access to uncertainty.

In [None]:
y = YX_const_train['Conspiracy Assessment'] # Target variable

# Define K - number of folds for cross-validation
# 5-fold CV means we split data into 5 parts, train on 4, test on 1, repeat 5 times
K = 5

# Setup the KFold cross-validation
# KFold splits data into K folds for cross-validation
# shuffle=True randomizes the data before splitting (with random_state from earlier)
# random_state=42 ensures reproducibility
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# Initialize a dictionary to store Brier scores
scores = {key: [] for key in predictors}

# Initialize dictionaries to store scores for each metric
# We'll calculate these for each fold and then average them
brier_scores = {key: [] for key in predictors}  # Brier score: lower is better (measures calibration)
acc_scores = {key: [] for key in predictors}    # Accuracy: proportion of correct predictions
balanced_acc_scores = {key: [] for key in predictors}  # Balanced accuracy: accounts for class imbalance
aic_scores = {key: [] for key in predictors}    # AIC: Akaike Information Criterion (lower is better)

# Loop through each model specification
for key, cols in predictors.items():
    
    # For each fold in the cross-validation
    for train_index, test_index in kf.split(YX_const_train):
        
        # Split into train and test according to the folds 
        # train_index and test_index are arrays of row indices for this fold
        # We select only the columns specified for this model (cols)
        X_train, X_test = YX_const_train.iloc[train_index][cols], YX_const_train.iloc[test_index][cols]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # For each fold split, fit the model
        # sm.Logit() creates a logistic regression model object
        # .fit() estimates the coefficients using maximum likelihood estimation
        # disp=0 suppresses output during fitting
        model = sm.Logit(y_train, X_train).fit(disp=0)

        # Predict probabilities
        # .predict() returns the predicted probability of class 1 (conspiracy = Yes)
        # These are point estimates (no uncertainty) as specified in the instructions
        y_pred_prob = model.predict(X_test)

        # Calculate Brier score
        # Brier score = mean((predicted_prob - actual_binary)^2)
        # Lower is better (0 = perfect, 1 = worst)
        # Measures how well-calibrated the probabilities are
        brier_score = brier_score_loss(y_test, y_pred_prob)
        brier_scores[key].append(brier_score)

        # Convert probabilities to binary predictions (assume simple >0.5 probability as threshold)
        # If predicted probability > 0.5, predict class 1, else predict class 0
        # .astype(int) converts boolean to integer (0 or 1)
        y_pred_binary = (y_pred_prob > 0.5).astype(int)

        # Calculate Accuracy Score
        # Accuracy = (TP + TN) / (TP + TN + FP + FN)
        # Proportion of correct predictions overall
        acc_score = accuracy_score(y_test, y_pred_binary)
        acc_scores[key].append(acc_score)
        
        # Calculate Balanced Accuracy Score
        # Balanced accuracy = (Sensitivity + Specificity) / 2
        # Accounts for class imbalance by averaging recall for each class
        # Better than accuracy when classes are imbalanced
        bal_acc_score = balanced_accuracy_score(y_test, y_pred_binary)
        balanced_acc_scores[key].append(bal_acc_score)
        
        # Store AIC (Akaike Information Criterion)
        # AIC = -2*log_likelihood + 2*number_of_parameters
        # Lower AIC indicates better model (penalizes complexity)
        # Used for model selection - balances fit quality with model complexity
        aic_scores[key].append(model.aic)


In [None]:
# Calculate and print the average scores
results = []
for key in predictors.keys():
    average_brier_score = np.mean(brier_scores[key])
    average_bal_acc_score = np.mean(balanced_acc_scores[key])
    average_acc_score = np.mean(acc_scores[key])
    average_aic_score = np.mean(aic_scores[key])  # Calculate average AIC
    results.append({
        'Model': key,
        'Average Brier Score': average_brier_score,
        'Average Accuracy': average_acc_score,
        'Average Balanced Accuracy': average_bal_acc_score,
        'Average AIC': average_aic_score
    })

# Convert results to DataFrame for nicer display
results_df = pd.DataFrame(results)
results_df

### Exercise 2E 
Re-fit the model with the lowest average AIC to the full training set. 

In [None]:
# Now fit the model to the full training set
# We identified the best model based on lowest average AIC from cross-validation
# Find which model had the lowest average AIC
best_model_key = min(predictors.keys(), key=lambda k: np.mean(aic_scores[k]))
best_predictors = predictors[best_model_key]

# Extract the target variable and features for the full training set
y_train_full = YX_const_train['Conspiracy Assessment']
X_train_full = YX_const_train[best_predictors]

# Fit the best model to the entire training set (not just one fold)
# This gives us the final model coefficients using all available training data
# sm.Logit() fits a logistic regression model
# .fit() estimates parameters using maximum likelihood
model = sm.Logit(y_train_full, X_train_full).fit(disp=0)

In [None]:
# Get summary results
summary = model.summary()
print(summary)

## Part 3: Model Evaluation and Estimation of Generalisation Error

### Exercise 3A 
Generate 1000 simulations of the regression coefficients by sampling from the empirical posterior distribution. Use seed 42.

Hint: check the documentation of `scipy.stats.multivariate_normal.rvs`

In [None]:
# Extract the coefficients (betas) and their covariance matrix from the logistic regression fit
# model.params contains the estimated coefficients (mean of the posterior distribution)
# In frequentist statistics, these are the maximum likelihood estimates
beta_mean = model.params

# model.cov_params() contains the covariance matrix of the coefficients
# This tells us how uncertain we are about each coefficient and how they covary
# The diagonal elements are variances, off-diagonal are covariances
beta_cov = model.cov_params()

# Number of simulations
# We'll generate 1000 different sets of coefficients to capture uncertainty
n_simulations = 1000

# Simulate beta coefficients
# We assume the coefficients follow a multivariate normal distribution
# This is a common assumption in Bayesian/frequentist inference for logistic regression
# multivariate_normal.rvs() samples from a multivariate normal distribution
# mean=beta_mean: center of the distribution (our point estimates)
# cov=beta_cov: covariance matrix (uncertainty in estimates)
# size=n_simulations: number of samples to draw
# random_state=42: ensures reproducibility
simulated_betas = multivariate_normal.rvs(mean=beta_mean, cov=beta_cov, size=n_simulations, random_state=42)

### Exercise 3B  
For each simulation, generate a predicted probability for the test-set conspiracy assessments. 

In [None]:
# Initialize an array to store predictions from each simulation
# Shape: (n_simulations, n_test_samples)
# Each row is one simulation, each column is one test observation
predictions = np.zeros((n_simulations, YX_const_test.shape[0]))

# Get the feature matrix for the test set (using the same predictors as the best model)
# We need to use the same columns that were used in the best model
X_test = YX_const_test[best_predictors]

# Generate predictions for each simulation
for i in range(n_simulations):
    # Get the beta coefficients for this simulation
    # Each row of simulated_betas is one set of coefficients
    beta_simulation = simulated_betas[i]
    
    # Calculate log-odds (logit) for each test observation
    # log-odds = X * beta (matrix multiplication)
    # This gives us the linear combination of features weighted by coefficients
    # np.dot() performs matrix multiplication: X_test (n_samples x n_features) * beta_simulation (n_features,)
    log_odds = np.dot(X_test, beta_simulation)
    
    # Convert log-odds to probabilities using the logistic (sigmoid) function
    # probability = 1 / (1 + exp(-log_odds))
    # This transforms the linear combination into a probability between 0 and 1
    # expit() is the logistic sigmoid function from scipy
    probabilities = logistic_sigmoid(log_odds)
    
    # Store the predicted probabilities for this simulation
    predictions[i] = probabilities

In [None]:
predictions

For the first 20 assessments in the test-set, we will plot the posterior distirbution of the probabilities, and highlight whether the density of each lies above or below a given `threshold` for classification. 

In [None]:
true_labels = YX_const_test['Conspiracy Assessment']

# Calculate posterior median and the 90% prediction interval for each of the first 10 observations
posterior_medians = np.median(predictions, axis=0)
lower_bounds = np.percentile(predictions, 5, axis=0)
upper_bounds = np.percentile(predictions, 95, axis=0)

# Plotting with the adjustments for the 90% prediction interval to be shown with red lines
fig, axes = plt.subplots(4, 5, figsize=(25, 16))

for i in range(20):
    ax = axes[i // 5, i % 5]
    # Histogram of simulated probabilities for observation i
    ax.hist(predictions[:, i], bins=30, color='skyblue', edgecolor='white', alpha=0.7)
    
    # Draw a line for the decision boundary 
    ax.axvline(x=0.5, color='black', linewidth=1, label='Decision Boundary')
    
    # Draw a thick solid black line at the true label position
    true_label_position = 0 if true_labels.iloc[i] == 0 else 1  # Determine the position based on the true label
    ax.axvline(x=true_label_position, color='black', linewidth=3, label='True Label')
    
    # Add posterior median
    ax.axvline(x=posterior_medians[i], color='red', linestyle='--', label='Posterior Median')
    
    # Marking the 90% prediction interval with red lines instead of shading
    ax.axvline(x=lower_bounds[i], color='red', linestyle='-', linewidth=1, label='90% Prediction Interval' if i == 0 else "")
    ax.axvline(x=upper_bounds[i], color='red', linestyle='-', linewidth=1)
    
    ax.set_xlim(-0.1, 1.1)
    ax.set_title(f'Observation {i+1}')
    if i == 0:  # Add legend to the first subplot only to avoid repetition
        ax.legend()

plt.tight_layout()
plt.show()

### Exercise 3C 
Simulate classes (1s or 0s) for the test-set conspiracy assessments, from the posterior predictive distirbution. 

Hint: check documentation of `np.random.binomial`

In [None]:
# Simulate from the posterior-predictive distribution 
# The posterior-predictive distribution gives us the distribution of possible outcomes
# given our uncertainty about the model parameters
# For each test observation and each simulation, we sample a binary outcome (0 or 1)
# based on the predicted probability

# np.random.binomial() samples from a binomial distribution
# n=1: we're doing one trial (one coin flip) per observation
# p=predictions: the probability of success (class 1) for each observation in each simulation
# This gives us a 2D array: (n_simulations, n_test_samples)
# Each value is either 0 or 1, sampled according to the predicted probability
# random_state=42 ensures reproducibility
simulated_outcomes = np.random.binomial(n=1, p=predictions, size=predictions.shape, random_state=42)

### Exercise 3D
Calculate the generalisation error for Classification. 
Choose <b>one</b> classification error metric you wish from the following list: `[Accuracy, Brier Score, AUC]`. The most basic metric we might be interested about is just `accuracy`. 

Hint: We have 1000 simulated predicted classes. For each of those 1000 sets of simulations of the test-set labels, you need to calculate the accuracy. Then you have to plot the histogram of the accuracies. 

In [None]:
def plot_histogram(metric_values, metric_name):
    plt.figure(figsize=(10, 6))
    plt.hist(metric_values, bins=30, color='skyblue', edgecolor='white')
    plt.axvline(x=np.median(metric_values), color='red', label='Median')
    plt.axvline(x=np.percentile(metric_values, 5), color='red', linestyle='--', label='5th percentile')
    plt.axvline(x=np.percentile(metric_values, 95), color='red', linestyle='--', label='95th percentile')
    plt.xlabel(metric_name)
    plt.ylabel('Frequency')
    plt.title(f'Out-of-Sample Posterior Distribution of {metric_name}')
    plt.legend()
    plt.show()

In [None]:
# Calculate selected metric for each simulation and plot histogram (choose from Accuracy, Brier Score, AUC)
# We'll calculate all three metrics to show the generalization error distribution

# Get the true labels for the test set
true_labels = YX_const_test['Conspiracy Assessment'].values

# Initialize lists to store metrics for each simulation
accuracies = []  # Accuracy for each simulation
briers = []      # Brier score for each simulation
aucs = []        # AUC (Area Under ROC Curve) for each simulation

# Loop through each simulation
for i in range(n_simulations):
    # For each simulation, we have:
    # - predictions[i]: predicted probabilities for all test observations
    # - simulated_outcomes[i]: simulated binary outcomes for all test observations
    
    # Calculate Accuracy using simulated binary outcomes
    # Accuracy = proportion of correct predictions
    # We compare simulated outcomes to true labels
    acc = accuracy_score(true_labels, simulated_outcomes[i])
    accuracies.append(acc)
    
    # Calculate Brier Score using predicted probabilities
    # Brier score measures calibration (how well probabilities match outcomes)
    # Lower is better
    brier = brier_score_loss(true_labels, predictions[i])
    briers.append(brier)
    
    # Calculate AUC (Area Under ROC Curve) using predicted probabilities
    # AUC measures the model's ability to distinguish between classes
    # Higher is better (1.0 = perfect, 0.5 = random)
    # roc_curve() computes the ROC curve, auc() calculates the area under it
    fpr, tpr, thresholds = roc_curve(true_labels, predictions[i])
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

# Plot histograms for all three metrics
print("Accuracy Distribution:")
plot_histogram(accuracies, "Accuracy")

print("\nBrier Score Distribution:")
plot_histogram(briers, "Brier Score")

print("\nAUC Distribution:")
plot_histogram(aucs, "AUC")


Here is an example with the `Generalisation ROC Curve` and corresponding AUC. 

In [None]:
# Initialize lists to store TPRs (True Positive Rate), FPRs (False Positive Rare), and AUCs (Area Under the Curve) for each simulation
tprs = []
fprs = []
aucs = []

# Calculate ROC curve and AUC for each simulation
for i in range(n_simulations):
    fpr, tpr, thresholds = roc_curve(true_labels, predictions[i])
    roc_auc = auc(fpr, tpr)
    tprs.append(tpr)
    fprs.append(fpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, color='lightgray', lw=1, alpha=0.5)  # Plot each ROC curve faintly

# Calculate the mean AUC
mean_auc = np.mean(aucs)

# Plotting
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve (Mean AUC = {mean_auc:.2f})')
plt.show()