In [None]:
# ==============================================================================
# Cell 1: Setup - Import Libraries
# ==============================================================================
# Imports the data-handling and plotting libraries used by the Milestone 1 cells.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress warnings (optional, but makes output cleaner).
# NOTE(review): 'ignore' with no category silences every warning raised after
# this point, including deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Confirmation messages so the user can see that the cell ran to completion.
print("Libraries imported successfully.")
print("pandas, matplotlib, and seaborn are ready.")

In [None]:
# Mounts Google Drive at /content/drive.
# NOTE(review): the cells visible in this notebook load their data through
# files.upload() rather than from the mounted drive; confirm this cell is needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ==============================================================================
# Cell 2: Milestone 1.1 - Data Collection (Load Data) [Robust Version]
# ==============================================================================
# Opens the Colab upload dialog, reads the first uploaded file as CSV and stores
# the result in `df`. Failures print a message instead of raising.
from google.colab import files
import pandas as pd
import io

print("--- IMPORTANT INSTRUCTIONS ---")
print("1. Find the 'archive.zip' file on your computer and unzip it.")
print("2. Click the 'Choose Files' button below.")
print("3. Select the 'preprocessed_content.csv' file (NOT the .zip file).")
print("4. Wait for the upload to complete.")
print("---------------------------------")

try:
    # This will open the file upload dialog
    uploaded = files.upload()

    # With nothing uploaded, next(iter(...)) would raise StopIteration, whose
    # message is blank; report the missing upload explicitly instead.
    if not uploaded:
        raise ValueError("No file was uploaded. Please re-run this cell and choose the .csv file.")

    # Get the filename (it's the first key in the 'uploaded' dictionary)
    file_name = next(iter(uploaded))

    print(f"\nSuccessfully uploaded '{file_name}'")

    # Case-insensitive so that names such as 'ARCHIVE.ZIP' are rejected too.
    if file_name.lower().endswith('.zip'):
        print("\n!!! ERROR: You uploaded a .zip file. Please re-run this cell and upload the 'preprocessed_content.csv' file.")
        raise ValueError("Incorrect file type uploaded.")

    print("\nLoading data using the robust 'python' engine...")
    print("(This may take a moment longer.)")

    # --- ROBUST FIXES ---
    # We are using all fixes to handle this complex file:
    # 1. encoding='latin1': Fixes 'utf-8' codec errors.
    # 2. engine='python':     Fixes 'Buffer overflow' errors.
    # 3. on_bad_lines='skip': Skips any other broken rows.
    df = pd.read_csv(
        io.BytesIO(uploaded[file_name]),
        encoding='latin1',
        on_bad_lines='skip',
        engine='python'
    )

    if df.empty:
        print("\n!!! ERROR: The loaded DataFrame is empty.")
        print("This may be because you uploaded the .zip file. Please re-run and upload the .csv file.")
    else:
        print(f"--- Successfully loaded data into DataFrame ---")
        print("\n--- Data Head (First 5 Rows) ---")
        print(df.head())
        print("\n--- Data Info (Columns & Data Types) ---")
        df.info()

except KeyboardInterrupt:
    # KeyboardInterrupt derives from BaseException, not Exception, so the
    # generic handler below never sees it; it needs its own clause.
    print("\nUpload was canceled. Please re-run the cell and let the upload finish.")
except Exception as e:
    # Kept for errors that only mention the interrupt in their message text.
    if "KeyboardInterrupt" in str(e):
        print("\nUpload was canceled. Please re-run the cell and let the upload finish.")
    else:
        print(f"\nAn error occurred: {e}")

In [None]:
# ==============================================================================
# Cell 3: Milestone 1.2 - Preprocessing Pipeline
# ==============================================================================
# Requires the DataFrame 'df' produced by Cell 2; bail out with a hint otherwise.
if 'df' not in locals() or df.empty:
    print("!!! ERROR: Dataframe 'df' not found or is empty.")
    print("Please run Cell 2 successfully before running this cell.")

else:
    print("--- Starting preprocessing pipeline ---")

    # --- Fix Index Column ---
    # When the first column was loaded as 'Unnamed: 0', promote it to the index.
    if 'Unnamed: 0' in df.columns:
        print("Found 'Unnamed: 0' column, setting it as the index.")
        df = df.set_index('Unnamed: 0')

    # Columns of interest for the project
    key_columns = ['ticker', 'year', 'preprocessed_content', 'ner_entities', 'e_score', 's_score', 'g_score']

    # Report how many values are missing in each of those columns
    print("\nMissing Values (Before Cleaning):")
    missing_per_column = df[key_columns].isna().sum()
    print(missing_per_column)

    # This milestone focuses on the scores, so keep only rows that have all three.
    df_cleaned = df.dropna(subset=['e_score', 's_score', 'g_score'])

    print(f"\nOriginal entity count: {df.shape[0]}")
    print(f"Entity count after dropping rows with missing scores: {df_cleaned.shape[0]}")

    print("\n--- Preprocessing Complete ---")

In [None]:
# ==============================================================================
# Cell 4: Milestone 1.3 - Exploratory Data Analysis (EDA)
# ==============================================================================
# Draws one histogram (with KDE overlay) per ESG score column of 'df_cleaned'.
# Check if the cleaned dataframe 'df_cleaned' exists from Cell 3
if 'df_cleaned' in locals():
    print("--- Starting Exploratory Data Analysis ---")
    print("Generating plots for ESG Score distributions...")

    # Set the plot style
    sns.set_style("whitegrid")

    # Create a figure with 3 subplots (one for each score)
    fig, axes = plt.subplots(3, 1, figsize=(10, 15))
    fig.suptitle('Milestone 1 EDA: Distribution of Financial Entities by ESG Scores', fontsize=16, y=1.02)

    # (column, panel title, x-axis label, bar colour) for each panel, top to bottom
    score_panels = [
        ('e_score', 'Distribution of Environmental (E) Scores', 'E Score', 'green'),
        ('s_score', 'Distribution of Social (S) Scores', 'S Score', 'blue'),
        ('g_score', 'Distribution of Governance (G) Scores', 'G Score', 'red'),
    ]

    # Driving every panel from one loop guarantees each axis gets its own labels;
    # previously the third panel's y-label was applied to axes[0] by mistake.
    for ax, (score_column, panel_title, x_label, bar_color) in zip(axes, score_panels):
        sns.histplot(df_cleaned[score_column], bins=30, kde=True, ax=ax, color=bar_color)
        ax.set_title(panel_title)
        ax.set_xlabel(x_label)
        ax.set_ylabel('Number of Companies')

    plt.tight_layout()

    # Display the plots
    print("\nPlots generated successfully.")
    plt.show()

else:
    print("!!! ERROR: Cleaned dataframe 'df_cleaned' not found.")
    print("Please run Cell 3 successfully before running this cell.")

**Milestone 2**

In [None]:
# ==============================================================================
# Cell 1: Setup - Install and Import Libraries for Milestone 2
# ==============================================================================
# Install spaCy and spaCy Transformers
!pip install -q spacy-transformers
!pip install -q spacy

# Download a pre-trained transformer pipeline
# This is our "selected model" (similar to BERT)
!python -m spacy download en_core_web_trf

import spacy
from spacy.tokens import DocBin, Doc
from spacy.training.example import Example
import re
import ast  # This is for safely evaluating the string-list in 'ner_entities'
import random
from spacy.cli.train import train
from spacy.scorer import Scorer
import json

print("\n--- spaCy and Transformers are ready ---")
print("Loaded 'en_core_web_trf' as our base model.")

In [None]:
# ==============================================================================
# Cell 2: Import All Libraries
# ==============================================================================
# Gathers every import for the Milestone 1 and Milestone 2 cells in one place.
# For M1
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# For M2
# NOTE(review): Doc, Example, train, Scorer and json are not referenced by the
# cells visible in this notebook; confirm before relying on or removing them.
import spacy
from spacy.tokens import DocBin, Doc
from spacy.training.example import Example
import re
import ast  # This is for safely evaluating the string-list in 'ner_entities'
import random
from spacy.cli.train import train
from spacy.scorer import Scorer
import json

# For File Upload
from google.colab import files
import io

# Suppress all warnings (no category given, so every warning is hidden)
warnings.filterwarnings('ignore')

print("All libraries imported successfully.")

In [None]:
# ==============================================================================
# Cell 3: Load Data (Robust Version)
# ==============================================================================
# Opens the Colab upload dialog, reads the first uploaded file as CSV and stores
# the result in `df`. Failures print a message instead of raising.

print("--- IMPORTANT INSTRUCTIONS ---")
print("1. Unzip the 'archive.zip' file on your computer first.")
print("2. Click the 'Choose Files' button below.")
print("3. Select the 'preprocessed_content.csv' file (NOT the .zip file).")
print("4. Wait for the upload to complete.")
print("---------------------------------")

try:
    uploaded = files.upload()

    # With nothing uploaded, next(iter(...)) would raise StopIteration, whose
    # message is blank; report the missing upload explicitly instead.
    if not uploaded:
        raise ValueError("No file was uploaded. Please re-run this cell and choose the .csv file.")

    file_name = next(iter(uploaded))
    print(f"\nSuccessfully uploaded '{file_name}'")

    # The instructions above ask for the .csv, so reject the archive up front
    # rather than letting the CSV parser choke on compressed bytes.
    if file_name.lower().endswith('.zip'):
        raise ValueError("You uploaded a .zip file. Please re-run this cell and upload the 'preprocessed_content.csv' file.")

    print("\nLoading data using the robust 'python' engine...")

    # Use all fixes: 'latin1' encoding, 'python' engine, and skip bad lines
    df = pd.read_csv(
        io.BytesIO(uploaded[file_name]),
        encoding='latin1',
        on_bad_lines='skip',
        engine='python'
    )

    if df.empty:
        print("\n!!! ERROR: The loaded DataFrame is empty.")
    else:
        print(f"--- Successfully loaded data into DataFrame ---")

except KeyboardInterrupt:
    # KeyboardInterrupt derives from BaseException, so 'except Exception'
    # would let an interrupted upload escape as a raw traceback.
    print("\nUpload was canceled. Please re-run the cell and let the upload finish.")
except Exception as e:
    print(f"\nAn error occurred: {e}")

In [None]:
# ==============================================================================
# Cell 4: Preprocessing Pipeline (Milestone 1)
# ==============================================================================
# Requires a non-empty DataFrame 'df' from Cell 3; otherwise print a hint and stop.
if 'df' not in locals() or df.empty:
    print("!!! ERROR: Dataframe 'df' not found or is empty.")
    print("Please run Cell 3 successfully before running this cell.")

else:
    print("--- Starting preprocessing pipeline ---")

    # Promote the 'Unnamed: 0' column to the index when it is present
    if 'Unnamed: 0' in df.columns:
        print("Found 'Unnamed: 0' column, setting it as the index.")
        df = df.set_index('Unnamed: 0')

    print(f"Original entity count: {df.shape[0]}")
    print("\n--- Data is clean and ready. ---")

    # Summarise columns, dtypes and non-null counts
    df.info()

In [None]:
# ==============================================================================
# Cell 5: Milestone 2.1 - Preprocessing Effectiveness
# ==============================================================================
# Loads the 'en_core_web_trf' pipeline into `nlp` and prints token / lemma /
# part-of-speech for the first 250 characters of the first non-null document.
if 'df' in locals():
    print("--- Demonstrating Preprocessing (Tokenization & Lemmatization) ---")

    try:
        nlp = spacy.load("en_core_web_trf")
    except OSError:
        # spacy.load raises OSError when the package is not installed. `nlp` is
        # deliberately left unassigned so later "'nlp' in locals()" guards fail
        # cleanly instead of operating on a placeholder value.
        print("!!! ERROR: 'en_core_web_trf' model not found. Please re-run Cell 1.")
    else:
        # Guard against a column with no usable text; .iloc[0] on an empty
        # Series would raise IndexError.
        available_text = df['preprocessed_content'].dropna()
        if available_text.empty:
            print("!!! ERROR: No non-null 'preprocessed_content' rows found for the demo.")
        else:
            # Get a text sample and ensure it's a string
            sample_text = str(available_text.iloc[0])
            sample_text_short = sample_text[:250]

            print(f"Processing sample text:\n'{sample_text_short}...'")
            doc = nlp(sample_text_short)

            print("\n--- Analysis Results ---")
            print(f"{'Token':<15} | {'Lemma (Base Form)':<18} | {'Part of Speech':<15}")
            print("-" * 52)
            for token in doc:
                print(f"{token.text:<15} | {token.lemma_:<18} | {token.pos_:<15}")

            print("\n--- Preprocessing Demo Complete ---")

else:
    print("!!! ERROR: Dataframe 'df' not found. Please re-run Cell 3.")

In [None]:
# ==============================================================================
# Cell 6: Prepare Data for Fine-Tuning
# ==============================================================================
def parse_entity_list(raw_value):
    """Parse the textual 'ner_entities' cell into a list of entity strings.

    Returns a list of `str` when `raw_value` is a Python-literal list, tuple or
    set, and None when it cannot be parsed or parses to anything else. Rejecting
    a bare string matters: iterating it would yield single characters, each of
    which would then be labelled wherever it occurs in the text.
    """
    try:
        parsed = ast.literal_eval(raw_value)
    except (ValueError, SyntaxError, TypeError):
        return None
    if not isinstance(parsed, (list, tuple, set, frozenset)):
        return None
    return [str(item) for item in parsed]


def find_entity_spans(text, entity_strings, label="FIN_ENTITY"):
    """Locate every case-insensitive occurrence of each entity string in `text`.

    Returns a sorted list of unique (start, end, label) character-offset tuples.
    Blank entity strings are skipped because an empty pattern matches at every
    position and would produce a zero-width span for each character.
    """
    found = set()
    for entity in entity_strings:
        entity = str(entity)
        if not entity.strip():
            continue
        # re.escape makes the entity a literal pattern ('C++', 'A.B', ...).
        for match in re.finditer(re.escape(entity), text, flags=re.IGNORECASE):
            found.add((match.start(), match.end(), label))
    # A set removes duplicates from entities differing only by case; sorting
    # gives the same order on every run.
    return sorted(found)


if 'df' in locals():
    print("--- Preparing Data for Training ---")

    df_train = df.dropna(subset=['preprocessed_content', 'ner_entities'])
    df_train = df_train[df_train['preprocessed_content'].str.len() > 50]
    print(f"Cleaned data count: {len(df_train)}")

    training_data = []

    for content, raw_entities in zip(df_train['preprocessed_content'], df_train['ner_entities']):
        text = str(content)
        entities_list = parse_entity_list(raw_entities)
        if entities_list is None:
            # Unparseable or unexpectedly shaped annotation: skip the row.
            continue

        spans = find_entity_spans(text, entities_list)
        if spans:
            training_data.append((text, {"entities": spans}))

    print(f"\nSuccessfully created {len(training_data)} labeled training examples.")
    if training_data:
        print("\n--- Example of a Training Item ---")
        print(training_data[0])
    else:
        print("\n!!! WARNING: No training data was created. Check 'ner_entities' column.")

else:
    print("!!! ERROR: Dataframe 'df' not found. Please re-run Cell 3.")

In [None]:
# ==============================================================================
# Cell 7: Create DocBin Files (Train/Validation Split)
# ==============================================================================
SPLIT_SEED = 42  # fixed seed so re-running the cell reproduces the same split


def split_train_dev(examples, train_fraction=0.8, seed=SPLIT_SEED):
    """Shuffle a copy of `examples` and split it into (train, dev) lists.

    The input sequence is left untouched and the shuffle uses its own seeded
    random.Random instance, so repeated calls return the same partition.
    """
    shuffled = list(examples)
    random.Random(seed).shuffle(shuffled)
    split_point = int(len(shuffled) * train_fraction)
    return shuffled[:split_point], shuffled[split_point:]


def build_docbin(nlp_model, examples):
    """Convert (text, {"entities": [(start, end, label), ...]}) pairs to a DocBin.

    Returns (doc_bin, skipped), where `skipped` counts the annotated spans for
    which doc.char_span() returned None and which were therefore left out.
    """
    doc_bin = DocBin()
    skipped = 0
    for text, annotations in examples:
        doc = nlp_model.make_doc(text)
        ents = []
        for start, end, label in annotations["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                skipped += 1
            else:
                ents.append(span)
        # filter_spans resolves overlapping spans before assignment to doc.ents
        doc.ents = spacy.util.filter_spans(ents)
        doc_bin.add(doc)
    return doc_bin, skipped


if 'training_data' in locals() and 'nlp' in locals():
    print("--- Creating DocBin files for training and validation ---")

    train_data, test_data = split_train_dev(training_data)

    print(f"Total examples: {len(training_data)}")
    print(f"Training examples: {len(train_data)}")
    print(f"Validation examples: {len(test_data)}")
    if not train_data or not test_data:
        print("!!! WARNING: One of the splits is empty; more labeled examples are needed.")

    # --- Create train.spacy ---
    db_train, skipped_train = build_docbin(nlp, train_data)
    db_train.to_disk("./train.spacy")

    # --- Create dev.spacy (for validation) ---
    db_test, skipped_dev = build_docbin(nlp, test_data)
    db_test.to_disk("./dev.spacy")

    print(f"Entity spans skipped (char_span returned None): {skipped_train + skipped_dev}")
    print("\n--- train.spacy and dev.spacy files created successfully! ---")

else:
    print("!!! ERROR: 'training_data' or 'nlp' model not found.")

In [None]:
# ==============================================================================
# Cell 8: Create a Labeled Base Model
# ==============================================================================
# Relies on 'nlp' (the transformer model) having been loaded in Cell 5.

if 'nlp' not in locals():
    print("!!! ERROR: 'nlp' model not loaded. Please re-run Cell 5.")

else:
    print("--- Creating a new base model with 'FIN_ENTITY' label ---")

    if "ner" in nlp.pipe_names:
        # Fetch the pipeline's 'ner' component and register the custom label on it
        ner = nlp.get_pipe("ner")
        ner.add_label("FIN_ENTITY")

        # Persist the modified pipeline to its own directory
        output_dir = "./base_model_with_labels"
        nlp.to_disk(output_dir)

        print(f"\nSuccessfully saved new base model (with label) to '{output_dir}'")
        print("This model is now ready to be fine-tuned.")
    else:
        print("!!! Error: 'ner' pipe not found in model.")

In [None]:
# ==============================================================================
# Cell 9: Create Training Config File [CORRECTED]
# ==============================================================================
# We use 'init fill-config' to create a config from our new base model.
# THE FIX: We point to the 'config.cfg' FILE inside the directory,
# not just the directory itself.
# Input:  ./base_model_with_labels/config.cfg
# Output: ./config.cfg

!python -m spacy init fill-config ./base_model_with_labels/config.cfg ./config.cfg

# NOTE(review): these messages print unconditionally, even if the command above
# reported an error; check the command's own output before trusting them.
print("\n--- config.cfg file created successfully! ---")
print("This config is now perfectly matched to your model.")

In [None]:
# ==============================================================================
# Cell 10 (NEW): Install Missing Lookups Data
# ==============================================================================
# This command installs the data package needed for 'lexeme_norm'
# NOTE(review): the version is not pinned, so the installed release may differ
# between runs.
!pip install -q spacy-lookups-data

# NOTE(review): printed regardless of whether pip succeeded.
print("\n--- spacy-lookups-data installed successfully! ---")

In [None]:
# ==============================================================================
# Cell 11: Milestone 2.2 - Model Training (Fine-Tuning)
# ==============================================================================
print("--- Starting Model Training ---")
print("This will take 5-10 minutes. Please wait...")

# Trains from ./config.cfg and writes results under ./output_model.
# The data paths and the epoch limit are passed as command-line overrides
# (--paths.train, --paths.dev, --training.max_epochs) instead of being edited
# into the config file.
# To experiment with hyperparameters, you can change 'max_epochs 5'
!python -m spacy train ./config.cfg --output ./output_model \
--paths.train ./train.spacy \
--paths.dev ./dev.spacy \
--training.max_epochs 5

# NOTE(review): these messages print even if the training command failed.
print("\n--- Training Complete! ---")
print("Your fine-tuned model is saved in 'output_model/model-best'.")

**New version (revised Milestone 2 setup)**


In [None]:
# ==============================================================================
# Cell 1: Install Libraries (Simplified & Corrected)
# ==============================================================================
# Uninstall previous attempts to ensure a clean slate
# (-y answers the confirmation prompt automatically)
!pip uninstall -y spacy spacy-transformers transformers tokenizers curated-transformers spacy-curated-transformers curated-tokenizers spacy-lookups-data en-core-web-trf

# Install the core libraries - let pip resolve dependencies
# NOTE(review): versions are not pinned, so the resolved set may change over time.
print("Installing spaCy, spacy-transformers, and lookups data...")
!pip install -q spacy spacy-transformers spacy-lookups-data

# Download the correct transformer model using the standard command
print("\nDownloading the 'en_core_web_trf' model...")
!python -m spacy download en_core_web_trf

# NOTE(review): the check marks below are printed unconditionally; they do not
# verify that the commands above succeeded.
print("\n--- Libraries and Model Installed ---")
print("✅ spaCy")
print("✅ spacy-transformers (and its dependencies like transformers, tokenizers)")
print("✅ spacy-lookups-data")
print("✅ en_core_web_trf model")
print("\n--> IMPORTANT: Please RESTART the Colab Runtime now! <--")
print("(Go to Runtime > Restart session / Restart runtime)")

In [None]:
# ==============================================================================
# Cell 2: Import All Libraries
# ==============================================================================
# Gathers every import for the Milestone 1 and Milestone 2 cells in one place.
# For M1
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# For M2
# NOTE(review): Doc, Example, train, Scorer and json are not referenced by the
# cells visible in this notebook; confirm before relying on or removing them.
import spacy
from spacy.tokens import DocBin, Doc
from spacy.training.example import Example
import re
import ast  # For safely evaluating the string-list in 'ner_entities'
import random
from spacy.cli.train import train
from spacy.scorer import Scorer
import json

# For File Upload
from google.colab import files
import io

# Suppress all warnings (no category given, so every warning is hidden)
warnings.filterwarnings('ignore')

print("All libraries imported successfully.")

In [None]:
# ==============================================================================
# Cell 3: Load Data (Robust Version)
# ==============================================================================
# Opens the Colab upload dialog, reads the first uploaded file as CSV and stores
# the result in `df`. Failures print a message instead of raising.

print("--- Please upload your 'preprocessed_content.csv' file ---")
print("(This is the file from inside the .zip)")

try:
    uploaded = files.upload()

    # With nothing uploaded, next(iter(...)) would raise StopIteration, whose
    # message is blank; report the missing upload explicitly instead.
    if not uploaded:
        raise ValueError("No file was uploaded. Please re-run this cell and choose the .csv file.")

    file_name = next(iter(uploaded))
    print(f"\nSuccessfully uploaded '{file_name}'")

    # The prompt above asks for the file from inside the .zip, so reject the
    # archive itself instead of handing compressed bytes to the CSV parser.
    if file_name.lower().endswith('.zip'):
        raise ValueError("You uploaded a .zip file. Please re-run this cell and upload the 'preprocessed_content.csv' file.")

    print("\nLoading data using the robust 'python' engine...")

    # latin1 decoding, the pure-Python parser and skipping of malformed rows
    df = pd.read_csv(
        io.BytesIO(uploaded[file_name]),
        encoding='latin1',
        on_bad_lines='skip',
        engine='python'
    )

    if df.empty:
        print("\n!!! ERROR: The loaded DataFrame is empty.")
    else:
        print(f"--- Successfully loaded data into DataFrame ---")

except KeyboardInterrupt:
    # KeyboardInterrupt derives from BaseException, so 'except Exception'
    # would let an interrupted upload escape as a raw traceback.
    print("\nUpload was canceled. Please re-run the cell and let the upload finish.")
except Exception as e:
    print(f"\nAn error occurred: {e}")

In [None]:
# ==============================================================================
# Cell 4: Preprocessing Pipeline (Milestone 1 Cleanup)
# ==============================================================================
# Requires a non-empty DataFrame 'df'; otherwise report the problem and stop.
if 'df' not in locals() or df.empty:
    print("!!! ERROR: Dataframe 'df' not found or is empty.")

else:
    print("--- Starting preprocessing pipeline ---")

    # Promote the 'Unnamed: 0' column to the index and drop the index's name
    if 'Unnamed: 0' in df.columns:
        print("Found 'Unnamed: 0' column, setting it as the index.")
        df = df.set_index('Unnamed: 0')
        df.index.name = None

    print(f"Original entity count: {df.shape[0]}")
    print("\n--- Data is clean and ready. ---")
    # Summarise columns, dtypes and non-null counts
    df.info()

In [None]:
# ==============================================================================
# Cell 5: Milestone 2.1 - Preprocessing Effectiveness & Load Model
# ==============================================================================
# Loads 'en_core_web_trf' into `nlp` (None when loading fails) and prints
# token / lemma / part-of-speech for the first 250 characters of the first
# non-null document.
if 'df' in locals():
    print("--- Demonstrating Preprocessing (Tokenization & Lemmatization) ---")

    try:
        # Load the specific transformer model we downloaded
        nlp = spacy.load("en_core_web_trf")
        print("Successfully loaded 'en_core_web_trf' model.")
    except OSError:
        print("!!! ERROR: 'en_core_web_trf' model not found.")
        print("Please ensure Cell 1 ran successfully and you restarted the runtime.")
        nlp = None # Set nlp to None if loading failed

    if nlp: # Proceed only if the model loaded
        # Guard against a column with no usable text; .iloc[0] on an empty
        # Series would raise IndexError.
        available_text = df['preprocessed_content'].dropna()
        if available_text.empty:
            print("!!! ERROR: No non-null 'preprocessed_content' rows found for the demo.")
        else:
            sample_text = str(available_text.iloc[0])
            sample_text_short = sample_text[:250]

            print(f"\nProcessing sample text:\n'{sample_text_short}...'")
            doc = nlp(sample_text_short)

            print("\n--- Analysis Results ---")
            print(f"{'Token':<15} | {'Lemma (Base Form)':<18} | {'Part of Speech':<15}")
            print("-" * 52)
            for token in doc:
                print(f"{token.text:<15} | {token.lemma_:<18} | {token.pos_:<15}")

            print("\n--- Preprocessing Demo Complete ---")

else:
    print("!!! ERROR: Dataframe 'df' not found. Please re-run Cell 3.")

In [None]:
# Prints True when PyTorch reports that CUDA is available, False otherwise.
import torch
print(torch.cuda.is_available())