In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    Trainer, TrainingArguments
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import (
    precision_recall_fscore_support, classification_report,
    accuracy_score, confusion_matrix, roc_auc_score
)
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
import chardet
from collections import Counter
import random
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import logging

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download required NLTK data.
# nltk.download() reports a failed download through its boolean return value
# rather than by raising, so every result is checked; a try/except alone would
# log success even when nothing was fetched.
print("Downloading NLTK resources...")
_missing_nltk_resources = []
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    try:
        if not nltk.download(_resource, quiet=True):
            _missing_nltk_resources.append(_resource)
    except Exception as e:
        logger.warning(f"NLTK download warning: {e}")
        _missing_nltk_resources.append(_resource)

if _missing_nltk_resources:
    logger.warning(f"NLTK resources not downloaded: {', '.join(_missing_nltk_resources)}")
else:
    logger.info("NLTK resources downloaded successfully")

class EnhancedUserStoryDataProcessor:
    """
    Enhanced data processor with robust encoding detection and validation
    """

    def __init__(self, train_folder, test_folder):
        """Store the source folders and prepare per-run bookkeeping.

        Args:
            train_folder: Directory that holds the training ``.txt`` files.
            test_folder: Directory that holds the test ``.txt`` files.
        """
        # Bookkeeping lists that load_txt_files() appends to.
        self.failed_files = []
        self.processed_files = []

        # English stop words from the NLTK corpus, kept as a set.
        self.stop_words = set(stopwords.words('english'))

        self.train_folder, self.test_folder = train_folder, test_folder

    def detect_encoding(self, file_path):
        """Guess the text encoding of ``file_path`` using chardet.

        The file is read in full as bytes and handed to ``chardet.detect``.
        A guess whose reported confidence is below 0.7 is discarded in favour
        of ``'utf-8'``. Any exception is logged as a warning and also yields
        ``'utf-8'``.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            The encoding reported by chardet, or ``'utf-8'`` as the fallback.
        """
        try:
            with open(file_path, 'rb') as handle:
                detection = chardet.detect(handle.read())

            # Low-confidence guesses are not trusted.
            if detection.get('confidence', 0) < 0.7:
                return 'utf-8'
            return detection.get('encoding', 'utf-8')
        except Exception as e:
            logger.warning(f"Encoding detection failed for {file_path}: {e}")
            return 'utf-8'

    def validate_user_story(self, story):
        """Validate if text is a proper user story"""
        if not story or len(story.strip()) < 10:
            return False

        story_lower = story.lower().strip()

        # Check for user story patterns
        has_role = bool(re.search(r'as an?\s+\w+', story_lower))
        has_action = any(word in story_lower for word in ['want', 'need', 'should', 'must', 'can'])
        has_benefit = 'so that' in story_lower or 'in order to' in story_lower

        # Must have at least role and action
        return has_role and has_action

    def load_txt_files(self, folder_path):
        """Enhanced file loading with validation and error tracking"""
        stories = []
        filenames = []

        if not os.path.exists(folder_path):
            logger.error(f"Folder not found: {folder_path}")
            return [], []

        txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
        logger.info(f"Found {len(txt_files)} .txt files in {folder_path}")

        for filename in txt_files:
            file_path = os.path.join(folder_path, filename)
            try:
                # Detect and use appropriate encoding
                encoding = self.detect_encoding(file_path)
                encodings_to_try = [encoding, 'utf-8', 'latin-1', 'cp1252', 'iso-8859-1']

                content = None
                used_encoding = None

                for enc in encodings_to_try:
                    try:
                        with open(file_path, 'r', encoding=enc) as file:
                            content = file.read().strip()
                            used_encoding = enc
                            break
                    except UnicodeDecodeError:
                        continue

                if content:
                    # Split and validate user stories
                    individual_stories = self.split_user_stories(content)
                    valid_stories = 0

                    for i, story in enumerate(individual_stories):
                        if self.validate_user_story(story):
                            stories.append(story.strip())
                            filenames.append(f"{filename}_story_{i+1}")
                            valid_stories += 1

                    self.processed_files.append({
                        'filename': filename,
                        'encoding': used_encoding,
                        'total_stories': len(individual_stories),
                        'valid_stories': valid_stories
                    })

                    logger.info(f"Processed {filename}: {valid_stories}/{len(individual_stories)} valid stories")
                else:
                    self.failed_files.append(filename)
                    logger.error(f"Could not read {filename} with any encoding")

            except Exception as e:
                self.failed_files.append(filename)
                logger.error(f"Error processing {filename}: {e}")

        logger.info(f"Successfully extracted {len(stories)} valid user stories from {folder_path}")
        return stories, filenames

    def split_user_stories(self, content):
        """Enhanced user story splitting with better pattern recognition"""
        # Clean content
        content = re.sub(r'[^\x00-\x7F]+', ' ', content)
        content = re.sub(r'\s+', ' ', content).strip()

        # Enhanced splitting patterns
        patterns = [
            r'\n\s*As\s+an?\s+',
            r'\.\s*As\s+an?\s+',
            r'\n\n+As\s+an?\s+',
            r'(?<=\.)\s*As\s+an?\s+',
            r'(?<=\n)\s*As\s+an?\s+'
        ]

        stories = [content]

        for pattern in patterns:
            new_stories = []
            for story in stories:
                parts = re.split(pattern, story, flags=re.IGNORECASE)
                if len(parts) > 1:
                    # Add first part if valid
                    if parts[0].strip() and self.validate_user_story(parts[0].strip()):
                        new_stories.append(parts[0].strip())

                    # Add remaining parts with "As a/an" prefix
                    for part in parts[1:]:
                        if part.strip():
                            # Determine article
                            next_word = part.split()[0].lower() if part.split() else ""
                            article = "an" if next_word and next_word[0] in 'aeiou' else "a"
                            reconstructed = f"As {article} {part.strip()}"
                            new_stories.append(reconstructed)
                else:
                    new_stories.append(story)
            stories = new_stories

        # Final validation and cleaning
        cleaned_stories = []
        for story in stories:
            story = story.strip()
            if self.validate_user_story(story):
                cleaned_stories.append(story)

        return cleaned_stories if cleaned_stories else [content]

    def convert_to_csv(self):
        """Load both folders, tabulate the stories and write them to one CSV.

        The CSV is written to the current working directory under the name
        ``user_stories_<YYYYmmdd_HHMMSS>.csv``. Its columns are ``filename``,
        ``user_story``, ``split`` ("train" or "test"), ``story_id``,
        ``word_count`` and ``char_count``.

        Returns:
            pandas.DataFrame: Train rows followed by test rows, with a fresh index.
        """
        logger.info("Starting TXT to CSV conversion...")

        def build_frame(split_name, texts, sources):
            # One row per story; ids are "<split>_<position>".
            return pd.DataFrame({
                'filename': sources,
                'user_story': texts,
                'split': split_name,
                'story_id': [f"{split_name}_{position}" for position in range(len(texts))],
                'word_count': [len(text.split()) for text in texts],
                'char_count': [len(text) for text in texts]
            })

        # Read both folders before building any frame.
        train_stories, train_files = self.load_txt_files(self.train_folder)
        test_stories, test_files = self.load_txt_files(self.test_folder)

        train_df = build_frame('train', train_stories, train_files)
        test_df = build_frame('test', test_stories, test_files)
        combined_df = pd.concat([train_df, test_df], ignore_index=True)

        # Timestamped file name so repeated runs do not overwrite each other.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        csv_filename = f'user_stories_{timestamp}.csv'
        combined_df.to_csv(csv_filename, index=False)

        for message in (
            f"Created {csv_filename} with {len(combined_df)} stories",
            f"Train: {len(train_df)}, Test: {len(test_df)}",
            f"Processed files: {len(self.processed_files)}",
            f"Failed files: {len(self.failed_files)}",
        ):
            logger.info(message)

        return combined_df

class AdvancedDataAugmenter:
    """
    Advanced data augmentation with multiple techniques
    """

    def __init__(self):
        self.synonyms = {
            'user': ['customer', 'client', 'end-user', 'person', 'individual'],
            'want': ['need', 'require', 'wish', 'desire', 'would like'],
            'system': ['application', 'platform', 'software', 'tool', 'service'],
            'should': ['must', 'shall', 'ought to', 'needs to', 'has to'],
            'can': ['able to', 'capable of', 'may', 'could'],
            'will': ['shall', 'going to', 'intends to', 'plans to'],
            'create': ['generate', 'build', 'make', 'develop', 'produce'],
            'manage': ['handle', 'control', 'oversee', 'administer', 'supervise'],
            'view': ['see', 'display', 'show', 'observe', 'examine'],
            'update': ['modify', 'change', 'edit', 'revise', 'alter']
        }

        self.role_synonyms = {
            'administrator': ['admin', 'manager', 'supervisor'],
            'developer': ['programmer', 'engineer', 'coder'],
            'user': ['customer', 'client', 'person']
        }

    def synonym_replacement(self, story, replacement_prob=0.3):
        """Replace words with synonyms"""
        words = story.split()
        new_words = []

        for word in words:
            word_lower = word.lower().strip('.,!?;:')
            if (word_lower in self.synonyms and
                random.random() < replacement_prob):
                synonym = random.choice(self.synonyms[word_lower])
                new_word = synonym if word.islower() else synonym.capitalize()
                new_words.append(new_word)
            else:
                new_words.append(word)

        return ' '.join(new_words)

    def role_variation(self, story):
        """Create variations with different roles"""
        variations = [story]

        # Extract current role
        role_match = re.search(r'As an?\s+(\w+)', story, re.IGNORECASE)
        if role_match:
            current_role = role_match.group(1).lower()
            if current_role in self.role_synonyms:
                for synonym in self.role_synonyms[current_role]:
                    new_story = re.sub(
                        r'(As an?\s+)\w+',
                        f'\\1{synonym}',
                        story,
                        flags=re.IGNORECASE
                    )
                    variations.append(new_story)

        return variations

    def paraphrase_story(self, story):
        """Generate multiple paraphrased versions"""
        variations = [story]  # Original

        # Synonym replacement
        syn_version = self.synonym_replacement(story)
        if syn_version != story:
            variations.append(syn_version)

        # Role variations
        role_variations = self.role_variation(story)
        variations.extend([v for v in role_variations if v not in variations])

        # Sentence reordering for complex stories
        sentences = [s.strip() for s in story.split('.') if s.strip()]
        if len(sentences) > 2:
            shuffled_sentences = sentences.copy()
            random.shuffle(shuffled_sentences)
            reordered = '. '.join(shuffled_sentences) + '.'
            if reordered not in variations:
                variations.append(reordered)

        return variations[:4]  # Max 4 variations

    def augment_data(self, df, augmentation_factor=3):
        """Enhanced data augmentation with balancing"""
        logger.info("Starting advanced data augmentation...")

        augmented_data = []
        augmentation_stats = {'original': 0, 'synonym': 0, 'role': 0, 'reorder': 0}

        for idx, row in df.iterrows():
            original_story = row['user_story']
            variations = self.paraphrase_story(original_story)

            # Add variations up to augmentation factor
            for i, aug_story in enumerate(variations[:augmentation_factor]):
                aug_type = 'original' if i == 0 else f'aug_{i}'
                augmented_data.append({
                    'filename': f"{row['filename']}_{aug_type}",
                    'user_story': aug_story,
                    'split': row['split'],
                    'story_id': f"{row['story_id']}_{aug_type}",
                    'original_filename': row['filename'],
                    'augmentation_type': aug_type,
                    'word_count': len(aug_story.split()),
                    'char_count': len(aug_story)
                })

                if i == 0:
                    augmentation_stats['original'] += 1
                else:
                    augmentation_stats[f'aug_{i}'] = augmentation_stats.get(f'aug_{i}', 0) + 1

        augmented_df = pd.DataFrame(augmented_data)

        # Save augmented data
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        aug_filename = f'augmented_data_{timestamp}.csv'
        augmented_df.to_csv(aug_filename, index=False)

        logger.info(f"Created {aug_filename} with {len(augmented_df)} stories")
        logger.info(f"Augmentation stats: {augmentation_stats}")

        return augmented_df

class ComprehensiveUserStoryQuantifier:
    """
    Enhanced quantifier with improved scoring algorithms
    """

    def __init__(self):
        self.elements = [
            'task_identification', 'task_nature', 'role_identification',
            'end_result_acceptance_criteria', 'dependency', 'business_need',
            'priority', 'quality_requirement', 'estimable', 'unambiguous',
            'well_formed', 'problem_oriented', 'unique', 'testable'
        ]

        # Enhanced pattern matching with weights
        self.patterns = {
            'task_identification': {
                'patterns': [
                    r'\b(add|create|manage|track|process|generate|submit|upload|download)\b',
                    r'\b(view|edit|delete|search|filter|sort|schedule|assign|update)\b',
                    r'\b(configure|setup|install|remove|modify|customize)\b'
                ],
                'weight': 1.0
            },
            'task_nature': {
                'patterns': [
                    r'\b(want to|need to|should|must|will|can|able to)\b',
                    r'\b(have to|require|wish to|intend to|would like to)\b'
                ],
                'weight': 1.2
            },
            'role_identification': {
                'patterns': [
                    r'\bas an?\s+\w+',
                    r'\bas an?\s+(user|admin|customer|manager|developer)',
                    r'\bas an?\s+\w+\s+(user|admin|member|staff)'
                ],
                'weight': 1.5
            },
            'end_result_acceptance_criteria': {
                'patterns': [
                    r'\bso that\b',
                    r'\bin order to\b',
                    r'\bbecause\b',
                    r'\bto ensure\b',
                    r'\benabling\b'
                ],
                'weight': 1.3
            },
            'dependency': {
                'patterns': [
                    r'\b(after|before|once|when|first|then)\b',
                    r'\b(requires|depends on|prerequisite|following)\b'
                ],
                'weight': 0.8
            },
            'business_need': {
                'patterns': [
                    r'\b(improve|efficiency|compliance|service|business)\b',
                    r'\b(value|benefit|goal|objective|performance|quality)\b'
                ],
                'weight': 0.9
            }
        }

    def calculate_pattern_score(self, story, element):
        """Calculate weighted pattern score"""
        if element not in self.patterns:
            return self._heuristic_score(story, element)

        story_lower = story.lower()
        pattern_info = self.patterns[element]
        patterns = pattern_info['patterns']
        weight = pattern_info['weight']

        total_score = 0
        for pattern in patterns:
            matches = len(re.findall(pattern, story_lower))
            total_score += min(matches * 0.4, 1.0)

        # Normalize and apply weight
        normalized_score = min(total_score / len(patterns), 1.0)
        weighted_score = min(normalized_score * weight, 1.0)

        return weighted_score

    def _heuristic_score(self, story, element):
        """Enhanced heuristic scoring for non-pattern elements"""
        story_lower = story.lower()

        if element == 'unambiguous':
            ambiguous_words = ['maybe', 'possibly', 'might', 'could', 'perhaps',
                             'sometimes', 'usually', 'probably', 'potentially']
            specific_words = ['specific', 'exactly', 'precisely', 'must', 'will',
                            'shall', 'always', 'all', 'every', 'clearly']

            ambiguous_count = sum(1 for word in ambiguous_words if word in story_lower)
            specific_count = sum(1 for word in specific_words if word in story_lower)

            base_score = 0.7
            score = base_score + (specific_count * 0.1) - (ambiguous_count * 0.2)
            return max(0, min(1.0, score))

        elif element == 'problem_oriented':
            problem_indicators = ['problem', 'issue', 'challenge', 'need', 'want',
                                'require', 'lack', 'difficulty', 'concern', 'pain']
            solution_indicators = ['implement', 'code', 'develop', 'build', 'create',
                                 'design', 'program', 'construct', 'technical']

            problem_score = sum(1 for word in problem_indicators if word in story_lower)
            solution_score = sum(1 for word in solution_indicators if word in story_lower)

            total_indicators = problem_score + solution_score
            if total_indicators == 0:
                return 0.5

            problem_ratio = problem_score / total_indicators
            return min(problem_ratio * 1.2, 1.0)

        elif element == 'unique':
            words = story_lower.split()
            if not words:
                return 0.0

            unique_words = len(set(words))
            total_words = len(words)
            uniqueness_ratio = unique_words / total_words

            # Bonus for specific details
            has_numbers = bool(re.search(r'\d+', story))
            has_specific_terms = any(term in story_lower for term in
                                   ['specific', 'particular', 'exactly', 'precisely'])

            bonus = 0.1 * (has_numbers + has_specific_terms)
            return min(uniqueness_ratio + bonus, 1.0)

        elif element == 'testable':
            testable_indicators = ['verify', 'test', 'check', 'validate', 'confirm',
                                 'ensure', 'measure', 'track', 'monitor']
            measurable_patterns = [r'\d+', r'\b(all|every|each|successfully|correctly|properly)\b']

            testable_score = sum(1 for word in testable_indicators if word in story_lower)
            measurable_score = sum(1 for pattern in measurable_patterns
                                 if re.search(pattern, story_lower))

            combined_score = (testable_score * 0.4) + (measurable_score * 0.3)
            return min(combined_score, 1.0)

        elif element == 'estimable':
            complexity_indicators = ['simple', 'easy', 'straightforward', 'basic', 'quick']
            difficulty_indicators = ['complex', 'difficult', 'comprehensive', 'detailed', 'advanced']

            simple_count = sum(1 for word in complexity_indicators if word in story_lower)
            complex_count = sum(1 for word in difficulty_indicators if word in story_lower)

            # Stories with clear complexity indicators are more estimable
            if simple_count > 0 or complex_count > 0:
                return 0.8

            # Medium estimability for average stories
            word_count = len(story.split())
            if 10 <= word_count <= 30:
                return 0.6
            else:
                return 0.4

        else:
            return 0.5  # Default score

    def extract_enhanced_features(self, story):
        """Extract comprehensive features with confidence scores"""
        features = {}
        confidence_scores = {}

        for element in self.elements:
            score = self.calculate_pattern_score(story, element)
            features[element] = score

            # Calculate confidence based on score and story characteristics
            confidence = self._calculate_confidence(story, element, score)
            confidence_scores[f'{element}_confidence'] = confidence

        # Add metadata features
        features.update({
            'story_length': len(story.split()),
            'story_char_length': len(story),
            'has_so_that': 1 if 'so that' in story.lower() else 0,
            'has_as_a': 1 if re.search(r'as an?\s', story.lower()) else 0,
            'has_i_want': 1 if 'i want' in story.lower() else 0,
            'sentence_count': len([s for s in story.split('.') if s.strip()]),
            'complexity_score': self._calculate_complexity(story)
        })

        # Add confidence scores
        features.update(confidence_scores)

        return features

    def _calculate_confidence(self, story, element, score):
        """Calculate confidence score for the element assessment"""
        # Base confidence on score extremes
        if score >= 0.8 or score <= 0.2:
            base_confidence = 0.9
        elif 0.4 <= score <= 0.6:
            base_confidence = 0.6
        else:
            base_confidence = 0.75

        # Adjust based on story length and clarity
        word_count = len(story.split())
        if word_count < 5:
            base_confidence *= 0.7
        elif word_count > 50:
            base_confidence *= 0.8

        return min(base_confidence, 1.0)

    def _calculate_complexity(self, story):
        """Calculate overall story complexity"""
        factors = {
            'length': len(story.split()) / 50,  # Normalize to 50 words
            'conjunctions': story.lower().count(' and ') + story.lower().count(' or '),
            'conditions': story.lower().count(' if ') + story.lower().count(' when '),
            'technical_terms': len(re.findall(r'\b(system|database|api|interface|algorithm)\b', story.lower()))
        }

        complexity = sum(min(value, 1.0) for value in factors.values()) / len(factors)
        return min(complexity, 1.0)

    def quantify_stories(self, df):
        """Compute the feature set for every story in ``df`` and save it.

        Each output row carries ``filename``, ``user_story``, ``split`` and
        ``story_id`` (``story_<index>`` when the input row has none), followed
        by everything returned by ``extract_enhanced_features``. The frame is
        also written to ``quantified_user_stories_<YYYYmmdd_HHMMSS>.csv`` in
        the current working directory.

        Args:
            df: Frame with at least ``filename``, ``user_story`` and ``split`` columns.

        Returns:
            pandas.DataFrame: One row per input story.
        """
        logger.info("Starting comprehensive user story quantification...")

        records = []
        for idx, row in df.iterrows():
            text = row['user_story']
            computed = self.extract_enhanced_features(text)

            # Identity columns first, then the computed features.
            records.append({
                'filename': row['filename'],
                'user_story': text,
                'split': row['split'],
                'story_id': row.get('story_id', f'story_{idx}'),
                **computed,
            })

        quantified_df = pd.DataFrame(records)

        # Save with timestamp
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        quant_filename = f'quantified_user_stories_{stamp}.csv'
        quantified_df.to_csv(quant_filename, index=False)

        logger.info(f"Quantified {len(quantified_df)} user stories")
        logger.info(f"Saved to {quant_filename}")

        return quantified_df

class ImprovedBERTClassifier(nn.Module):
    """
    Enhanced BERT classifier with dropout and regularization
    """

    def __init__(self, model_name='distilbert-base-uncased', num_classes=14, dropout_rate=0.3):
        """Build the pretrained encoder and the classification head.

        Args:
            model_name: Identifier passed to ``AutoModel.from_pretrained``.
            num_classes: Output width of the final linear layer.
            dropout_rate: Probability used by ``dropout1`` and by the dropout
                inside the head; ``dropout2`` uses half of it.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout1 = nn.Dropout(dropout_rate)
        # NOTE(review): dropout2 is created here but forward() in this class never applies it.
        self.dropout2 = nn.Dropout(dropout_rate * 0.5)

        # Multi-layer classifier
        # Head: hidden_size -> hidden_size // 2 -> num_classes, with ReLU and dropout in between.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size // 2, num_classes)
        )

        # Initialize weights
        # Only the head is re-initialised; see _init_weights.
        self._init_weights()

    def _init_weights(self):
        """Initialize classifier weights"""
        for module in self.classifier:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Use mean pooling over sequence length
        last_hidden_state = outputs.last_hidden_state
        pooled_output = torch.mean(last_hidden_state, dim=1)

        # Apply dropout
        pooled_output = self.dropout1(pooled_output)

        # Classification
        logits = self.classifier(pooled_output)
        return torch.sigmoid(logits)

class EnhancedUserStoryPipeline:
    """
    Comprehensive pipeline with advanced features and metrics
    """

    def __init__(self, train_folder, test_folder):
        """Create the pipeline stages and empty result containers.

        Args:
            train_folder: Path of the training folder; stored and forwarded
                to the data processor.
            test_folder: Path of the test folder; stored and forwarded to the
                data processor.
        """
        # Input locations
        self.train_folder, self.test_folder = train_folder, test_folder

        # Pipeline stages, built in the order they are later used
        self.processor = EnhancedUserStoryDataProcessor(self.train_folder, self.test_folder)
        self.augmenter = AdvancedDataAugmenter()
        self.quantifier = ComprehensiveUserStoryQuantifier()
        self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

        # Filled in once the pipeline has been run
        self.model = None
        self.training_history = []
        self.final_metrics = {}

    def create_balanced_labels(self, quantified_df):
        """Create balanced synthetic labels with adaptive thresholds"""
        logger.info("Creating balanced synthetic labels...")

        label_columns = self.quantifier.elements
        labels = []
        threshold_stats = {}

        # Calculate adaptive thresholds for each element
        adaptive_thresholds = {}
        for element in label_columns:
            scores = quantified_df[element].values
            mean_score = np.mean(scores)
            std_score = np.std(scores)

            # Adaptive threshold based on distribution
            if element in ['well_formed', 'role_identification', 'task_nature']:
                threshold = max(0.3, mean_score - 0.5 * std_score)
            elif element in ['priority', 'quality_requirement', 'dependency']:
                threshold = max(0.7, mean_score + 0.5 * std_score)
            else:
                threshold = mean_score

            adaptive_thresholds[element] = min(max(threshold, 0.1), 0.9)

        # Create labels
        for _, row in quantified_df.iterrows():
            label_row = []
            for element in label_columns:
                score = row[element]
                threshold = adaptive_thresholds[element]
                label = 1 if score > threshold else 0
                label_row.append(label)

                # Track statistics
                if element not in threshold_stats:
                    threshold_stats[element] = {'positive': 0, 'total': 0, 'threshold': threshold}
                threshold_stats[element]['positive'] += label
                threshold_stats[element]['total'] += 1

            labels.append(label_row)

        # Log class distribution
        for element, stats in threshold_stats.items():
            positive_ratio = stats['positive'] / stats['total']
            logger.info(f"{element}: {positive_ratio:.3f} positive ratio (threshold: {stats['threshold']:.3f})")

        return np.array(labels), adaptive_thresholds

    def train_model_with_validation(self, train_stories, train_labels, val_stories, val_labels,
                                    epochs=30, batch_size=16, learning_rate=2e-5,
                                    max_patience=5, checkpoint_path='best_model.pth'):
        """Train an ImprovedBERTClassifier with validation-based early stopping.

        Each epoch runs one optimisation pass over the training set (AdamW,
        BCE loss, gradient-norm clipping at 1.0) and one evaluation pass over
        the validation set. The learning rate is halved by ReduceLROnPlateau
        when validation loss stalls, the weights with the lowest validation
        loss are written to ``checkpoint_path``, and training stops after
        ``max_patience`` consecutive epochs without improvement. One record
        per epoch is appended to ``self.training_history``.

        Args:
            train_stories: Training texts.
            train_labels: Label vectors aligned with ``train_stories``.
            val_stories: Validation texts.
            val_labels: Label vectors aligned with ``val_stories``.
            epochs: Maximum number of epochs.
            batch_size: Batch size for both data loaders.
            learning_rate: Initial AdamW learning rate.
            max_patience: Non-improving epochs tolerated before stopping.
            checkpoint_path: File the best weights are saved to and restored from.

        Returns:
            The model carrying the best checkpoint's weights, or the weights
            from the last epoch when no checkpoint was written in this call.

        Raises:
            ValueError: If the training or validation set is empty.
        """
        logger.info("Starting enhanced BERT training with validation...")

        # Create datasets
        train_dataset = UserStoryDataset(train_stories, train_labels, self.tokenizer)
        val_dataset = UserStoryDataset(val_stories, val_labels, self.tokenizer)
        if len(train_dataset) == 0 or len(val_dataset) == 0:
            # Fail early with a clear message instead of dividing by zero batches.
            raise ValueError("Training and validation sets must both contain at least one story")

        # Initialize model
        model = ImprovedBERTClassifier()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        logger.info(f"Using device: {device}")

        # Create data loaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Training setup
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
        criterion = nn.BCELoss()
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, factor=0.5)

        def run_epoch(loader, training):
            """Run one pass over ``loader`` and return the mean per-batch loss."""
            model.train(training)
            total_loss = 0.0
            with torch.set_grad_enabled(training):
                for batch in loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)

                    if training:
                        optimizer.zero_grad()

                    outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs, labels)

                    if training:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                        optimizer.step()

                    total_loss += loss.item()
            return total_loss / len(loader)

        # Training loop with validation
        best_val_loss = float('inf')
        patience_counter = 0
        checkpoint_saved = False

        for epoch in range(epochs):
            avg_train_loss = run_epoch(train_loader, training=True)
            avg_val_loss = run_epoch(val_loader, training=False)

            # Learning rate scheduling
            scheduler.step(avg_val_loss)

            # Early stopping bookkeeping
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                patience_counter = 0
                # Save best model
                torch.save(model.state_dict(), checkpoint_path)
                checkpoint_saved = True
            else:
                patience_counter += 1

            # Log progress against the real epoch budget
            logger.info(
                f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}"
            )

            # Store training history
            self.training_history.append({
                'epoch': epoch + 1,
                'train_loss': avg_train_loss,
                'val_loss': avg_val_loss,
                'lr': optimizer.param_groups[0]['lr']
            })

            if patience_counter >= max_patience:
                logger.info(f"Early stopping at epoch {epoch+1}")
                break

        # Restore the best weights, but only from a checkpoint written by this
        # call: a file left over from an earlier run must not be loaded.
        if checkpoint_saved:
            model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        else:
            logger.warning("Validation loss never improved; returning last-epoch weights")
        return model

    def calculate_comprehensive_metrics(self, true_labels, predictions, threshold=0.5):
        """Compute per-element and aggregate multi-label metrics.

        Args:
            true_labels: 0/1 array of shape ``(n_samples, n_elements)``, with
                columns ordered like ``self.quantifier.elements``.
            predictions: Score array of the same shape.
            threshold: Scores strictly above this value count as positive.

        Returns:
            Dict with these keys:

            * ``element_metrics``: per-element precision, recall, f1,
              accuracy, auc, specificity, sensitivity, support and the four
              confusion-matrix counts. ``auc`` is 0.0 when it is undefined
              (for example when a column has a single class).
            * ``macro_metrics``: unweighted means of the per-element values.
            * ``micro_metrics``: precision/recall/F1 pooled over every
              (sample, element) cell for the positive class, plus per-cell
              accuracy.
            * ``weighted_metrics``: per-element precision/recall/F1 averaged
              with weights equal to each element's positive support.
            * ``overall_metrics``: the raw per-element lists behind the macro
              averages.
        """
        logger.info("Calculating comprehensive metrics...")

        true_labels = np.asarray(true_labels)
        predictions = np.asarray(predictions)

        # Convert predictions to binary
        binary_predictions = (predictions > threshold).astype(int)

        # Per-element metrics
        element_metrics = {}
        overall_metrics = {'precision': [], 'recall': [], 'f1': [], 'accuracy': []}

        for i, element in enumerate(self.quantifier.elements):
            y_true = true_labels[:, i]
            y_pred = binary_predictions[:, i]

            # Basic metrics
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_true,
                y_pred,
                average='binary',
                zero_division=0
            )

            accuracy = accuracy_score(y_true, y_pred)

            # AUC is undefined for a single-class column. Depending on the
            # scikit-learn version this raises ValueError or returns NaN;
            # both cases fall back to 0.0 so downstream averages and
            # rankings stay finite.
            try:
                auc_score = roc_auc_score(y_true, predictions[:, i])
            except ValueError:
                auc_score = 0.0
            if not np.isfinite(auc_score):
                auc_score = 0.0

            # Pinning labels=[0, 1] always yields a 2x2 matrix, so the counts
            # remain correct even when only one class occurs in the column.
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

            # Specificity and sensitivity
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
            sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0

            element_metrics[element] = {
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'accuracy': accuracy,
                'auc': auc_score,
                'specificity': specificity,
                'sensitivity': sensitivity,
                'support': np.sum(y_true),
                'true_positives': tp,
                'false_positives': fp,
                'true_negatives': tn,
                'false_negatives': fn
            }

            overall_metrics['precision'].append(precision)
            overall_metrics['recall'].append(recall)
            overall_metrics['f1'].append(f1)
            overall_metrics['accuracy'].append(accuracy)

        # Macro averages
        macro_metrics = {
            'macro_precision': np.mean(overall_metrics['precision']),
            'macro_recall': np.mean(overall_metrics['recall']),
            'macro_f1': np.mean(overall_metrics['f1']),
            'macro_accuracy': np.mean(overall_metrics['accuracy']),
            'macro_auc': np.mean([m['auc'] for m in element_metrics.values()])
        }

        # Micro averages. The 2-D indicator arrays are passed directly so
        # scikit-learn pools TP/FP/FN of the positive class across elements.
        # Flattening first would score "0" and "1" as two classes of a single
        # problem, which makes micro precision, recall and F1 all equal to
        # plain accuracy.
        micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
            true_labels, binary_predictions, average='micro', zero_division=0
        )

        micro_metrics = {
            'micro_precision': micro_precision,
            'micro_recall': micro_recall,
            'micro_f1': micro_f1,
            # Per-cell accuracy over every (sample, element) pair.
            'micro_accuracy': accuracy_score(true_labels.flatten(), binary_predictions.flatten())
        }

        # Weighted averages: per-element scores weighted by positive support.
        weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
            true_labels, binary_predictions, average='weighted', zero_division=0
        )

        weighted_metrics = {
            'weighted_precision': weighted_precision,
            'weighted_recall': weighted_recall,
            'weighted_f1': weighted_f1
        }

        return {
            'element_metrics': element_metrics,
            'macro_metrics': macro_metrics,
            'micro_metrics': micro_metrics,
            'weighted_metrics': weighted_metrics,
            'overall_metrics': overall_metrics
        }

    def run_comprehensive_pipeline(self):
        """Run every stage: load, augment, quantify, label, train, predict, evaluate, save.

        Returns:
            ``(quantified_df, metrics)`` on success, or ``(None, None)`` when
            no data was loaded or the quantified data lacks ``'train'`` or
            ``'test'`` rows.

        Raises:
            Exception: Any error raised by a stage is logged and re-raised.
        """
        logger.info(" Starting Comprehensive User Story Quantification Pipeline")
        print("=" * 80)

        try:
            # Step 1: Enhanced Data Processing
            logger.info(" Step 1: Enhanced Dataset Processing...")
            df = self.processor.convert_to_csv()

            if len(df) == 0:
                logger.error("No data found! Check folder paths.")
                return None, None

            # Step 2: Advanced Data Augmentation
            logger.info(" Step 2: Advanced Data Augmentation...")
            augmented_df = self.augmenter.augment_data(df, augmentation_factor=2)

            # Step 3: Comprehensive Quantification
            logger.info(" Step 3: Comprehensive User Story Quantification...")
            quantified_df = self.quantifier.quantify_stories(augmented_df)

            # Step 4: Prepare Enhanced Training Data
            logger.info(" Step 4: Preparing Enhanced Training Data...")
            # Positional boolean masks: the label array is in row order, so
            # selecting with DataFrame index labels is only correct when the
            # index happens to be 0..n-1.
            is_train = (quantified_df['split'] == 'train').to_numpy()
            is_test = (quantified_df['split'] == 'test').to_numpy()
            if not is_train.any() or not is_test.any():
                logger.error("Quantified data must contain both 'train' and 'test' rows.")
                return None, None

            train_df = quantified_df[is_train].copy()
            test_df = quantified_df[is_test].copy()

            # Create balanced labels
            all_labels, thresholds = self.create_balanced_labels(quantified_df)
            train_labels = all_labels[is_train]
            test_labels = all_labels[is_test]

            train_stories = train_df['user_story'].tolist()
            test_stories = test_df['user_story'].tolist()

            # Create validation split, stratified on the first label column
            # when every class there has the two members stratification needs.
            stratify_target = train_labels[:, 0]
            _, class_counts = np.unique(stratify_target, return_counts=True)
            if class_counts.min() < 2:
                logger.warning("Too few samples per class to stratify; using a plain random split")
                stratify_target = None

            train_stories_split, val_stories, train_labels_split, val_labels = train_test_split(
                train_stories, train_labels, test_size=0.2, random_state=42, stratify=stratify_target
            )

            # Step 5: Enhanced Model Training
            logger.info(" Step 5: Enhanced BERT Model Training...")
            self.model = self.train_model_with_validation(
                train_stories_split, train_labels_split, val_stories, val_labels
            )

            # Step 6: Generate Predictions
            logger.info(" Step 6: Generating Predictions...")
            predictions = self.predict_with_bert(self.model, test_stories)

            # Step 7: Comprehensive Evaluation
            logger.info(" Step 7: Comprehensive Model Evaluation...")
            metrics = self.calculate_comprehensive_metrics(test_labels, predictions)

            # Step 8: Save Enhanced Results
            logger.info(" Step 8: Saving Enhanced Results...")
            self.save_comprehensive_results(test_df, predictions, test_labels, metrics, thresholds)

            # Store final metrics
            self.final_metrics = metrics

            logger.info(" Pipeline completed successfully!")
            return quantified_df, metrics

        except Exception as e:
            logger.error(f"Pipeline failed: {e}")
            raise

    def predict_with_bert(self, model, stories):
        """Generate predictions with batch processing"""
        logger.info("Generating predictions with enhanced BERT model...")
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.eval()
        predictions = []

        batch_size = 32
        with torch.no_grad():
            for i in range(0, len(stories), batch_size):
                batch_stories = stories[i:i+batch_size]

                # Tokenize batch
                encodings = self.tokenizer(
                    batch_stories,
                    truncation=True,
                    padding='max_length',
                    max_length=512,
                    return_tensors='pt'
                )

                input_ids = encodings['input_ids'].to(device)
                attention_mask = encodings['attention_mask'].to(device)

                outputs = model(input_ids, attention_mask)
                predictions.extend(outputs.cpu().numpy())

        return np.array(predictions)

    def save_comprehensive_results(self, test_df, predictions, true_labels, metrics, thresholds):
        """Save comprehensive results with visualizations"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save detailed predictions
        results_df = test_df.copy()

        for i, element in enumerate(self.quantifier.elements):
            results_df[f'{element}_predicted'] = predictions[:, i]
            results_df[f'{element}_true_label'] = true_labels[:, i]
            results_df[f'{element}_binary_pred'] = (predictions[:, i] > 0.5).astype(int)
            results_df[f'{element}_threshold'] = thresholds[element]

        results_df.to_csv(f'comprehensive_results_{timestamp}.csv', index=False)

        # Save metrics
        metrics_df = pd.DataFrame(metrics['element_metrics']).T
        metrics_df.to_csv(f'comprehensive_metrics_{timestamp}.csv')

        # Create comprehensive report
        self.generate_comprehensive_report(metrics, timestamp)

        # Save training history
        if self.training_history:
            history_df = pd.DataFrame(self.training_history)
            history_df.to_csv(f'training_history_{timestamp}.csv', index=False)

        logger.info(f"Comprehensive results saved with timestamp: {timestamp}")

    def generate_comprehensive_report(self, metrics, timestamp):
        """Generate detailed evaluation report"""
        report = f"""
{'='*80}
COMPREHENSIVE USER STORY QUANTIFICATION EVALUATION REPORT
{'='*80}
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

DATASET OVERVIEW:
- Training Stories: {len([h for h in self.training_history if 'train_loss' in h])}
- Test Stories Evaluated: {len(metrics['element_metrics'])}
- Total Elements Evaluated: {len(self.quantifier.elements)}

OVERALL PERFORMANCE METRICS:
{'='*50}
Macro Averages:
  • Precision: {metrics['macro_metrics']['macro_precision']:.4f}
  • Recall: {metrics['macro_metrics']['macro_recall']:.4f}
  • F1-Score: {metrics['macro_metrics']['macro_f1']:.4f}
  • Accuracy: {metrics['macro_metrics']['macro_accuracy']:.4f}
  • AUC: {metrics['macro_metrics']['macro_auc']:.4f}

Micro Averages:
  • Precision: {metrics['micro_metrics']['micro_precision']:.4f}
  • Recall: {metrics['micro_metrics']['micro_recall']:.4f}
  • F1-Score: {metrics['micro_metrics']['micro_f1']:.4f}
  • Accuracy: {metrics['micro_metrics']['micro_accuracy']:.4f}

Weighted Averages:
  • Precision: {metrics['weighted_metrics']['weighted_precision']:.4f}
  • Recall: {metrics['weighted_metrics']['weighted_recall']:.4f}
  • F1-Score: {metrics['weighted_metrics']['weighted_f1']:.4f}

DETAILED PER-ELEMENT PERFORMANCE:
{'='*50}"""

        for element, metric in metrics['element_metrics'].items():
            report += f"""
{element.replace('_', ' ').title()}:
  • Precision: {metric['precision']:.4f}
  • Recall: {metric['recall']:.4f}
  • F1-Score: {metric['f1']:.4f}
  • Accuracy: {metric['accuracy']:.4f}
  • AUC: {metric['auc']:.4f}
  • Specificity: {metric['specificity']:.4f}
  • Sensitivity: {metric['sensitivity']:.4f}
  • Support: {metric['support']}
  • Confusion Matrix: TP={metric['true_positives']}, FP={metric['false_positives']}, TN={metric['true_negatives']}, FN={metric['false_negatives']}
"""

        # Element ranking
        report += f"""
ELEMENT PERFORMANCE RANKING:
{'='*50}
By F1-Score:"""

        sorted_elements = sorted(metrics['element_metrics'].items(),
                               key=lambda x: x[1]['f1'], reverse=True)

        for i, (element, metric) in enumerate(sorted_elements, 1):
            report += f"""
{i:2d}. {element.replace('_', ' ').title()}: {metric['f1']:.4f}"""

        report += f"""

By AUC Score:"""

        sorted_by_auc = sorted(metrics['element_metrics'].items(),
                              key=lambda x: x[1]['auc'], reverse=True)

        for i, (element, metric) in enumerate(sorted_by_auc, 1):
            report += f"""
{i:2d}. {element.replace('_', ' ').title()}: {metric['auc']:.4f}"""

        # Performance analysis
        report += f"""

PERFORMANCE ANALYSIS:
{'='*50}
High Performing Elements (F1 > 0.7):"""

        high_performers = [elem for elem, metrics in metrics['element_metrics'].items()
                          if metrics['f1'] > 0.7]
        if high_performers:
            for elem in high_performers:
                report += f"""
  • {elem.replace('_', ' ').title()}: {metrics['element_metrics'][elem]['f1']:.4f}"""
        else:
            report += "\n  • None"

        report += f"""

Medium Performing Elements (0.3 < F1 <= 0.7):"""

        medium_performers = [elem for elem, metrics in metrics['element_metrics'].items()
                           if 0.3 < metrics['f1'] <= 0.7]
        if medium_performers:
            for elem in medium_performers:
                report += f"""
  • {elem.replace('_', ' ').title()}: {metrics['element_metrics'][elem]['f1']:.4f}"""
        else:
            report += "\n  • None"

        report += f"""

Low Performing Elements (F1 <= 0.3):"""

        low_performers = [elem for elem, metrics in metrics['element_metrics'].items()
                         if metrics['f1'] <= 0.3]
        if low_performers:
            for elem in low_performers:
                report += f"""
  • {elem.replace('_', ' ').title()}: {metrics['element_metrics'][elem]['f1']:.4f}"""
        else:
            report += "\n  • None"

        # Training information
        if self.training_history:
            final_epoch = self.training_history[-1]
            report += f"""

TRAINING INFORMATION:
{'='*50}
  • Total Epochs: {len(self.training_history)}
  • Final Training Loss: {final_epoch['train_loss']:.4f}
  • Final Validation Loss: {final_epoch['val_loss']:.4f}
  • Final Learning Rate: {final_epoch['lr']:.2e}"""

        report += f"""

RECOMMENDATIONS:
{'='*50}"""

        if len(high_performers) >= len(self.quantifier.elements) * 0.5:
            report += """
   Model shows strong performance on majority of elements
  • Consider deploying for production use
  • Focus on improving low-performing elements"""
        elif len(medium_performers) >= len(self.quantifier.elements) * 0.5:
            report += """
    Model shows moderate performance
  • Consider additional training data
  • Experiment with different architectures
  • Review feature engineering for low-performing elements"""
        else:
            report += """
   Model needs significant improvement
  • Increase training data size
  • Review labeling strategy
  • Consider ensemble methods
  • Analyze class imbalance issues"""

        report += f"""

FILES GENERATED:
{'='*50}
   comprehensive_results_{timestamp}.csv
   comprehensive_metrics_{timestamp}.csv
   comprehensive_report_{timestamp}.txt"""

        if self.training_history:
            report += f"""
   training_history_{timestamp}.csv"""

        report += f"""
   best_model.pth

{'='*80}
END OF REPORT
{'='*80}
"""

        # Save report
        with open(f'comprehensive_report_{timestamp}.txt', 'w') as f:
            f.write(report)

        # Print summary to console
        print(report)

        return report

class UserStoryDataset(Dataset):
    """Map-style dataset that tokenizes one story per item.

    Every item is a dict with ``input_ids`` and ``attention_mask`` (the
    flattened tokenizer outputs, truncated/padded to ``max_length``) and
    ``labels`` (the story's label vector as a float tensor).
    """

    def __init__(self, stories, labels, tokenizer, max_length=512):
        # Reject mismatched inputs before storing anything
        assert len(stories) == len(labels), "Stories and labels must have same length"

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stories = stories
        self.labels = labels

    def __len__(self):
        return len(self.stories)

    def __getitem__(self, idx):
        text = str(self.stories[idx])

        try:
            tokens = self.tokenizer(
                text,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            # Drop the leading batch dimension the tokenizer adds
            item = {key: tokens[key].flatten() for key in ('input_ids', 'attention_mask')}
            item['labels'] = torch.FloatTensor(self.labels[idx])
            return item
        except Exception as e:
            logger.error(f"Error processing story at index {idx}: {e}")
            # Fall back to an all-zero encoding so one bad story does not abort the batch
            fallback = {
                key: torch.zeros(self.max_length, dtype=torch.long)
                for key in ('input_ids', 'attention_mask')
            }
            fallback['labels'] = torch.FloatTensor(self.labels[idx])
            return fallback

def main(train_folder="/content/drive/MyDrive/User Stories Dataset/User Stories Train",
         test_folder="/content/drive/MyDrive/User Stories Dataset/User Stories Test"):
    """Run the whole pipeline and print a summary of the results.

    Args:
        train_folder: Folder with the training data; defaults to the Google
            Drive location used by this notebook.
        test_folder: Folder with the test data; same default convention.

    Returns:
        The ``EnhancedUserStoryPipeline`` instance when the run produced
        results, so later cells can read ``final_metrics`` and
        ``training_history`` from it. ``None`` when a folder is missing, the
        pipeline produced no results, or it raised (the error is logged and
        its traceback printed).
    """
    print("COMPREHENSIVE USER STORY QUANTIFICATION PIPELINE")
    print("=" * 80)
    print(f"Train folder: {train_folder}")
    print(f"Test folder: {test_folder}")
    print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Validate paths
    for label, folder in (("Train", train_folder), ("Test", test_folder)):
        if not os.path.exists(folder):
            logger.error(f"{label} folder not found: {folder}")
            print("Please mount Google Drive and check paths")
            return None

    # Initialize and run pipeline
    pipeline = EnhancedUserStoryPipeline(train_folder, test_folder)

    try:
        start_time = datetime.now()
        quantified_df, metrics = pipeline.run_comprehensive_pipeline()
        end_time = datetime.now()

        if quantified_df is None or metrics is None:
            print(" Pipeline failed - no results generated")
            return None

        print("\n" + "="*80)
        print(" PIPELINE COMPLETED SUCCESSFULLY!")
        print("="*80)

        # Display final metrics summary
        print("\n FINAL METRICS SUMMARY:")
        print("-" * 50)
        print(f"Macro F1-Score: {metrics['macro_metrics']['macro_f1']:.4f}")
        print(f"Macro Precision: {metrics['macro_metrics']['macro_precision']:.4f}")
        print(f"Macro Recall: {metrics['macro_metrics']['macro_recall']:.4f}")
        print(f"Macro Accuracy: {metrics['macro_metrics']['macro_accuracy']:.4f}")
        print(f"Macro AUC: {metrics['macro_metrics']['macro_auc']:.4f}")

        print(f"\nMicro F1-Score: {metrics['micro_metrics']['micro_f1']:.4f}")
        print(f"Micro Precision: {metrics['micro_metrics']['micro_precision']:.4f}")
        print(f"Micro Recall: {metrics['micro_metrics']['micro_recall']:.4f}")

        print(f"\nWeighted F1-Score: {metrics['weighted_metrics']['weighted_f1']:.4f}")

        # Top performing elements
        sorted_elements = sorted(metrics['element_metrics'].items(),
                                 key=lambda x: x[1]['f1'], reverse=True)

        print(f"\n TOP 5 PERFORMING ELEMENTS:")
        print("-" * 50)
        for i, (element, metric) in enumerate(sorted_elements[:5], 1):
            print(f"{i}. {element.replace('_', ' ').title()}: {metric['f1']:.4f}")

        # Execution time
        execution_time = end_time - start_time
        print(f"\n  Total Execution Time: {execution_time}")

        print("\n" + "="*80)
        return pipeline

    except Exception as e:
        logger.error(f"Pipeline execution failed: {e}")
        print(f" Pipeline failed with error: {e}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    # Keep the finished pipeline at notebook scope: later cells read
    # pipeline.final_metrics, which a local variable inside main() cannot provide.
    pipeline = main()



Downloading NLTK resources...
COMPREHENSIVE USER STORY QUANTIFICATION PIPELINE
Train folder: /content/drive/MyDrive/User Stories Dataset/User Stories Train
Test folder: /content/drive/MyDrive/User Stories Dataset/User Stories Test
Started at: 2025-08-04 18:54:01


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


COMPREHENSIVE USER STORY QUANTIFICATION EVALUATION REPORT
Generated: 2025-08-04 19:21:54

DATASET OVERVIEW:
- Training Stories: 21
- Test Stories Evaluated: 14
- Total Elements Evaluated: 14

OVERALL PERFORMANCE METRICS:
Macro Averages:
  • Precision: 0.6529
  • Recall: 0.5972
  • F1-Score: 0.6176
  • Accuracy: 0.9572
  • AUC: nan

Micro Averages:
  • Precision: 0.9572
  • Recall: 0.9572
  • F1-Score: 0.9572
  • Accuracy: 0.9572

Weighted Averages:
  • Precision: 0.9570
  • Recall: 0.9572
  • F1-Score: 0.9570

DETAILED PER-ELEMENT PERFORMANCE:
Task Identification:
  • Precision: 0.9536
  • Recall: 0.9585
  • F1-Score: 0.9561
  • Accuracy: 0.9735
  • AUC: 0.9876
  • Specificity: 0.9800
  • Sensitivity: 0.9585
  • Support: 193
  • Confusion Matrix: TP=185, FP=9, TN=440, FN=8

Task Nature:
  • Precision: 0.9836
  • Recall: 0.9449
  • F1-Score: 0.9639
  • Accuracy: 0.9720
  • AUC: 0.9951
  • Specificity: 0.9897
  • Sensitivity: 0.9449
  • Support: 254
  • Confusion Matrix: TP=240, FP=4, T

In [None]:
# Plot one confusion-matrix heatmap per element, rebuilt from the TN/FP/FN/TP
# counts stored in pipeline.final_metrics['element_metrics'].

# `pipeline` has to exist in the notebook namespace already. Look it up
# defensively so a fresh kernel gets an explanation instead of a NameError.
evaluated_pipeline = globals().get('pipeline')
element_metrics = (getattr(evaluated_pipeline, 'final_metrics', None) or {}).get('element_metrics', {})

print("Confusion Matrices for Each Element:")
print("=" * 40)

if not element_metrics:
    print("No element metrics found. Bind a pipeline that has finished "
          "run_comprehensive_pipeline() to the name `pipeline`, then re-run this cell.")

for element, metric in element_metrics.items():
    # Reconstruct the confusion matrix from the metrics dictionary
    tn = metric['true_negatives']
    fp = metric['false_positives']
    fn = metric['false_negatives']
    tp = metric['true_positives']

    cm = np.array([[tn, fp], [fn, tp]])
    element_title = element.replace('_', ' ').title()

    print(f"\n--- {element_title} ---")
    print(cm)

    # Visualize the confusion matrix on its own axes
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted Negative', 'Predicted Positive'],
                yticklabels=['Actual Negative', 'Actual Positive'],
                ax=ax)
    ax.set_title(f'Confusion Matrix for {element_title}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    plt.show()
    # Release the figure; one is created per element.
    plt.close(fig)

Confusion Matrices for Each Element:


NameError: name 'pipeline' is not defined