In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

In [12]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

def clean_text(text):
    """Basic text cleaning function"""
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s-]', '', text)  # Keep hyphens
    text = ' '.join(text.split())  # Remove extra spaces
    return text

def extract_key_phrases(text, common_terms):
    """Extract meaningful 2-4 word phrases"""
    words = [w for w in clean_text(text).split() 
             if w not in common_terms and len(w) > 2]
    
    # Create 2-4 word phrases
    phrases = []
    for i in range(len(words)-1):
        phrases.append(' '.join(words[i:i+2]))
        if i < len(words)-2:
            phrases.append(' '.join(words[i:i+3]))
        if i < len(words)-3:
            phrases.append(' '.join(words[i:i+4]))
    
    return phrases

def identify_module(text, common_terms, module_keywords):
    """Identify the most relevant module"""
    text = clean_text(text)
    
    # First check for exact module matches
    for module, keywords in module_keywords.items():
        if any(keyword in text for keyword in keywords):
            return module
    
    # Then check for partial matches
    for module, keywords in module_keywords.items():
        if any(keyword.split()[0] in text for keyword in keywords):
            return module
    
    return "Other"

def get_concise_context(row, common_terms, module_keywords):
    """Get 2-4 word context and module"""
    # Priority fields for context
    summary = str(row['Summary']) if pd.notna(row['Summary']) else ""
    parent_summary = str(row['Parent summary']) if 'Parent summary' in row and pd.notna(row['Parent summary']) else ""
    
    # Extract phrases from both fields
    summary_phrases = extract_key_phrases(summary, common_terms)
    parent_phrases = extract_key_phrases(parent_summary, common_terms)
    
    all_phrases = summary_phrases + parent_phrases
    
    if not all_phrases:
        return "Not clear", "Other"
    
    # Get most common phrase
    context = Counter(all_phrases).most_common(1)[0][0]
    
    # Identify module
    combined_text = f"{context} {parent_summary}".strip()
    module = identify_module(combined_text, common_terms, module_keywords)
    
    return context, module

def group_by_module(df):
    """Group issues by identified module"""
    # Create numeric groups for each module
    modules = df['Module'].unique()
    module_to_group = {module: i+1 for i, module in enumerate(modules)}
    df['Module_Group'] = df['Module'].map(module_to_group)
    return df

def process_jira_file(input_path, output_path):
    """Main processing function"""
    # Common terms that don't represent meaningful context
    common_terms = {
        'the', 'and', 'for', 'with', 'this', 'that', 'issue', 'bug', 
        'fix', 'error', 'problem', 'request', 'ticket', 'work', 'task',
        'story', 'be', 'fe', 'web', 'sms', 'email', 'high', 'open', 'uat',
        'configuration', 'handling', 'check', 'flow', 'terms', 'conditions'
    }
    
    # Define known modules and their keywords
    module_keywords = {
        "Onboarding": ["onboard", "kyc", "registration", "signup"],
        "Trade": ["trade", "import", "export", "lc", "letter of credit"],
        "Trade & Remittance": ["remittance", "inward", "outward", "forex"],
        "Payments": ["payment", "settle", "invoice", "collection"],
        "Martech": ["martech", "moengage", "events", "integration"],
        "IRM": ["irm", "stp", "scrutiny", "purpose code"],
        "Indie Business": ["indie", "business", "ifb", "sms", "email"]
    }
    
    # Read Excel file
    try:
        df = pd.read_excel(input_path)
    except Exception as e:
        print(f"Error reading file: {e}")
        return
    
    # Verify required columns exist
    if 'Summary' not in df.columns:
        print("Error: 'Summary' column is required")
        return
    
    # Add Context and Module columns
    df['Context'], df['Module'] = zip(*df.apply(
        lambda row: get_concise_context(row, common_terms, module_keywords), axis=1))
    
    # Group by module
    df = group_by_module(df)
    
    # Sort by module group and context
    df_sorted = df.sort_values(['Module_Group', 'Context'])
    
    # Save results
    try:
        df_sorted.to_excel(output_path, index=False)
        print(f"Success! Processed data saved to {output_path}")
        print(f"Total issues processed: {len(df)}")
        print(f"Modules identified: {', '.join(df['Module'].unique())}")
    except Exception as e:
        print(f"Error saving results: {e}")


In [13]:
# Example usage
if __name__ == "__main__":
    input_file = r"c:\Users\HomeSmiles\Downloads\IFB-MSME-Sprint.xlsx"  # Your input file
    output_file = r"c:\Users\HomeSmiles\Downloads\grouped_jira_issues.xlsx"  # Output file
    process_jira_file(input_file, output_file)

Success! Processed data saved to c:\Users\HomeSmiles\Downloads\grouped_jira_issues.xlsx
Total issues processed: 259
Modules identified: Payments, Other, Martech, Indie Business, IRM, Onboarding, Trade, Trade & Remittance


### NLP

In [1]:
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from textblob import TextBlob

In [4]:
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

# Load English language model
try:
    nlp = spacy.load('en_core_web_md')
except:
    import subprocess
    subprocess.run(['python', '-m', 'spacy', 'download', 'en_core_web_md'])
    nlp = spacy.load('en_core_web_md')

class JIRAAnalyzer:
    def __init__(self):
        self.common_terms = {
            'the', 'and', 'for', 'with', 'this', 'that', 'issue', 'bug',
            'fix', 'error', 'problem', 'request', 'ticket', 'work', 'task',
            'story', 'be', 'fe', 'web', 'high', 'open', 'uat', 'system'
        }
        
        # Domain-agnostic module template (customize as needed)
        self.module_keywords = {
            "Core Module 1": ["primary function 1", "system component 1"],
            "Core Module 2": ["primary function 2", "system component 2"],
            "Integration": ["api integration", "system connection"],
            "Security": ["access control", "authentication"],
            "UI/UX": ["user interface", "screen design"]
        }

    def set_domain(self, domain_name, module_specs):
        """Configure for specific domain (e.g., banking, e-commerce)"""
        self.domain = domain_name
        self.module_keywords = module_specs
        
        # Add domain-specific common terms
        if domain_name.lower() == "banking":
            self.common_terms.update({
                'payment', 'transaction', 'account', 'customer', 'bank'
            })
        elif domain_name.lower() == "ecommerce":
            self.common_terms.update({
                'product', 'cart', 'checkout', 'inventory', 'order'
            })

    def clean_text(self, text):
        """Domain-agnostic text cleaning"""
        if pd.isna(text):
            return ""
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s-]', '', text)
        return ' '.join(text.split())

    def extract_phrases(self, text):
        """Extract 2-4 word phrases with NLP"""
        doc = nlp(text)
        phrases = set()
        
        # Extract noun chunks (most descriptive)
        for chunk in doc.noun_chunks:
            words = [w.text for w in chunk if not w.is_stop and len(w.text) > 2]
            if 2 <= len(words) <= 4:
                phrases.add(' '.join(words))
        
        # Fallback to important entities
        if not phrases:
            for ent in doc.ents:
                if 2 <= len(ent.text.split()) <= 4:
                    phrases.add(ent.text.lower())
        
        # Final fallback to key words
        if not phrases:
            words = [token.text for token in doc 
                    if not token.is_stop and len(token.text) > 2]
            for i in range(len(words)-1):
                if i+3 < len(words):
                    phrases.add(' '.join(words[i:i+4]))
                elif i+2 < len(words):
                    phrases.add(' '.join(words[i:i+3]))
                else:
                    phrases.add(' '.join(words[i:i+2]))
        
        return [p for p in phrases if 2 <= len(p.split()) <= 4]

    def identify_module(self, text):
        """Flexible module identification"""
        doc = nlp(text)
        best_match = ("Other", 0)
        
        for module, keywords in self.module_keywords.items():
            for keyword in keywords:
                similarity = doc.similarity(nlp(keyword))
                if similarity > best_match[1]:
                    best_match = (module, similarity)
        
        return best_match[0] if best_match[1] > 0.7 else "Other"

    def analyze_row(self, row):
        """Process single JIRA row"""
        summary = self.clean_text(row['Summary'])
        parent = self.clean_text(row.get('Parent Summary', ''))
        
        # Extract context (guaranteed 2-4 words)
        phrases = self.extract_phrases(f"{summary} {parent}")
        context = max(phrases, key=len) if phrases else "Not clear"
        
        # Identify module
        module = self.identify_module(f"{context} {parent}")
        
        return context[:100], module  # Ensure reasonable length

    def process_file(self, input_path, output_path):
        """End-to-end processing"""
        try:
            df = pd.read_excel(input_path)
            
            # Add analysis columns
            df['Context'], df['Module'] = zip(*df.apply(
                lambda row: self.analyze_row(row), axis=1))
            
            # Group similar items
            self._group_similar_issues(df)
            
            # Save results
            df.to_excel(output_path, index=False)
            print(f"Analysis complete. Results saved to {output_path}")
            self._print_summary(df)
            
        except Exception as e:
            print(f"Error: {str(e)}")

    def _group_similar_issues(self, df):
        """Group issues by context similarity"""
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf = vectorizer.fit_transform(df['Context'])
        
        df['Similarity_Group'] = 0
        group_id = 1
        
        for i in range(len(df)):
            if df.at[i, 'Similarity_Group'] == 0:
                df.at[i, 'Similarity_Group'] = group_id
                for j in range(i+1, len(df)):
                    if cosine_similarity(tfidf[i], tfidf[j])[0][0] > 0.65:
                        df.at[j, 'Similarity_Group'] = group_id
                group_id += 1

    def _print_summary(self, df):
        """Print analysis summary"""
        print("\n=== Analysis Summary ===")
        print(f"Total Issues: {len(df)}")
        print("\nModule Distribution:")
        print(df['Module'].value_counts())
        
        if "Other" in df['Module'].values:
            print("\nSample 'Other' Contexts:")
            print(df[df['Module'] == "Other"]['Context'].head(3).to_string(index=False))

# Example configuration for banking domain
banking_modules = {
    "Positive Pay": ["check fraud prevention", "payee verification"],
    "Loan Processing": ["loan approval", "credit assessment"],
    "Digital Banking": ["mobile app", "online banking"],
    "Compliance": ["aml screening", "kyc verification"]
}



In [7]:
# Usage Example:
if __name__ == "__main__":
    analyzer = JIRAAnalyzer()
    
    # Configure for banking domain (Positive Pay is just one module)
    analyzer.set_domain("banking", banking_modules)
    
    # Process files
    analyzer.process_file(
        input_path = r"c:\Users\HomeSmiles\Downloads\IFB-MSME-Sprint.xlsx",
        output_path = r"c:\Users\HomeSmiles\Downloads\jira_nlp_enhanced.xlsx"
    )

  similarity = doc.similarity(nlp(keyword))


Analysis complete. Results saved to c:\Users\HomeSmiles\Downloads\jira_nlp_enhanced.xlsx

=== Analysis Summary ===
Total Issues: 259

Module Distribution:
Module
Other              212
Positive Pay        31
Digital Banking     16
Name: count, dtype: int64

Sample 'Other' Contexts:
       sorted order
addmanage bene list
   trade remittance
