<a href="https://colab.research.google.com/github/siddhamapple/instruction_tuned_model/blob/main/run_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, DistilBertTokenizer, DistilBertModel
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import re
import os
import numpy as np
import time
import gdown
import logging
import networkx as nx

In [None]:
# Configure logging once for the notebook: INFO and above, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger named after this module; the pipeline code below logs through it.
logger = logging.getLogger(__name__)

In [None]:
# Download the NLTK data packages used by this notebook:
#   'punkt'     - tokenizer data (presumably for the imported sent_tokenize;
#                 verify it is still needed)
#   'stopwords' - word lists read by stopwords.words('english') in the
#                 keyword-extraction fallback
# quiet=True asks the downloader to suppress its progress output.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [None]:

# Disable parallelism in the tokenizers library up front so that it does not
# emit its parallelism warning later on.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
# Blog Classifier Model
class BlogClassifier(nn.Module):
    """DistilBERT encoder followed by a small feed-forward classification head.

    Args:
        pretrained_model_name: Model identifier or path handed to
            ``DistilBertModel.from_pretrained``.
        num_classes: Number of logits produced per input.
    """

    def __init__(self, pretrained_model_name="distilbert-base-uncased", num_classes=4):
        super().__init__()
        self.base_model = DistilBertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.1)

        # Read the encoder width from its config instead of hard-coding 768,
        # so a checkpoint with a different hidden size still gets a matching
        # head. For "distilbert-base-uncased" this is 768, which keeps the
        # layer shapes (and saved state dicts) unchanged.
        hidden_size = self.base_model.config.dim

        # Custom classification head with intermediate layer
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            The output of the classification head (no softmax is applied).
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the pooled
        # representation (presumably the [CLS] token - confirm with tokenizer).
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

In [None]:
class BlogAnalysisPipeline:
    def __init__(self):
        """Initialize the complete blog analysis pipeline with error handling."""
        logger.info("Initializing Blog Analysis Pipeline...")

        try:
            # Determine device with error handling
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            logger.info(f"Using device: {self.device}")

            # Create directories for models
            os.makedirs("models", exist_ok=True)

            # Download models from Google Drive
            self._download_models()

            # Initialize the blog classifier with error handling
            self.classifier, self.bert_tokenizer = self._load_classifier()
            self.categories = ['World', 'Sports', 'Business', 'Sci/Tech']

            # Initialize the instruction model with error handling
            self.tokenizer, self.model = self._load_instruction_model()

            # Initialize the mind map generator
            self.setup_mindmap_generator()

            logger.info("All components loaded successfully!")
        except Exception as e:
            logger.error(f"Error initializing pipeline: {str(e)}")
            raise RuntimeError(f"Failed to initialize pipeline: {str(e)}")

    def _download_models(self):
        """Download the model weight files from Google Drive if they are missing.

        A file that already exists at its target path is not downloaded again.

        Raises:
            RuntimeError: If a download raises, or finishes without leaving a
                file at the target path. The original error is chained.
        """
        logger.info("Downloading models from Google Drive...")

        # (log label, Google Drive file id, local target path)
        downloads = (
            ("blog classifier", "1sS5T8BxDcQa6vVjXNk4C3ZTo-8VLyLrJ", "./blog_classifier_model.pt"),
            ("instruction", "1_1ssYCWkpyvH4w0EAP44LNkncXxCda21", "./model.pth"),
        )

        try:
            for label, file_id, target_path in downloads:
                if os.path.exists(target_path):
                    continue

                logger.info(f"Downloading {label} model...")
                gdown.download(f"https://drive.google.com/uc?id={file_id}", target_path, quiet=False)

                # Depending on the gdown version a failed download may be
                # reported without an exception, so check the result here
                # instead of failing later with a less specific error.
                if not os.path.exists(target_path):
                    raise FileNotFoundError(
                        f"Download of {label} model did not produce {target_path}"
                    )
        except Exception as e:
            logger.error(f"Error downloading models: {str(e)}")
            raise RuntimeError(f"Failed to download models: {str(e)}") from e

    def _load_classifier(self):
        """Load the blog classifier and its tokenizer.

        Returns:
            tuple: ``(classifier, bert_tokenizer)`` - the ``BlogClassifier``
            moved to ``self.device`` and switched to eval mode, and the
            ``distilbert-base-uncased`` tokenizer.

        Raises:
            RuntimeError: If the weights file is missing or loading fails.
                The original error is chained.
        """
        try:
            logger.info("Loading blog classifier model...")

            # Check for the weights first so a missing file fails before the
            # tokenizer and base model are fetched.
            model_path = "./blog_classifier_model.pt"
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Could not find classifier model at {model_path}")

            # Initialize tokenizer
            bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

            # Initialize model
            classifier = BlogClassifier(num_classes=4).to(self.device)

            # SECURITY: torch.load unpickles the file. It was downloaded, so
            # restrict loading to tensors and plain containers
            # (weights_only=True) rather than arbitrary pickled objects.
            state_dict = torch.load(model_path, map_location=self.device, weights_only=True)
            classifier.load_state_dict(state_dict)
            classifier.eval()
            logger.info(f"Loaded classifier from {model_path}")

            return classifier, bert_tokenizer
        except Exception as e:
            logger.error(f"Error loading classifier: {str(e)}")
            raise RuntimeError(f"Failed to load classifier: {str(e)}") from e

    def _load_instruction_model(self):
        """Load the instruction model, falling back to the base ``distilgpt2``.

        The fine-tuned weights in ``./model.pth`` are loaded on top of the
        ``distilgpt2`` architecture. If the file is missing or loading fails,
        the error is logged and the unmodified pre-trained model is returned
        instead of raising.

        Returns:
            tuple: ``(tokenizer, model)`` with the model on ``self.device`` and
            in eval mode.
        """
        base_name = "distilgpt2"

        try:
            logger.info("Loading instruction model...")

            # Try to load from the downloaded file
            model_path = "./model.pth"
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Could not find instruction model at {model_path}")

            # Load the model using AutoModelForCausalLM
            tokenizer = AutoTokenizer.from_pretrained(base_name)
            model = AutoModelForCausalLM.from_pretrained(base_name)

            # SECURITY: torch.load unpickles the file. It was downloaded, so
            # restrict loading to tensors and plain containers
            # (weights_only=True) rather than arbitrary pickled objects.
            state_dict = torch.load(model_path, map_location=self.device, weights_only=True)
            model.load_state_dict(state_dict)
            model = model.to(self.device)
            model.eval()

            logger.info(f"Loaded instruction model from {model_path}")
            return tokenizer, model
        except Exception as e:
            logger.error(f"Error loading instruction model: {str(e)}")

            # Fallback to a pre-trained model if loading fails. Build fresh
            # objects rather than reusing ones from the failed attempt, whose
            # weights may have been partly overwritten.
            logger.info("Falling back to pre-trained model...")
            tokenizer = AutoTokenizer.from_pretrained(base_name)
            model = AutoModelForCausalLM.from_pretrained(base_name)
            model = model.to(self.device)
            # Keep the fallback consistent with the main path: inference mode
            model.eval()
            return tokenizer, model

    def setup_mindmap_generator(self):
        """Create the TF-IDF vectorizer used for keyword extraction.

        The vectorizer is built with English stop words and stored on
        ``self.vectorizer``.

        Raises:
            RuntimeError: If the vectorizer cannot be constructed.
        """
        try:
            logger.info("Setting up mind map generator...")
            english_tfidf = TfidfVectorizer(stop_words='english')
        except Exception as e:
            logger.error(f"Error setting up mind map generator: {e!s}")
            raise RuntimeError(f"Failed to set up mind map generator: {e!s}")
        else:
            self.vectorizer = english_tfidf

    def extract_keywords_tfidf(self, text, n=5):
        """Extract keywords from text using TF-IDF."""
        # Clean text
        text = re.sub(r'[^\w\s]', ' ', text.lower())

        try:
            # Fit and transform the text
            tfidf_matrix = self.vectorizer.fit_transform([text])
            feature_names = self.vectorizer.get_feature_names_out()

            # Get top keywords based on TF-IDF scores
            tfidf_scores = zip(feature_names, tfidf_matrix.toarray()[0])
            sorted_keywords = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)

            # Return top N keywords with scores
            return [(keyword, score) for keyword, score in sorted_keywords[:n]]
        except:
            # Fallback to simple word frequency if TF-IDF fails
            words = text.split()
            stop_words = set(stopwords.words('english'))
            word_freq = {}
            for word in words:
                if len(word) > 3 and word not in stop_words:
                    word_freq[word] = word_freq.get(word, 0) + 1

            sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
            return [(word, freq/len(words)) for word, freq in sorted_words[:n]]

    def classify_blog(self, blog_text):
        """Classify the blog post."""
        if not blog_text or len(blog_text.strip()) < 10:
            raise ValueError("Blog text is too short. Please provide a longer text.")

        try:
            # Tokenize the input text
            encoding = self.bert_tokenizer(
                blog_text,
                truncation=True,
                max_length=128,
                padding='max_length',
                return_tensors='pt'
            )

            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            # Get prediction
            with torch.no_grad():
                logits = self.classifier(input_ids, attention_mask)
                _, predicted = torch.max(logits, 1)
                pred_idx = predicted.item()

            # Return the predicted category
            return self.categories[pred_idx]
        except Exception as e:
            logger.error(f"Error classifying blog: {str(e)}")
            return f"Classification error: {str(e)}"

    def process_instruction(self, blog_text, instruction, max_length=512):
        """Process an instruction on the blog text."""
        if not blog_text or len(blog_text.strip()) < 10:
            raise ValueError("Blog text is too short. Please provide a longer text.")

        try:
            # Format input with instruction template
            prompt = f"""
Below is a blog post. {instruction}

Blog: {blog_text}

Response:
"""

            # Generate response
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Generate with parameters to avoid repetition
            output = self.model.generate(
                **inputs,
                max_length=len(inputs["input_ids"][0]) + max_length,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.5,
                no_repeat_ngram_size=3,
                pad_token_id=self.tokenizer.eos_token_id
            )

            response = self.tokenizer.decode(output[0], skip_special_tokens=True)

            # Extract just the response part
            response_text = response.split("Response:")[-1].strip()

            # Post-process to remove repetitions
            response_text = re.sub(r'(.{30,}?)\1+', r'\1', response_text)

            return response_text
        except Exception as e:
            logger.error(f"Error processing instruction: {str(e)}")
            return f"Processing error: {str(e)}"

    def generate_mindmap(self, text, central_topic, n=5):
        """Build a keyword graph with ``central_topic`` as its hub.

        Args:
            text: Text the keywords are extracted from.
            central_topic: Label of the hub node every keyword is linked to.
            n: Number of keywords requested from the extractor.

        Returns:
            tuple: ``(graph, keywords)`` where ``keywords`` is the list of
            ``(keyword, score)`` pairs returned by ``extract_keywords_tfidf``.
        """
        keywords = self.extract_keywords_tfidf(text, n)

        mindmap = nx.Graph()

        # Hub node first, in a vibrant orange
        mindmap.add_node(central_topic, size=30, color='#FF5733')

        # One spoke per keyword; a higher score gives a bigger, darker node
        for keyword, score in keywords:
            shade = min(0.9, 0.5 + score * 0.5)  # capped at 0.9
            mindmap.add_node(
                keyword,
                size=15 + 10 * score,
                color=plt.cm.Blues(shade),
                score=score,
            )
            mindmap.add_edge(central_topic, keyword, weight=score*5)

        return mindmap, keywords

    def visualize_mindmap(self, graph, keywords):
        """Draw the mind map graph on a new matplotlib figure.

        Args:
            graph: Graph whose first node is the central topic. Nodes may
                carry ``size`` and ``color`` attributes and edges a ``weight``
                attribute; defaults are used where they are missing.
            keywords: ``(keyword, score)`` pairs; the first three are listed
                in the legend box.

        Returns:
            The ``matplotlib.pyplot`` module, so the caller can call
            ``.show()`` on the returned object.
        """
        plt.figure(figsize=(10, 10))

        # Get central node (first node)
        central_node = list(graph.nodes())[0]

        pos = nx.spring_layout(graph, k=0.5, iterations=50, seed=42)

        # Force central node to center. spring_layout centres its result on
        # the origin by default, so the middle is (0, 0), not (0.5, 0.5).
        pos[central_node] = np.zeros(2)

        # Get node attributes (stored size is squared to get the marker area)
        node_sizes = [graph.nodes[node].get('size', 10)**2 for node in graph.nodes()]
        node_colors = [graph.nodes[node].get('color', 'lightblue') for node in graph.nodes()]

        # Get edge weights for width
        edge_weights = [graph[u][v].get('weight', 1) for u, v in graph.edges()]

        # Draw edges with varying width based on weight
        nx.draw_networkx_edges(
            graph, pos,
            width=[w/5 for w in edge_weights],
            alpha=0.7,
            edge_color='gray'
        )

        # Draw nodes
        nx.draw_networkx_nodes(
            graph, pos,
            node_size=node_sizes,
            node_color=node_colors,
            alpha=0.9
        )

        # Central node label: larger, bold font
        central_label = {central_node: central_node}
        nx.draw_networkx_labels(graph, pos, labels=central_label, font_size=16, font_weight='bold')

        # Keyword labels: one fixed, smaller font size
        keyword_labels = {node: node for node in graph.nodes() if node != central_node}
        nx.draw_networkx_labels(graph, pos, labels=keyword_labels, font_size=12)

        plt.axis('off')

        # Add a title
        plt.title(f"Keywords for {central_node}", fontsize=16, pad=20)

        # Add a legend showing top 3 keywords with scores
        legend_text = "Top Keywords:\n" + "\n".join([f"{kw} ({score:.3f})" for kw, score in keywords[:3]])
        plt.figtext(0.02, 0.02, legend_text, fontsize=12, bbox=dict(facecolor='white', alpha=0.8))

        # Run tight_layout last so the title is accounted for in the margins
        plt.tight_layout()

        return plt

    def process_blog(self, blog_text, instruction_type="summarize", n=3):
        """Process a blog post with the complete pipeline."""
        # Step 1: Classify the blog
        category = self.classify_blog(blog_text)
        print(f"Blog Category: {category}")

        # Step 2: Process the instruction
        if instruction_type == "summarize":
            instruction = f"Summarize this blog in {n} sentences."
            result = self.process_instruction(blog_text, instruction)
            return {
                "category": category,
                "summary": result
            }
        elif instruction_type == "keywords":
            # For keywords, use our TF-IDF extractor directly
            keywords_with_scores = self.extract_keywords_tfidf(blog_text, n)
            keywords = [kw for kw, _ in keywords_with_scores]

            # Also generate a mindmap visualization for keywords
            graph, kw_with_scores = self.generate_mindmap(blog_text, category, n)
            plt = self.visualize_mindmap(graph, kw_with_scores)

            return {
                "category": category,
                "keywords": keywords,
                "mind_map_plot": plt
            }
        elif instruction_type == "mindmap":
            # Generate a keyword-focused mindmap
            graph, keywords = self.generate_mindmap(blog_text, category, n)
            plt = self.visualize_mindmap(graph, keywords)

            return {
                "category": category,
                "keywords": [kw for kw, _ in keywords],
                "mind_map_plot": plt
            }
        elif instruction_type == "main_points":
            instruction = f"Extract the {n} main points from this blog."
            result = self.process_instruction(blog_text, instruction)
            return {
                "category": category,
                "result": result
            }
        else:
            # Custom instruction
            result = self.process_instruction(blog_text, instruction_type)
            return {
                "category": category,
                "result": result
            }

In [None]:
def create_blog_analyzer():
    """Run the interactive blog analyzer on standard input/output.

    Builds the pipeline, reads a blog post, prints its category, asks which
    analysis to run and how many items to produce, then prints the result.
    For the keyword and mind map options the plot is shown as well. Errors
    are printed rather than raised.
    """
    # Install gdown if not already installed. pip is run through subprocess
    # because the `!pip` shell escape is IPython-only syntax.
    # NOTE(review): this import is local to the function; the pipeline class
    # uses the module-level `gdown` import.
    try:
        import gdown  # noqa: F401
    except ImportError:
        import subprocess
        import sys
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "gdown"])
        import gdown  # noqa: F401

    # Create the pipeline
    try:
        pipeline = BlogAnalysisPipeline()
        print(" Blog analysis pipeline initialized successfully!")
    except Exception as e:
        print(f" Error initializing pipeline: {str(e)}")
        return

    # Get blog input
    print("\n Enter your blog post below:")
    blog_text = input()

    if len(blog_text.strip()) < 10:
        print(" Blog text is too short. Please provide a longer text.")
        return

    # Classify the blog
    # NOTE(review): process_blog classifies again and prints the category a
    # second time.
    try:
        category = pipeline.classify_blog(blog_text)
        print(f"\n Blog Category: {category}")
    except Exception as e:
        print(f" Error classifying blog: {str(e)}")
        return

    # Ask for instruction type
    print("\n What would you like to do with this blog?")
    print("1. Summarize")
    print("2. Extract keywords")
    print("3. Generate mindmap")
    print("4. Extract main points")
    print("5. Custom instruction")

    choice = input("Enter your choice (1-5): ")

    # Ask for count/length. n always has a value, even for choices that do
    # not prompt for it.
    n = 3
    if choice in ("1", "2", "3", "4"):
        count = input("How many items would you like? (default: 3): ")
        if count.strip():
            try:
                n = int(count)
                if n < 1:
                    # Zero or negative counts make no sense for any option
                    raise ValueError(count)
            except ValueError:
                n = 3
                print("Invalid number, using default of 3.")

    # Process based on choice
    try:
        if choice == "1":
            result = pipeline.process_blog(blog_text, "summarize", n)
            print(f"\n Summary:\n{result['summary']}")
        elif choice in ("2", "3"):
            instruction_type = "keywords" if choice == "2" else "mindmap"
            result = pipeline.process_blog(blog_text, instruction_type, n)
            print(f"\n Keywords: {', '.join(result['keywords'])}")
            # Display mindmap
            result['mind_map_plot'].show()
        elif choice == "4":
            result = pipeline.process_blog(blog_text, "main_points", n)
            print(f"\n Main Points:\n{result['result']}")
        elif choice == "5":
            custom_instruction = input("Enter your custom instruction: ")
            result = pipeline.process_blog(blog_text, custom_instruction)
            print(f"\n Result:\n{result['result']}")
        else:
            print(" Invalid choice.")
    except Exception as e:
        print(f" Error processing blog: {str(e)}")

In [None]:
# Run the interactive analyzer when this code is executed as the main program
# (not when it is imported as a module).
if __name__ == "__main__":
    create_blog_analyzer()


Downloading...
From (original): https://drive.google.com/uc?id=1sS5T8BxDcQa6vVjXNk4C3ZTo-8VLyLrJ
From (redirected): https://drive.google.com/uc?id=1sS5T8BxDcQa6vVjXNk4C3ZTo-8VLyLrJ&confirm=t&uuid=8fa80ab3-57f0-4d2e-9313-7f19c73f8f5e
To: /content/blog_classifier_model.pt
100%|██████████| 266M/266M [00:02<00:00, 92.7MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1_1ssYCWkpyvH4w0EAP44LNkncXxCda21
From (redirected): https://drive.google.com/uc?id=1_1ssYCWkpyvH4w0EAP44LNkncXxCda21&confirm=t&uuid=187a72f9-86d7-462c-97de-285d3a727d0e
To: /content/model.pth
100%|██████████| 328M/328M [00:04<00:00, 68.7MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optio

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

 Blog analysis pipeline initialized successfully!

 Enter your blog post below:
IoT stands for Internet of Things. It refers to the interconnectedness of physical devices, such as appliances and vehicles, that are embedded with software, sensors, and connectivity which enables these objects to connect and exchange data. This technology allows for the collection and sharing of data from a vast network of devices, creating opportunities for more efficient and automated systems.  Internet of Things (IoT) is the networking of physical objects that contain electronics embedded within their architecture in order to communicate and sense interactions amongst each other or with respect to the external environment. In the upcoming years, IoT-based technology will offer advanced levels of services and practically change the way people lead their daily lives. Advancements in medicine, power, gene therapies, agriculture, smart cities, and smart homes are just a few of the categorical examples wher

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.



 Summary:
Below is a blog post. Summarize this blog in 4 sentences.

Blog: IoT stands for Internet of Things. It refers to the interconnectedness of physical devices, such as appliances and vehicles, that are embedded with software, sensors, and connectivity which enables these objects to connect and exchange data. This technology allows for the collection and sharing of data from a vast network of devices, creating opportunities for more efficient and automated systems.  Internet of Things (IoT) is the networking of physical objects that contain electronics embedded within their architecture in order to communicate and sense interactions amongst each other or with respect to the external environment. In the upcoming years, IoT-based technology will offer advanced levels of services and practically change the way people lead their daily lives. Advancements in medicine, power, gene therapies, agriculture, smart cities, and smart homes are just a few of the categorical examples where Io