In [1]:
import sounddevice as sd
import numpy as np
import queue
import threading
import io
from pydub import AudioSegment
from pydub.effects import normalize

# Shared recorder state, read and written by the functions in this cell.
audio_frames = []    # every captured chunk, in arrival order
recording = False    # True while a capture is in progress
q = queue.Queue()    # also receives a copy of every captured chunk


def audio_callback(indata, frames, time, status):
    """Store one incoming chunk of audio.

    Used as the ``callback`` of the input stream opened by start_recording().

    Args:
        indata: Array holding the newly captured samples.
        frames: Number of frames in ``indata`` (unused here).
        time: Timing information supplied by the stream (unused here).
        status: Stream status flags; printed when truthy.
    """
    if status:
        print(status)
    # Each sink gets its own independent copy of the chunk, never `indata` itself.
    for store in (q.put, audio_frames.append):
        store(indata.copy())

# Function to start recording
def start_recording(samplerate=44100, channels=1):  # Changed to mono
    """Start capturing microphone audio on a background daemon thread.

    Args:
        samplerate: Sample rate in Hz passed to ``sd.InputStream``.
        channels: Number of input channels passed to ``sd.InputStream``.

    Returns:
        The ``threading.Thread`` that keeps the input stream open. If a
        recording is already in progress, that recording's thread is returned
        and no second stream is opened.
    """
    global recording, audio_frames

    previous = getattr(start_recording, "_active_thread", None)
    if previous is not None and previous.is_alive():
        if recording:
            # A second stream would append to the same audio_frames list and
            # this call would wipe what has been captured so far.
            print("Already recording. Use stop_recording() to stop.")
            return previous
        # The flag was cleared but the old thread has not noticed yet (it polls
        # every 100 ms). Wait for it, otherwise raising the flag again below
        # would keep the old stream alive next to the new one.
        previous.join(timeout=2)

    # Clear previous recording data
    audio_frames = []

    # Set recording flag
    recording = True

    def record_thread():
        try:
            # Open the input stream; audio_callback receives every chunk
            with sd.InputStream(samplerate=samplerate, channels=channels,
                                dtype='float32', callback=audio_callback):
                print("Recording started. Use stop_recording() to stop.")

                # Keep the stream open until recording is set to False
                while recording:
                    sd.sleep(100)  # Sleep to reduce CPU usage
        except Exception as e:
            print(f"Error in recording: {e}")

    # Start the recording thread
    thread = threading.Thread(target=record_thread)
    thread.daemon = True
    thread.start()

    # Remember the thread so a later call can detect an active recording
    start_recording._active_thread = thread

    return thread

# Function to stop recording and save the audio as MP3
def stop_recording(filename="recording.mp3", samplerate=44100):
    """Stop the active recording, enhance the audio and export it as MP3.

    Args:
        filename: Destination path of the MP3 file.
        samplerate: Frame rate in Hz given to the AudioSegment; it should be
            the rate the recording was started with.

    Returns:
        ``filename`` when a file was exported, otherwise ``None`` (nothing was
        being recorded, or no audio frames were captured).
    """
    global recording

    if not recording:
        print("Not currently recording.")
        return None

    # Stop the recording
    recording = False

    # Wait for any remaining audio data
    print("Stopping recording...")
    sd.sleep(50)  # Give time for the last audio chunks to be processed

    # Work on a snapshot so chunks appended after this point cannot change the
    # data while it is being processed.
    frames = list(audio_frames)
    if not frames:
        print("No audio recorded")
        return None

    # Concatenate all recorded frames
    recorded_data = np.concatenate(frames)

    # Peak-normalize. A completely silent recording has a peak of 0; dividing
    # by it would turn every sample into NaN, so it is left untouched.
    peak = np.max(np.abs(recorded_data))
    if peak > 0:
        recorded_data = recorded_data / peak

    # Convert float32 (-1 to 1) to int16 (-32768 to 32767)
    recorded_data = (recorded_data * 32767).astype(np.int16)

    # Take the channel count from the data: frames of shape (n, channels)
    # serialize as interleaved samples, so stereo must not be declared mono.
    channel_count = recorded_data.shape[1] if recorded_data.ndim > 1 else 1

    # Create AudioSegment
    audio = AudioSegment(
        recorded_data.tobytes(),
        frame_rate=samplerate,
        sample_width=2,  # 16-bit
        channels=channel_count
    )

    # Apply audio enhancements
    audio = audio.compress_dynamic_range()  # Compress dynamic range
    audio = normalize(audio)  # Normalize volume
    audio = audio.high_pass_filter(80)  # Remove low frequency noise
    audio = audio.low_pass_filter(10000)  # Remove high frequency noise

    # Export as MP3
    audio.export(filename, format="mp3", bitrate="128k")
    print(f"Enhanced MP3 file saved to {filename}")

    return filename

# Example usage: record until the user presses Enter, then save the MP3.
print("Starting recording...")
record_thread = start_recording()

try:
    # Block until the user presses Enter
    input("Press Enter to stop recording...\n")
finally:
    # Stop and save even if the wait is interrupted, so the background
    # recorder is never left running.
    mp3_file = stop_recording()

Starting recording. Press Ctrl+C to stop...
Recording started. Use stop_recording() to stop.
Stopping recording...
Enhanced MP3 file saved to recording.mp3


In [3]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa

# Sample rate (Hz) used both for loading the audio and for feature extraction
WHISPER_SAMPLE_RATE = 16000

# Load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.config.forced_decoder_ids = None

# Load the audio file. A path relative to the working directory (the default
# filename stop_recording() writes) keeps the cell runnable on other machines.
input_speech_path = "recording.mp3"
# librosa resamples to `sr` while loading, so the second return value is the
# requested 16000 Hz, not the file's native rate (the name is kept as is).
audio_array, original_sampling_rate = librosa.load(input_speech_path, sr=WHISPER_SAMPLE_RATE)

# Turn the waveform into the model's input features
model_inputs = processor(audio_array, sampling_rate=WHISPER_SAMPLE_RATE, return_tensors="pt")
input_features = model_inputs.input_features

# Generate token ids
predicted_ids = model.generate(input_features)

# Decode the predicted ids to get the transcription
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

[" A lot of people are turned off by learning. And if you don't find learning one of the most exciting things in the world, then the whole game, everything else will fall apart. But it has to be something that excites you. I've said to many people that live in a world where knowledge and skills are the most important thing. That's the goal that you are after."]


# Pipeline

In [1]:
import sys
import os
from pathlib import Path

# Directory the kernel is running in (normally the notebook's own folder)
notebook_path = Path().resolve()

# Project root = the nearest directory, starting at the notebook folder and
# walking upwards, that contains the `components` package. This works wherever
# the notebook lives; two levels up is kept only as the fallback.
project_root = next(
    (
        candidate
        for candidate in (notebook_path, *notebook_path.parents)
        if (candidate / "components").is_dir()
    ),
    notebook_path.parent.parent,
)

# Add the project root to the import path once, even when the cell is re-run
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Now import your components
from components.audio_recorder.recorder import AudioRecorder
from components.whisper.whisper_small import AudioTranscriptor

In [2]:
project_root

PosixPath('/home/prasun/Desktop/ADHYAYAN_MITRA')

In [3]:
notebook_path

PosixPath('/home/prasun/Desktop/ADHYAYAN_MITRA/components/transcripter_api')

In [None]:
import sys
import os
from pathlib import Path

# Directory the kernel is running in (normally the notebook's own folder)
notebook_path = Path().resolve()

# Project root = the nearest directory, starting at the notebook folder and
# walking upwards, that contains the `components` package. This works wherever
# the notebook lives; one level up is kept only as the fallback.
project_root = next(
    (
        candidate
        for candidate in (notebook_path, *notebook_path.parents)
        if (candidate / "components").is_dir()
    ),
    notebook_path.parent,
)

# Add the project root to the import path once, even when the cell is re-run
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Now import your components
from components.audio_recorder.recorder import AudioRecorder
from components.whisper.whisper_small import AudioTranscriptor

recording = input("""
Enter Yes -> If you need to record the audio and then transcribe it.
Enter No -> If you want to transcribe the pre-recorded audio. 
""")

# Always bind audio_path so later cells can test it instead of hitting a
# NameError when nothing was recorded or found.
audio_path = None

# Accept the answer regardless of case or surrounding whitespace
choice = recording.strip().lower()

if choice in ("yes", "y"):
    # Initialize components
    recorder = AudioRecorder()

    # Start recording
    print("Starting recording...")
    recorder.start_recording()

    # Recording continues until the user presses Enter
    input("Press Enter to stop recording...\n")
    # Stop and save
    audio_path = recorder.stop_recording()
elif choice in ("no", "n"):
    # Reuse an existing recording from the working directory
    if os.path.isfile("recording.mp3"):
        audio_path = os.path.abspath("recording.mp3")
    else:
        print("Recording needs to be done")
else:
    print(f"Unrecognized choice {recording!r}; expected Yes or No.")

In [14]:
audio_path

'/home/prasun/Desktop/ADHYAYAN_MITRA/components/transcripter_api/recording.mp3'

In [2]:
# Transcribe the selected recording; without a usable path there is nothing to do.
if not audio_path:
    print("Recording failed")
else:
    transcriptor = AudioTranscriptor()
    print(f"Saved recording to: {audio_path}")
    print("Transcribing...")
    transcript = transcriptor.transcribe(audio_path)

Saved recording to: /home/prasun/Desktop/ADHYAYAN_MITRA/testing/recording.mp3
Transcribing...


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [3]:
transcript

" I've said to many people that live in a world where knowledge and skills are the most important thing. That's the goal that you are after. More than money, you want to be able to do it.  to develop knowledge and skills. And so this person asks, what would be the three or so skills that are the most essential moving forward in the 21st century? And it's not like specific skills that I'm going to talk about like coding or mastery AI or going to some business strategy. Because I think that's not really what the spirit of the question is. In general, there are certain kind of personal skills that you want to develop.  So number one, and this is something that I have in my book, Mastery, you must see being social and getting along with people and cooperating knowing how to work with people as a skill It's not something you were born with. It's not something that some people are good but bad or bad at like anything It is a skill that you develop Being social and knowing how to work with pe

In [16]:
# Specify the file name
file_name = "transcript.txt"

# Write as UTF-8 explicitly: the platform default encoding may be unable to
# represent non-ASCII characters in the transcript.
with open(file_name, "w", encoding="utf-8") as file:
    # Write content to the file
    file.write(transcript)

print(f"File '{file_name}' has been created and written successfully.")

File 'transcript.txt' has been created and written successfully.


In [6]:
# Decode explicitly as UTF-8 instead of the platform default encoding.
with open('transcript.txt', 'r', encoding='utf-8') as file:
    print(file.read())

 I've said to many people that live in a world where knowledge and skills are the most important thing. That's the goal that you are after. More than money, you want to be able to do it.  to develop knowledge and skills. And so this person asks, what would be the three or so skills that are the most essential moving forward in the 21st century? And it's not like specific skills that I'm going to talk about like coding or mastery AI or going to some business strategy. Because I think that's not really what the spirit of the question is. In general, there are certain kind of personal skills that you want to develop.  So number one, and this is something that I have in my book, Mastery, you must see being social and getting along with people and cooperating knowing how to work with people as a skill It's not something you were born with. It's not something that some people are good but bad or bad at like anything It is a skill that you develop Being social and knowing how to work with peo

# Pipeline for all-in-one document parsing to Markdown (.md)

In [None]:
# from docling.document_converter import DocumentConverter
# from pathlib import Path

# # Define the directories
# sources_dir = Path("Docs/")
# save_dir = Path("Docs/save")

# # Create save directory if it doesn't exist
# save_dir.mkdir(parents=True, exist_ok=True)

# # Collect source links or file paths from user input
# source_links = []
# while True:
#     new_link = input("Enter link or file path (type 'done' to finish): ")
#     if new_link.lower() == 'done':
#         break
#     source_links.append(new_link)

# # Combine files from the directory and user-provided links
# all_sources = []

# # Add user-provided links or file paths
# for link in source_links:
#     all_sources.append(link)  # Convert user input to Path objects

# # Process all collected sources
# for i, source in enumerate(all_sources):
#     try:
#         converter = DocumentConverter()
#         result = converter.convert(source)  # Convert the file/link
#         text = result.document.export_to_markdown()

#         # Save the markdown output to a file
#         output_path = save_dir / f"converted_{i}.md"
#         with open(output_path, "w", encoding="utf-8") as file:
#             file.write(text)

#         print(f"Converted {source} to {output_path}")
#     except Exception as e:
#         print(f"Error processing {source}: {e}")


Error processing : [Errno 21] Is a directory: '.'
Error processing : [Errno 21] Is a directory: '.'
Error processing : [Errno 21] Is a directory: '.'
Error processing : [Errno 21] Is a directory: '.'


# Doc pipeline

In [1]:
from docling.document_converter import DocumentConverter
from pathlib import Path

# Accept paths pasted with surrounding whitespace or quotes, or a leading "~"
raw_path = input("Enter the path to the document(format PDF, MD, DOCX)").strip().strip("\"'")
path = Path(raw_path).expanduser()

# Reset first so a failed run cannot leave an earlier run's text behind for
# the cells below to pick up silently.
text = None

if path.is_file() and path.suffix.lower() in ['.pdf', '.md', '.docx']:
    converter = DocumentConverter()
    result = converter.convert(path)
    text = result.document.export_to_markdown()
elif not path.is_file():
    # Report a missing file separately from an unsupported format
    print(f"File not found: {path}")
else:
    print("The file needs to be PDF, DOCX or MD format.")

In [2]:
print(text)

# Comprehensive Report on System Design

System design is the process of defining the architecture, components, interfaces, and data models for building software systems that meet specific requirements efficiently and effectively. It transforms user requirements into a structured blueprint that guides the development of reliable, scalable, and maintainable software solutions[1].

## Definition and Fundamentals

System design encompasses planning and structuring complex systems to fulfill both functional and non-functional requirements. It involves making crucial decisions about how different components will interact to achieve desired functionality[2]. A well-designed system aims to be:

- **Reliable**: Handles faults, failures, and errors gracefully

- **Effective**: Meets all user needs and business requirements

- **Maintainable**: Remains flexible and easy to scale or modify[9]

## Key System Design Principles

### SOLID Principles

The SOLID principles provide fundamental guidelin

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer for Qwen2.5-0.5B-Instruct
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

def count_tokens(input_text):
    """Return how many tokens the module-level ``tokenizer`` produces for a text.

    Args:
        input_text: A single string to tokenize.

    Returns:
        The number of token ids produced for ``input_text``.
    """
    # Without `return_tensors` the tokenizer returns a plain list of ids, so
    # counting needs no PyTorch tensor and no batch dimension to index away.
    token_ids = tokenizer(input_text)["input_ids"]
    return len(token_ids)

print(f"Number of tokens: {count_tokens(text)}")

Number of tokens: 2407


In [4]:
ex_text = """# Comprehensive Report on Land Resources in India

India faces unique challenges in managing its land resources due to its large population, geographic diversity, and increasing development demands. This report examines the current status, utilization patterns, challenges, and management strategies for India's land resources.

## Current Land Resource Status and Classification

India possesses a total geographical area of approximately 328.73 million hectares[1]. The land resources are classified into several distinct categories that reflect their usage and potential:

**Forest Land**: Covering 713,789 square kilometers or 21.72% of India's geographical area. Recent assessments show an increase of 1,540 square kilometers in forest cover and 721 square kilometers in tree cover compared to previous assessments[5].

**Agricultural Land**: Constitutes approximately 46% of the total geographical area as net sown area[1]. This includes:
- Arable land concentrated in the Gangetic plains, Punjab, Haryana, and parts of Maharashtra and Karnataka[4]
- Current fallow land (temporarily not cultivated during the current agricultural year)
- Other fallow land (left uncultivated for 1-5 years)[3]

**Pastures and Grazing Land**: Areas designated for livestock grazing, including permanent pastures and meadows[3].

**Land Under Miscellaneous Tree Crops**: Areas with tree crops not classified as forests, including orchards and plantations[3].

**Culturable Waste Land**: Land potentially suitable for cultivation but currently unused[3].

**Barren and Unculturable Land**: Areas that cannot be brought under cultivation due to topography, climate, or soil conditions, comprising about 8.5% of the total geographical area[1].

**Land Under Non-Agricultural Uses**: Areas used for settlements, infrastructure, industries, and other development, accounting for approximately 5.5% of the total land area[1]. This category has increased from 17 million hectares to 27 million hectares over the years[7].

## Land Utilization Trends

The land utilization pattern in India has undergone significant changes over time:

1. **Agricultural Land**: The net sown area has remained relatively constant at around 140 million hectares since the 1950s, though the gross cropped area has increased from 166 million hectares in 1971-72 to 197 million hectares in 2015-16, indicating intensification of agriculture[7].

2. **Forest Cover**: Total forest and tree cover has increased by 2,261 square kilometers compared to previous assessments, demonstrating successful afforestation efforts[5].

3. **Non-Agricultural Uses**: Land under non-agricultural uses has been steadily increasing due to urbanization, infrastructure development, and economic growth[1][7].

4. **Barren Land**: Barren and unculturable land has decreased from 28 million hectares in 1971-72 to 17 million hectares in 2015-16, reflecting land reclamation efforts[7].

5. **Cropping Intensity**: Has increased from 118% to 140% between 1971-72 and 2015-16, indicating more intensive use of available agricultural land[7].

## Land Degradation: A Critical Challenge

One of the most pressing issues affecting India's land resources is degradation:

1. **Extent**: Approximately 30% of India's total geographical area (about 115-120 million hectares) is affected by various forms of land degradation and desertification[10][11][13].

2. **Types of Degradation**:
   - Water erosion (11.01% of India's land)
   - Vegetation degradation (9.15%)
   - Wind erosion (5.46%)
   - Soil salinity and alkalinity
   - Waterlogging[9][10]

3. **Regional Distribution**: Eight states—Rajasthan, Delhi, Goa, Maharashtra, Jharkhand, Nagaland, Tripura, and Himachal Pradesh—have 40-70% of their land affected by desertification. Rajasthan alone has over 21 million hectares classified as degraded[10][13].

4. **Temporal Trends**: 26 out of 29 Indian states have reported an increase in areas undergoing desertification over the past decade[13].

5. **Wastelands**: The Wastelands Atlas of India (2019) identified 557,665 square kilometers (16.96% of total geographical area) as wastelands in 2015-16, showing a slight decrease from 566,070 square kilometers (17.22%) in 2008-09[6].

## Key Challenges in Land Resource Management

India faces multiple challenges in managing its land resources effectively:

1. **Population Pressure**: With only 2.4% of the world's geographical area supporting over 17% of the global population, India faces acute land scarcity[12]. Per capita land availability is projected to decrease from 0.32 hectares in 2001 to 0.19 hectares by 2050[7].

2. **Agricultural Productivity**: Low crop yields compared to international standards, particularly in rainfed areas growing coarse cereals, pulses, and oilseeds[9].

3. **Irrigation Dependency**: Only about 33% of cultivated area is irrigated, making the rest dependent on erratic monsoon rainfall[9].

4. **Land Degradation**: Extensive degradation due to water and wind erosion, soil salinity, alkalinity, and waterlogging affects agricultural productivity[9][10].

5. **Climate Change Impacts**: Increasing vulnerability to floods, droughts, and temperature variations affecting land productivity and increasing degradation[12].

6. **Sectoral Approach**: Fragmented land management practices with different government departments following their own approaches[8][12].

7. **Competing Land Demands**: Growing competition between agriculture and other land-based sectors (urbanization, industrialization, infrastructure) creating conflicts and escalating land prices[12].

## Government Initiatives and Management Strategies

Several initiatives have been implemented to address land resource challenges:

1. **Watershed Development**: The Government of India approved the continuation of the Watershed Development Component of Pradhan Mantri Krishi Sinchayee Yojana (WDC-PMKSY) for 2021-2026 with a physical target of 4.95 million hectares and an indicative financial outlay of ₹8,134 crore[2].

2. **Wasteland Reclamation**: Between 2008-09 and 2015-16, 14,536 square kilometers of wasteland was converted to non-wastelands, resulting in a net reduction of 8,404 square kilometers of wasteland[6].

3. **Remote Sensing and Mapping**: The National Remote Sensing Agency (NRSA) and Space Application Centre (SAC) have been mapping wastelands and monitoring land degradation using satellite imagery[1][10].

4. **Soil Health Initiatives**: Programs like the Soil Health Card Scheme to help farmers better manage their arable fields and ensure improved yields[4].

5. **Forest Conservation**: Efforts to increase forest cover have shown positive results with an increase of 2,261 square kilometers in total forest and tree cover at the national level[5].

## Recommendations for Sustainable Land Management

Based on the challenges identified, several approaches can enhance sustainable land management in India:

1. **Multi-Stakeholder Platforms**: Establish platforms at district and sub-district levels to bring together farmers, policymakers, and other stakeholders for collaborative land management[8].

2. **Landscape Approach**: Adopt a comprehensive landscape approach to assess land potential and make informed decisions about land allocation and use[8].

3. **Climate-Smart Practices**: Incorporate climate-resilient strategies in land management to address climate change impacts and enhance agricultural production[8].

4. **Integrated Land Management**: Move away from sectoral approaches toward integrated land management involving all relevant departments and stakeholders[12].

5. **Reclamation of Degraded Lands**: Intensify efforts to reclaim degraded and wasteland through appropriate soil conservation measures, afforestation, and sustainable land use practices[1][6].

6. **Technology Integration**: Utilize remote sensing, GIS, and digital technology for better land resource mapping, monitoring, and management[10].

7. **Policy Reforms**: Develop comprehensive land use policies that balance the demands of various sectors while ensuring environmental sustainability[12].

India's land resources face significant pressures, but with appropriate management strategies and policy interventions, their sustainable utilization can be ensured for future generations while meeting current development needs.
"""

ex_summarized_text = """# Comprehensive Report on Land Resources in India

India faces significant challenges in managing its land resources due to population pressure, geographic diversity, and development demands. This report summarizes the current status, utilization patterns, challenges, and management strategies for India's land resources.

## Current Land Resource Status and Classification

India has a total geographical area of approximately 328.73 million hectares, classified into:

**Forest Land**: Covers 713,789 square kilometers (21.72% of India's area), with recent assessments showing increases in both forest cover (1,540 sq km) and tree cover (721 sq km).

**Agricultural Land**: Constitutes about 46% of the total geographical area as net sown area, including:
- Arable land concentrated in the Gangetic plains, Punjab, Haryana, and parts of Maharashtra and Karnataka
- Current fallow land (temporarily not cultivated)
- Other fallow land (left uncultivated for 1-5 years)

**Pastures and Grazing Land**: Areas designated for livestock grazing.

**Land Under Miscellaneous Tree Crops**: Areas with tree crops not classified as forests.

**Culturable Waste Land**: Land potentially suitable for cultivation but currently unused.

**Barren and Unculturable Land**: Areas unsuitable for cultivation (about 8.5% of total area).

**Land Under Non-Agricultural Uses**: Areas used for settlements, infrastructure, and industries (approximately 5.5% of total land area), which has increased from 17 million to 27 million hectares over time.

## Land Utilization Trends

Key trends in India's land utilization include:

1. **Agricultural Land**: Net sown area has remained relatively constant at around 140 million hectares since the 1950s, though gross cropped area increased from 166 million hectares (1971-72) to 197 million hectares (2015-16).

2. **Forest Cover**: Total forest and tree cover has increased by 2,261 square kilometers in recent assessments.

3. **Non-Agricultural Uses**: Land under non-agricultural uses has steadily increased due to urbanization and development.

4. **Barren Land**: Decreased from 28 million hectares (1971-72) to 17 million hectares (2015-16).

5. **Cropping Intensity**: Increased from 118% to 140% between 1971-72 and 2015-16.

## Land Degradation: A Critical Challenge

Land degradation affects approximately 30% of India's total geographical area (115-120 million hectares):

1. **Types of Degradation**:
   - Water erosion (11.01% of India's land)
   - Vegetation degradation (9.15%)
   - Wind erosion (5.46%)
   - Soil salinity, alkalinity, and waterlogging

2. **Regional Distribution**: Eight states—Rajasthan, Delhi, Goa, Maharashtra, Jharkhand, Nagaland, Tripura, and Himachal Pradesh—have 40-70% of their land affected by desertification.

3. **Temporal Trends**: 26 out of 29 Indian states have reported increasing desertification over the past decade.

4. **Wastelands**: The Wastelands Atlas identified 557,665 square kilometers (16.96% of total area) as wastelands in 2015-16, down from 566,070 square kilometers (17.22%) in 2008-09.

## Key Challenges in Land Resource Management

India faces multiple challenges in effective land resource management:

1. **Population Pressure**: With only 2.4% of the world's geographical area supporting over 17% of global population, per capita land availability is projected to decrease from 0.32 hectares (2001) to 0.19 hectares by 2050.

2. **Agricultural Productivity**: Low crop yields compared to international standards, particularly in rainfed areas.

3. **Irrigation Dependency**: Only about 33% of cultivated area is irrigated, making the rest dependent on erratic monsoon rainfall.

4. **Land Degradation**: Extensive degradation affecting agricultural productivity.

5. **Climate Change Impacts**: Increasing vulnerability to floods, droughts, and temperature variations.

6. **Sectoral Approach**: Fragmented land management practices across different government departments.

7. **Competing Land Demands**: Growing competition between agriculture and other land-based sectors.

## Government Initiatives and Management Strategies

Several initiatives address land resource challenges:

1. **Watershed Development**: Continuation of the Watershed Development Component of Pradhan Mantri Krishi Sinchayee Yojana (WDC-PMKSY) for 2021-2026, targeting 4.95 million hectares with an outlay of ₹8,134 crore.

2. **Wasteland Reclamation**: Between 2008-09 and 2015-16, 14,536 square kilometers of wasteland was converted to non-wastelands.

3. **Remote Sensing and Mapping**: Using satellite imagery to map wastelands and monitor land degradation.

4. **Soil Health Initiatives**: Programs like the Soil Health Card Scheme to help farmers better manage their fields.

5. **Forest Conservation**: Efforts showing positive results with increased forest and tree cover.

## Recommendations for Sustainable Land Management

Approaches to enhance sustainable land management include:

1. **Multi-Stakeholder Platforms**: Establish platforms at district and sub-district levels for collaborative land management.

2. **Landscape Approach**: Adopt a comprehensive approach to assess land potential and make informed decisions.

3. **Climate-Smart Practices**: Incorporate climate-resilient strategies in land management.

4. **Integrated Land Management**: Move away from sectoral approaches toward integrated management.

5. **Reclamation of Degraded Lands**: Intensify efforts through soil conservation, afforestation, and sustainable practices.

6. **Technology Integration**: Utilize remote sensing, GIS, and digital technology for better management.

7. **Policy Reforms**: Develop comprehensive land use policies balancing various sectoral demands while ensuring environmental sustainability.

India's land resources face significant pressures, but with appropriate management strategies and policy interventions, their sustainable utilization can be ensured for future generations while meeting current development needs.
"""

In [5]:
# One-shot summarization prompt. This is an f-string, so `ex_text`,
# `ex_summarized_text` and `text` are interpolated right here with whatever
# values they hold when the cell runs; re-run the cell after `text` changes.
template = f"""
You are tasked with summarizing a document in a clear, concise, and professional manner. 
Your summary should retain all critical information while eliminating unnecessary details. 

To guide your approach, here is an example:
Document:
{ex_text}
Summary:
{ex_summarized_text}

Now, summarize the following document:
{text}

Make sure the response is in Markdown format. Make sure to keep the details and examples of each section intact so that the details are not compormised.
Only summarize portions that are necessary.
The format of the summary should be as the document that is given to be summarized.
"""

# Display the token count of the fully rendered prompt (example + document).
count_tokens(template)

5814

In [None]:
# import re
# from langchain_core.prompts import PromptTemplate
# from langchain_huggingface.llms import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# model_id = "Qwen/Qwen2.5-0.5B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id)
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=3000)
# hf = HuggingFacePipeline(pipeline=pipe)

# def summarizer(input_text):
#     template = """
#     You are tasked with summarizing a document in a clear, concise, and professional manner. 
#     Your summary should retain all critical information while eliminating unnecessary details. 

#     To guide your approach, here is an example:
#     Document:
#     {ex_text}
#     Summary:
#     {ex_summarized_text}

#     Now, summarize the following document:
#     {text}

#     Make sure the response is in Markdown format.
#     """

#     prompt = PromptTemplate.from_template(template)

#     chain = prompt | hf.bind(skip_prompt=True)
#     chain = prompt| model
#     text = chain.invoke({"ex_text":ex_text,"ex_summarized_text":ex_summarized_text,"text": input_text})
#     pattern = r"```markdown\n(.*?)$"
#     result = re.search(pattern, text, re.DOTALL)

#     if result:
#         text = result.group(1).strip()

#     return (text)

In [None]:
# import re
# from langchain_core.prompts import PromptTemplate
# from langchain_huggingface.llms import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
# import torch

# # Configure quantization
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True
# )

# model_id = "Qwen/Qwen2.5-0.5B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_id)

# # Load model with quantization
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     quantization_config=quantization_config,
#     device_map="auto"
# )

# # Configure generation parameters for efficiency
# pipe = pipeline(
#     "text-generation", 
#     model=model, 
#     tokenizer=tokenizer, 
#     max_new_tokens=3000,
#     torch_dtype=torch.float16
# )

# hf = HuggingFacePipeline(pipeline=pipe)

# def summarizer(input_text):
#     template = """
#     You are tasked with summarizing a document in a clear, concise, and professional manner. 
#     Your summary should retain all critical information while eliminating unnecessary details. 

#     To guide your approach, here is an example:
#     Document:
#     {ex_text}
#     Summary:
#     {ex_summarized_text}

#     Now, summarize the following document:
#     {text}

#     Make sure the response is in Markdown format.
#     """

#     prompt = PromptTemplate.from_template(template)

#     chain = prompt | hf.bind(skip_prompt=True)
#     text = chain.invoke({"ex_text":ex_text,"ex_summarized_text":ex_summarized_text,"text": input_text})

#     pattern = r"```markdown\n(.*?)```"
#     result = re.search(pattern, text, re.DOTALL)

#     if result:
#         text = result.group(1).strip()
    
#     return (text)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend

In [None]:
# import re
# from langchain_core.prompts import PromptTemplate
# from langchain_ollama.llms import OllamaLLM

# def summarizer(input_text):
#     model = OllamaLLM(
#         model="qwen2.5:0.5b",
#         temperature=0,
#     )

#     template = """
#     You are tasked with summarizing a document in a clear, concise, and professional manner. 
#     Your summary should retain all critical information while eliminating unnecessary details. 

#     To guide your approach, here is an example:
#     Document:
#     {ex_text}
#     Summary:
#     {ex_summarized_text}

#     Now, summarize the following document:
#     {text}

#     Make sure the response is in Markdown format.
#     """

#     prompt = PromptTemplate.from_template(template)

#     chain = prompt| model
#     text = chain.invoke({"ex_text":ex_text,"ex_summarized_text":ex_summarized_text,"text": input_text})

#     pattern = r"```markdown\n(.*?)```"
#     result = re.search(pattern, text, re.DOTALL)

#     if result:
#         text = result.group(1).strip()
    
#     return (text)

In [7]:
# pipeline: summarize `text` repeatedly until it fits the token budget.
# NOTE(review): `summarizer` must already exist in the kernel; in this notebook
# its definitions appear only in commented-out cells, so confirm which one is
# meant to be active before running this cell.
TOKEN_BUDGET = 1500        # stop once the text is below this many tokens
MAX_SUMMARY_PASSES = 5     # hard cap so a stalling summarizer cannot loop forever

num_tokens = count_tokens(text)
for _ in range(MAX_SUMMARY_PASSES):
    if num_tokens < TOKEN_BUDGET:
        break

    condensed = summarizer(text)
    condensed_tokens = count_tokens(condensed)

    # A pass that does not shrink the text will not converge; keep the current
    # text and stop instead of looping on it.
    if condensed_tokens >= num_tokens:
        print("Summarizer did not shorten the text; stopping.")
        break

    text, num_tokens = condensed, condensed_tokens
    print(f"Number of tokens: {num_tokens}")
    print(text)
    print("------------------------------------------------------------")

Number of tokens: 539
```markdown
# System Design: A Comprehensive Guide for Beginners

## Introduction to System Design

System design is a crucial aspect of software development, focusing on how systems are built and managed. It involves understanding the requirements, designing the architecture, implementing the system, and ensuring its reliability and scalability.

### Key Concepts in System Design

1. **Requirements Gathering**: Identifying user needs and defining the system's goals.
2. **System Architecture**: Planning the overall structure of the system, including components, interfaces, and communication mechanisms.
3. **Database Management**: Designing the database schema to store data efficiently and securely.
4. **User Interface (UI)**: Creating a user-friendly interface for users interacting with the system.
5. **Performance Optimization**: Ensuring that the system can handle high loads and perform well under various conditions.

### Best Practices in System Design

1. **Mo

In [1]:
import os
from pathlib import Path
import asyncio

# Step up out of the current folder and down into the sibling `components` folder,
# then report where we ended up. Presumably this is what lets the bare
# `doc_pipeline` import below resolve -- confirm against the project layout.
os.chdir(os.pardir)
os.chdir('components')
print(Path.cwd())

# Import the pipeline only after the directory switch, run it, and print its result.
from doc_pipeline.pipeline import DocumentProcessor
processor = DocumentProcessor()
result = processor.process_document()
print(result)
# os.chdir('..')
# os.chdir('transcripter_api')
# print(Path('.').cwd())
# from transcripter import transcribe_audio
# transcribe_audio()


/home/prasun/Desktop/ADHYAYAN_MITRA/components


ModuleNotFoundError: No module named 'doc_ex'

In [6]:
# Build a fresh DocumentProcessor (imported in an earlier cell) and run its pipeline.
# `processor` and `result` are notebook-level names and are overwritten on every run.
processor = DocumentProcessor()
result = processor.process_document()
# Print whatever process_document() returned.
print(result)

Document exceeds 2000 tokens. Summarizing...
# System Design: A Comprehensive Guide for Beginners

## Introduction to System Design

System design is a critical aspect of software development, ensuring that an application or system meets its functional and non-functional requirements. It involves several key components:

1. **Requirements Gathering**: Identifying the user needs and defining the system's goals.
2. **Architecture Planning**: Defining the overall structure and components of the system.
3. **Design**: Creating a detailed blueprint for how the system will be built.
4. **Implementation**: Writing code to implement the design.
5. **Testing**: Ensuring that the system meets its functional and non-functional requirements.

## Key Concepts in System Design

- **Scalability**: The ability of an application or system to handle increased loads without degradation in performance.
- **Reliability**: The assurance that a system will operate correctly under various conditions.
- **Main

## Testing

In [1]:
import sys
import os

# Add the parent directory to the path so Python can find the components.
# Guarded so that re-running this cell does not keep appending the same entry.
if os.path.abspath('..') not in sys.path:
    sys.path.append(os.path.abspath('..'))

In [2]:
# Now you can import components
# from components.audio_recorder.recorder  import AudioRecorder
from components.doc_pipeline.pipeline import DocumentProcessor
from components.transcripter_api import transcripter 
# from components.whisper.whisper_small import AudioTranscriptor

In [3]:
# Transcribe the audio first, then process the document (same call order as before).
trans_result = transcripter.transcribe_audio()
doc_result = DocumentProcessor().process_document()

# Show both results, transcript first, each on its own line.
print(trans_result, doc_result, sep='\n')

Processing: /home/prasun/Desktop/ADHYAYAN_MITRA/components/transcripter_api/recording.mp3


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription completed in 8.0s
Transcript saved to /home/prasun/Desktop/ADHYAYAN_MITRA/components/transcripter_api/transcript.txt
Document exceeds 2000 tokens. Summarizing...
 So, hey there. I hope you are doing alright. I think everything is going as per your requirements. So, just checking in and do let me know if there are any requirements or changes needed for me to do. Actually the thing is the most important thing that I want to say right now to use. I have been facing some issues regarding the transcription as it is a messy pipeline and all the things are not getting transmitted.  transcribe a accordingly. For example, if I talk Hindi, I am talking Hindi. I don't have a transcribe. Thanks.
# System Design: A Comprehensive Guide for Beginners

## Introduction to System Design

System design is a critical aspect of software development that involves planning, analyzing, and structuring the components and interactions within an application. It's essential for ensuring that the sys

# Building the Gap Analysis Pipeline

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_ollama.llms import OllamaLLM

# Prompt template with two placeholders, {doc1} and {doc2}. The closing instruction
# names both documents explicitly so the model is asked for a comparison rather than
# being left with an unfinished sentence.
gap_analysis_prompt = PromptTemplate.from_template(
    """
    You are tasked with analyzing the gap between two documents. 
    Your analysis should be clear, concise, and professional. 
    Focus on identifying the differences and similarities between the two documents.

    Document 1:
    {doc1}

    Document 2:
    {doc2}

    Provide the GAP Analysis of Document 1 and Document 2.
    """
)

# Local Ollama-served model; temperature=0 is passed to the model wrapper.
llm = OllamaLLM(
    model="hf.co/mradermacher/Qwen2.5-0.5B-Instruct-GGUF:Q8_0",
    temperature=0,
)

# Pipe the filled-in prompt into the model. `doc_result` and `trans_result` are
# produced by an earlier cell and must exist before this cell is run.
chain = gap_analysis_prompt | llm 
analysis = chain.invoke({"doc1": doc_result,"doc2": trans_result})
print(analysis)

Certainly! Here's an analysis of Document 2:

### Document 2: Transcription Issues

#### Context and Purpose
The document is about transcription issues in a messy pipeline, specifically mentioning that the transcription process is not working as expected. The user is seeking help to resolve these issues.

#### Key Points:
1. **Transcription Process**: The user mentions that they are having problems with the transcription process.
2. **Pipeline Messy**: The text indicates that there's a "messy pipeline" where transcriptions are not being transmitted properly.
3. **User Request**: The user is requesting help to resolve these issues.

#### Analysis of Transcription Issues:
1. **Transcription Quality**: The document mentions that the transcription process is not working well, which suggests poor quality or accuracy in the transcription.
2. **Pipeline Complexity**: The mention of a "messy pipeline" implies that there are multiple steps and layers involved in the transcription process, makin