In [9]:
import csv
import os

import pandas as pd

def remove_duplicate_titles(input_file, output_file=None, title_column='शीर्षक', description_column='विवरण'):
    """
    Remove rows with duplicate titles from a CSV file, keeping the first occurrence.

    Prints the row counts before and after cleaning, writes the cleaned
    table, and lists up to five of the titles that had duplicates. Problems
    (missing file, empty file, missing column, anything else) are reported
    with ``print`` instead of being raised.

    Parameters:
    input_file (str or path-like): Path to the input CSV file
    output_file (str or path-like): Path to the output CSV file (optional;
        when omitted the input file is overwritten in place)
    title_column (str): Name of the title column (default: 'शीर्षक')
    description_column (str): Name of the description column
        (default: 'विवरण'). It must exist in the file but is not used when
        deciding which rows are duplicates.

    Returns:
    None
    """

    try:
        # Read the CSV file
        df = pd.read_csv(input_file)

        # Both columns must be present before anything is modified
        for required_column in (title_column, description_column):
            if required_column not in df.columns:
                print(f"Error: Column '{required_column}' not found in the CSV file.")
                print(f"Available columns: {list(df.columns)}")
                return

        # Display initial statistics
        initial_count = len(df)
        duplicate_count = df[title_column].duplicated().sum()

        print(f"Initial number of rows: {initial_count}")
        print(f"Number of duplicate titles found: {duplicate_count}")

        # Remove duplicates based on title column, keeping first occurrence
        df_cleaned = df.drop_duplicates(subset=[title_column], keep='first')

        # Display final statistics
        final_count = len(df_cleaned)
        removed_count = initial_count - final_count

        print(f"Number of rows after cleaning: {final_count}")
        print(f"Number of rows removed: {removed_count}")

        # No explicit destination means "clean the file in place". Assigning
        # the path directly (instead of calling str.replace on it) also lets
        # callers pass a pathlib.Path.
        if output_file is None:
            output_file = input_file

        # Save the cleaned data
        df_cleaned.to_csv(output_file, index=False)
        print(f"Cleaned data saved to: {output_file}")

        # Display some examples of removed duplicate titles if any
        if duplicate_count > 0:
            print("\nExamples of removed duplicate titles:")
            # keep=False flags every copy, so titles are listed in the order
            # they first appear in the file.
            duplicates = df[df[title_column].duplicated(keep=False)]
            unique_duplicate_titles = duplicates[title_column].unique()[:5]  # Show first 5 duplicate titles
            for title in unique_duplicate_titles:
                print(f"- '{title}'")

    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{input_file}' is empty.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

def remove_duplicate_titles_manual(input_file, output_file=None, title_column_index=0, description_column_index=1):
    """
    Alternative method using manual CSV processing (without pandas)

    The first row is treated as the header and always kept. Titles are
    compared after stripping surrounding whitespace, and only the first row
    for each title is written. Completely blank lines are skipped and not
    counted; rows too short to contain the title column are kept unchanged
    because they cannot be compared. Problems are reported with ``print``
    instead of being raised.

    Parameters:
    input_file (str or path-like): Path to the input CSV file (read as UTF-8)
    output_file (str or path-like): Path to the output CSV file (optional;
        defaults to the input name with '_cleaned_manual' inserted before
        the file extension)
    title_column_index (int): Index of the title column (0-based)
    description_column_index (int): Index of the description column (0-based).
        Accepted for symmetry with the pandas version; not used.

    Returns:
    None
    """

    try:
        seen_titles = set()
        cleaned_rows = []
        initial_count = 0

        with open(input_file, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)

            # A default avoids a bare StopIteration on a zero-row file.
            header = next(reader, None)
            if header is None:
                print(f"Error: The file '{input_file}' is empty.")
                return
            cleaned_rows.append(header)

            for row in reader:
                # csv.reader yields [] for blank lines; they carry no data.
                if not row:
                    continue

                initial_count += 1
                try:
                    title = row[title_column_index].strip()
                except IndexError:
                    # Row has no title cell: keep it rather than abort the run.
                    cleaned_rows.append(row)
                    continue

                if title not in seen_titles:
                    seen_titles.add(title)
                    cleaned_rows.append(row)

        # Set output file name if not provided. splitext touches only the
        # final extension, so the default never equals the input path and
        # '.csv' appearing elsewhere in the path is left alone.
        if output_file is None:
            base, extension = os.path.splitext(input_file)
            output_file = f"{base}_cleaned_manual{extension}"

        # Write cleaned data
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerows(cleaned_rows)

        final_count = len(cleaned_rows) - 1  # Subtract header row
        removed_count = initial_count - final_count

        print(f"Initial number of rows: {initial_count}")
        print(f"Number of rows after cleaning: {final_count}")
        print(f"Number of rows removed: {removed_count}")
        print(f"Cleaned data saved to: {output_file}")

    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# # Example usage
# if __name__ == "__main__":
#     # Example 1: Using pandas (recommended)
#     input_csv = 'News_csv/AnnapurnaPost/२०८२/असार २८_annupurna_post.csv'
    
#     # Method 1: Default column names (assumes columns are named 'शीर्षक' and 'विवरण')
#     remove_duplicate_titles(input_csv)
    
#     # Method 2: Specify custom column names
#     # remove_duplicate_titles(input_csv, title_column='Title', description_column='Description')
    
#     # Method 3: Manual processing without pandas
#     # remove_duplicate_titles_manual(input_csv, title_column_index=0, description_column_index=1)
    
#     # Method 4: Specify custom output file
#     # remove_duplicate_titles(input_csv, output_file="cleaned_output.csv")

In [14]:
import os

folder_path = 'News_csv/AnnapurnaPost/२०८२'
# Collect only the regular *.csv files directly inside the folder.
# os.listdir() also returns subdirectories and unrelated files, which
# the cleaner cannot read as CSV. Sorting makes the processing order,
# and therefore the printed log, stable between runs.
items = sorted(
    name
    for name in os.listdir(folder_path)
    if name.lower().endswith('.csv')
    and os.path.isfile(os.path.join(folder_path, name))
)
# Each file is cleaned in place (no output_file is passed).
for csv_file in items:
    remove_duplicate_titles(os.path.join(folder_path, csv_file))
    




Initial number of rows: 119
Number of duplicate titles found: 52
Number of rows after cleaning: 67
Number of rows removed: 52
Cleaned data saved to: News_csv/AnnapurnaPost/२०८२/असार ११_annupurna_post.csv

Examples of removed duplicate titles:
- 'आइएमई ग्रुपको लगानीमा नेपालमै पहिलो फाइबर सिमेन्ट बोर्ड उद्योग सञ्चालनमा'
- 'माधव नेपाललाई ३५ लाख धरौटीमा छाड्न आदेश'
- 'पतञ्जली जग्गा हिनामिना प्रकरण : माधव नेपालको थुनछेक बहस सकियो'
- 'रोशी खोलामा एक व्यक्तिको शव फेला'
- 'लागुऔषधसहित एक जना पक्राउ'
Initial number of rows: 104
Number of duplicate titles found: 47
Number of rows after cleaning: 57
Number of rows removed: 47
Cleaned data saved to: News_csv/AnnapurnaPost/२०८२/असार १२_annupurna_post.csv

Examples of removed duplicate titles:
- 'दृष्टिका सञ्चालक श्रेष्ठ र सम्पादक सुब्बालाई जनही २५ हजार धरौटीमा छोड्न आदेश'
- 'रास्वपा सांसद सुमनाको प्रस्ताव निजी विद्यालयलाई १५ वर्षपछि गैरनाफामूलक बनाऔं'
- 'रुपन्देही ३ मा मोर्चाका तर्फबाट नागरिक उन्मुक्ति पार्टीको दाबेदारी'
- 'डोटी पूर्णखोप सुनिश्चितत

In [5]:
# Different ways to get files from a folder in Python

import os
import glob
from pathlib import Path

# Method 1: Using os.listdir() - gets all items in directory
def get_files_with_listdir(folder_path):
    """Return the names (not full paths) of the files directly inside a folder.

    Subdirectories are left out. If the folder does not exist, a message is
    printed and an empty list is returned.
    """
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder '{folder_path}' not found")
        return []

    file_names = []
    for entry in entries:
        # listdir yields bare names, so rebuild the path to test each one.
        if os.path.isfile(os.path.join(folder_path, entry)):
            file_names.append(entry)
    return file_names

# Method 2: Using os.walk() - recursive through subdirectories
def get_files_with_walk(folder_path):
    """Return the path of every file under ``folder_path``, at any depth.

    Each path is the directory reported by ``os.walk`` joined with the file
    name, so results start with ``folder_path`` as given.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]

# Method 3: Using glob - pattern matching
def get_files_with_glob(folder_path, pattern="*"):
    """Return the paths inside ``folder_path`` that match a glob ``pattern``.

    The pattern is joined onto the folder and passed to ``glob.glob``; the
    default ``"*"`` matches every non-hidden entry directly inside it.
    """
    return glob.glob(os.path.join(folder_path, pattern))

# Method 4: Using pathlib (modern Python approach)
def get_files_with_pathlib(folder_path):
    """Return ``Path`` objects for the files directly inside a folder.

    Subdirectories are left out. If ``folder_path`` is missing, or exists
    but is not a directory, a message is printed and an empty list is
    returned.
    """
    folder = Path(folder_path)
    # is_dir() rather than exists(): a regular file also "exists", and
    # calling iterdir() on one raises NotADirectoryError.
    if not folder.is_dir():
        print(f"Folder '{folder_path}' not found")
        return []
    return [entry for entry in folder.iterdir() if entry.is_file()]

# Method 5: Get files with specific extensions
def get_files_by_extension(folder_path, extensions):
    """Return full paths of the files in a folder whose names end with given extensions.

    Parameters:
    folder_path (str): Folder to look in (not searched recursively)
    extensions (str or iterable of str): One suffix such as '.csv', or
        several. Matching is case-sensitive.

    Returns:
    list of str: ``folder_path`` joined with each matching file name.
    Directories are never returned, even if their name matches. If the
    folder does not exist, a message is printed and an empty list is
    returned.
    """
    if isinstance(extensions, str):
        extensions = [extensions]
    # A tuple lets one str.endswith call test every suffix, and it means a
    # one-shot iterable is consumed once rather than once per entry.
    suffixes = tuple(extensions)

    try:
        names = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder '{folder_path}' not found")
        return []

    files = []
    for name in names:
        full_path = os.path.join(folder_path, name)
        if name.endswith(suffixes) and os.path.isfile(full_path):
            files.append(full_path)
    return files

# Examples
def _print_preview(paths, limit=5):
    """Print the first ``limit`` entries, then how many more were left out."""
    for path in paths[:limit]:
        print(f"   {path}")
    if len(paths) > limit:
        print(f"   ... and {len(paths) - limit} more files")


print("=== Example Usage ===")

# Example folder path
folder_path = "News_csv"

print(f"Getting files from: {folder_path}")
print()

# Method 1: os.listdir() - names of files directly inside the folder
print("1. Using os.listdir():")
files = get_files_with_listdir(folder_path)
_print_preview(files)
print()

# Method 2: os.walk() - every file below the folder
print("2. Using os.walk() (recursive):")
all_files = get_files_with_walk(folder_path)
_print_preview(all_files)
print()

# Method 3: glob with a *.csv pattern
print("3. Using glob for CSV files:")
csv_files = get_files_with_glob(folder_path, "*.csv")
_print_preview(csv_files)
print()

# Method 4: pathlib
print("4. Using pathlib:")
path_files = get_files_with_pathlib(folder_path)
_print_preview(path_files)
print()

# Method 5: Specific extensions (no blank line after this last section)
print("5. Getting only .csv and .txt files:")
specific_files = get_files_by_extension(folder_path, ['.csv', '.txt'])
_print_preview(specific_files)

=== Example Usage ===
Getting files from: News_csv

1. Using os.listdir():

2. Using os.walk() (recursive):
   News_csv/eKantipur/2025-06-02_ekantipur_news.csv
   News_csv/eKantipur/2025-06-09_ekantipur_news.csv
   News_csv/eKantipur/2025-06-03_ekantipur_news.csv
   News_csv/eKantipur/2025-06-04_ekantipur_news.csv
   News_csv/eKantipur/2025-06-05_ekantipur_news.csv
   ... and 6231 more files

3. Using glob for CSV files:

4. Using pathlib:

5. Getting only .csv and .txt files:


In [None]:
# More practical examples

print("=== PRACTICAL EXAMPLES ===")
print()

# Example 1: Get all CSV files recursively
print("1. All CSV files in News_csv directory:")
csv_files = [
    os.path.join(current_dir, name)
    for current_dir, _subdirs, names in os.walk("News_csv")
    for name in names
    if name.endswith('.csv')
]

print(f"Found {len(csv_files)} CSV files")
for csv_path in csv_files[:5]:
    print(f"   {csv_path}")
if len(csv_files) > 5:
    print(f"   ... and {len(csv_files) - 5} more")
print()

# Example 2: Get files with size information
print("2. Files with size information:")
folder = "News_csv/eKantipur"
if os.path.exists(folder):
    # Pair each regular file's name with its size in bytes.
    files_with_size = [
        (name, os.path.getsize(os.path.join(folder, name)))
        for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    ]

    # Largest first
    files_with_size = sorted(files_with_size, key=lambda pair: pair[1], reverse=True)

    for name, size in files_with_size[:5]:
        print(f"   {name}: {size:,} bytes")
    # This blank line is only printed when the folder exists.
    print()

# Example 3: Filter files by date in filename
print("3. Files from June 2025:")
# The substring test runs on the whole path collected in Example 1,
# so a matching directory name counts as well as a matching file name.
june_files = [csv_path for csv_path in csv_files if "2025-06" in csv_path]

print(f"Found {len(june_files)} files from June 2025")
for csv_path in june_files[:3]:
    print(f"   {os.path.basename(csv_path)}")
print()

# Example 4: the same searches expressed with pathlib
print("4. Using pathlib (recommended for new code):")
from pathlib import Path

news_path = Path("News_csv")
if news_path.exists():
    # rglob searches every level below news_path
    csv_files_pathlib = [*news_path.rglob("*.csv")]
    print(f"Found {len(csv_files_pathlib)} CSV files using pathlib")

    # glob (non-recursive) looks only directly inside the subfolder
    ekantipur_path = news_path / "eKantipur"
    if ekantipur_path.exists():
        ekantipur_files = [*ekantipur_path.glob("*.csv")]
        print(f"Found {len(ekantipur_files)} files in eKantipur folder")
        for csv_path in ekantipur_files[:3]:
            print(f"   {csv_path.name}")
print()

# Example 5: Get files modified in last N days
print("5. Recently modified files:")
import time
from datetime import datetime, timedelta

def get_recent_files(folder, days=7):
    """Get files modified in the last N days.

    Parameters:
    folder (str): Root folder; searched recursively with ``os.walk``
    days (int or float): Size of the look-back window in days (default 7)

    Returns:
    list of str: Paths of files whose modification time is later than
    ``days`` days before now. Entries whose modification time cannot be
    read are skipped. A folder that does not exist gives an empty list.
    """
    cutoff_time = time.time() - (days * 24 * 60 * 60)
    recent_files = []

    for root, _dirs, files in os.walk(folder):
        for name in files:
            file_path = os.path.join(root, name)
            try:
                modified_at = os.path.getmtime(file_path)
            except OSError:
                # Dangling symlink, or the file vanished after os.walk
                # listed it; one bad entry should not abort the scan.
                continue
            if modified_at > cutoff_time:
                recent_files.append(file_path)

    return recent_files

recent = get_recent_files("News_csv", days=30)
print(f"Files modified in last 30 days: {len(recent)}")
# Preview at most three of them with their last-modified time (local time).
for recent_path in recent[:3]:
    modified = datetime.fromtimestamp(os.path.getmtime(recent_path))
    print(f"   {os.path.basename(recent_path)} (modified: {modified:%Y-%m-%d %H:%M})")

# Summary: Getting Files from Folders in Python

## Quick Reference

### 1. **os.listdir()** - Simple file listing
```python
import os
files = os.listdir("folder_path")
# Returns: ['file1.txt', 'file2.csv', 'subfolder']
```

### 2. **os.walk()** - Recursive directory traversal
```python
for root, dirs, files in os.walk("folder_path"):
    for file in files:
        full_path = os.path.join(root, file)
```

### 3. **glob** - Pattern matching
```python
import glob
csv_files = glob.glob("folder_path/*.csv")  # All CSV files
all_files = glob.glob("folder_path/**", recursive=True)  # Recursive (matches directories as well as files)
```

### 4. **pathlib** - Modern Python approach (Recommended)
```python
from pathlib import Path
folder = Path("folder_path")
files = [f for f in folder.iterdir() if f.is_file()]  # Files only
csv_files = list(folder.glob("*.csv"))  # CSV files only
all_csv = list(folder.rglob("*.csv"))  # Recursive CSV search
```

## When to Use Each Method:

- **os.listdir()**: Simple, single directory, when you need just entry names (it returns subdirectory names as well as filenames)
- **os.walk()**: When you need recursive search with full control
- **glob**: When you need pattern matching (wildcards)
- **pathlib**: Modern, object-oriented approach (recommended for new code)

## Common Patterns:

```python
# Get only files (not directories)
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

# Get files with specific extensions
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]

# Get full file paths
full_paths = [os.path.join(path, f) for f in os.listdir(path)]
```