In [1]:
import pandas as pd
import csv

def remove_duplicate_titles(input_file, output_file=None, title_column='शीर्षक', description_column='विवरण'):
    """
    Remove duplicate titles from a CSV file, keeping only the first occurrence.
    
    Parameters:
    input_file (str): Path to the input CSV file
    output_file (str): Path to the output CSV file (optional, defaults to input_file with '_cleaned' suffix)
    title_column (str): Name of the title column (default: 'title')
    description_column (str): Name of the description column (default: 'description')
    """
    
    try:
        # Read the CSV file
        df = pd.read_csv(input_file)
        
        # Check if required columns exist
        if title_column not in df.columns:
            print(f"Error: Column '{title_column}' not found in the CSV file.")
            print(f"Available columns: {list(df.columns)}")
            return
        
        if description_column not in df.columns:
            print(f"Error: Column '{description_column}' not found in the CSV file.")
            print(f"Available columns: {list(df.columns)}")
            return
        
        # Display initial statistics
        initial_count = len(df)
        duplicate_count = df[title_column].duplicated().sum()
        
        print(f"Initial number of rows: {initial_count}")
        print(f"Number of duplicate titles found: {duplicate_count}")
        
        # Remove duplicates based on title column, keeping first occurrence
        df_cleaned = df.drop_duplicates(subset=[title_column], keep='first')
        
        # Display final statistics
        final_count = len(df_cleaned)
        removed_count = initial_count - final_count
        
        print(f"Number of rows after cleaning: {final_count}")
        print(f"Number of rows removed: {removed_count}")
        
        # Set output file name if not provided
        if output_file is None:
            output_file = input_file.replace('.csv', '.csv')
        
        # Save the cleaned data
        df_cleaned.to_csv(output_file, index=False)
        print(f"Cleaned data saved to: {output_file}")
        
        # Display some examples of removed duplicates if any
        if duplicate_count > 0:
            print("\nExamples of removed duplicate titles:")
            duplicates = df[df[title_column].duplicated(keep=False)]
            unique_duplicate_titles = duplicates[title_column].unique()[:5]  # Show first 5 duplicate titles
            for title in unique_duplicate_titles:
                print(f"- '{title}'")
        
    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{input_file}' is empty.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

def remove_duplicate_titles_manual(input_file, output_file=None, title_column_index=0, description_column_index=1):
    """
    Alternative method using manual CSV processing (without pandas)
    
    Parameters:
    input_file (str): Path to the input CSV file
    output_file (str): Path to the output CSV file
    title_column_index (int): Index of the title column (0-based)
    description_column_index (int): Index of the description column (0-based)
    """
    
    try:
        seen_titles = set()
        cleaned_rows = []
        
        with open(input_file, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            header = next(reader)  # Read header
            cleaned_rows.append(header)
            
            initial_count = 0
            for row in reader:
                initial_count += 1
                title = row[title_column_index].strip()
                
                if title not in seen_titles:
                    seen_titles.add(title)
                    cleaned_rows.append(row)
        
        # Set output file name if not provided
        if output_file is None:
            output_file = input_file.replace('.csv', '_cleaned_manual.csv')
        
        # Write cleaned data
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerows(cleaned_rows)
        
        final_count = len(cleaned_rows) - 1  # Subtract header row
        removed_count = initial_count - final_count
        
        print(f"Initial number of rows: {initial_count}")
        print(f"Number of rows after cleaning: {final_count}")
        print(f"Number of rows removed: {removed_count}")
        print(f"Cleaned data saved to: {output_file}")
        
    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Example usage
if __name__ == "__main__":
    # Example 1: Using pandas (recommended)
    input_csv = "News_csv/AnnapurnaPost/२०८२/असार १८_annupurna_post.csv"
    
    # Method 1: Automatic column detection (assumes columns are named 'title' and 'description')
    remove_duplicate_titles(input_csv)
    
    # Method 2: Specify custom column names
    # remove_duplicate_titles(input_csv, title_column='Title', description_column='Description')
    
    # Method 3: Manual processing without pandas
    # remove_duplicate_titles_manual(input_csv, title_column_index=0, description_column_index=1)
    
    # Method 4: Specify custom output file
    # remove_duplicate_titles(input_csv, output_file="cleaned_output.csv")

Initial number of rows: 545
Number of duplicate titles found: 539
Number of rows after cleaning: 6
Number of rows removed: 539
Cleaned data saved to: News_csv/AnnapurnaPost/२०८२/असार १८_annupurna_post.csv

Examples of removed duplicate titles:
- 'दाउन्नेको बेहाल, यात्रा कष्टकर'
- 'राज्य व्यवस्था समितिमा एमाले, माओवादी र एकीकृत समाजवादीले मागे रामहरि खतिवडाको राजीनामा'
- 'तिब्बतको उत्तराधिकारीबारे दलाई लामाको उद्घोष, चीन तरंगित'
- ''डेटिङ एप' सञ्चालन गर्ने ६ चिनियाँसहित ५२ जना पक्राउ'
- 'जापानले गर्यो चार नेपालीलाई निष्कासन'
