In [None]:
!pip install pdfplumber openpyxl

In [None]:
import re
import pdfplumber
import pandas as pd
from collections import namedtuple
from openpyxl import Workbook

# One parsed table row. Everything that follows the quota token on a line is
# kept together in the single 'code_college' field.
Line = namedtuple('Line', 'sr_no air neet_roll_no cet_form_no first_name middle_name last_name gender category quota code_college')

# Adjust the regular expressions based on your PDF structure
line_re = re.compile(r'\d+\s+\d+\s+\d+\s+\d+\s+.+?\s+[MF]\s+.+?\s+.+?\s+\d+:.+')

file = 'example.pdf'
lines = []


def _split_name(name_parts):
    """Split name tokens into a (first_name, middle_name, last_name) tuple.

    Exactly three tokens map one-to-one onto first/middle/last.  For any
    other count the first token is the first name, the remaining tokens are
    joined with spaces into the last name, and the middle name is ''.
    An empty token list yields three empty strings instead of raising.
    """
    if not name_parts:
        return '', '', ''
    if len(name_parts) == 3:
        return name_parts[0], name_parts[1], name_parts[2]
    return name_parts[0], '', ' '.join(name_parts[1:])


with pdfplumber.open(file) as pdf:
    pages = pdf.pages
    for page in pages:
        # Some pdfplumber versions return None for a page without extractable
        # text; treat that the same as an empty page.
        text = page.extract_text() or ''
        for line in text.split('\n'):
            if not line_re.match(line):
                continue

            items = line.split()
            # Print the items and their length to inspect
            print(f"Length: {len(items)}, Items: {items}")

            # The first standalone 'M'/'F' token after the four leading
            # numeric fields is taken as the gender column; everything
            # between those fields and it is the name.  Falls back to 5.
            # NOTE(review): a name containing a bare 'M' or 'F' token would
            # be cut short here - confirm against the real PDF.
            gender_index = next(
                (i for i in range(4, len(items)) if items[i] in ('M', 'F')),
                5,
            )

            sr_no, air, neet_roll_no, cet_form_no = items[:4]
            first_name, middle_name, last_name = _split_name(items[4:gender_index])

            gender = items[gender_index]
            category = items[gender_index + 1]
            quota = items[gender_index + 2]
            code_college = ' '.join(items[gender_index + 3:])

            lines.append(Line(sr_no, air, neet_roll_no, cet_form_no, first_name, middle_name, last_name, gender, category, quota, code_college))

# Create a pandas DataFrame.  Passing the column names explicitly keeps the
# header row even when no line matched, so the CSV stays loadable.
df = pd.DataFrame(lines, columns=Line._fields)

# Save the DataFrame to a CSV file
df.to_csv('output.csv', index=False)

In [None]:
# Rebuild the DataFrame from the in-memory `lines` list and show its first rows
df = pd.DataFrame(lines)
df.head()

In [None]:
import pandas as pd
import re

# Load 'output.csv' into a DataFrame (this rebinds the name `df`)
df = pd.read_csv('output.csv')

# Function to extract text before numbers
def extract_text_before_numbers(value):
    """Return the leading non-digit text of *value*, stripped of whitespace.

    Returns '' when *value* starts with a digit, is empty, or is not a
    string at all (for example the float NaN that pandas uses for an empty
    CSV cell), so the function is safe to use with ``Series.apply``.
    """
    if not isinstance(value, str):
        return ''
    match = re.match(r"\D+", value)
    return match.group(0).strip() if match else ''

# Apply extract_text_before_numbers to every 'code_college' value and store
# the result in 'new_column'
df['new_column'] = df['code_college'].apply(extract_text_before_numbers)

# Function to remove text before numbers in 'code_college'
def remove_text_before_numbers(value):
    """Drop everything that precedes the first digit in *value*.

    Returns the text from the first digit up to the end of that line,
    stripped of surrounding whitespace.  A string without any digit is
    returned unchanged.  Non-string input (for example the float NaN that
    pandas uses for an empty CSV cell) is also returned unchanged instead
    of raising ``TypeError``.
    """
    if not isinstance(value, str):
        return value
    # Find the first occurrence of a digit and everything after it
    match = re.search(r"\d.*", value)
    return match.group(0).strip() if match else value

# Keep only the part of each 'code_college' value that starts at its first digit
df['code_college'] = df['code_college'].apply(remove_text_before_numbers)

# Build 'Quota' as "<category> <quota><new_column>", with missing cells
# treated as empty strings.
# NOTE(review): nothing separates 'quota' from 'new_column' in the result -
# confirm that this is intended.
category_text = df['category'].fillna('')
quota_text = df['quota'].fillna('')
leading_text = df['new_column'].fillna('')
df['Quota'] = category_text + ' ' + quota_text + leading_text

print(df)

# Write the updated DataFrame to a new CSV file, without the index column
df.to_csv('updated_file.csv', index=False)


In [None]:
import pandas as pd

# Read 'updated_file.csv' back into a DataFrame
df = pd.read_csv('updated_file.csv')

# Within every (code_college, category, gender, Quota) group, locate the row
# holding the largest 'air' value and keep those rows.
group_keys = ['code_college', 'category', 'gender', 'Quota']
top_row_labels = df.groupby(group_keys)['air'].idxmax()
max_air_df = df.loc[top_row_labels]

# Prefix the row-specific columns with 'max_'
renamed_columns = {
    'air': 'max_air',
    'sr_no': 'max_sr_no',
    'neet_roll_no': 'max_neet_roll_no',
    'cet_form_no': 'max_cet_form_no',
    'Quota': 'max_quota',
}
max_air_df = max_air_df.rename(columns=renamed_columns)

# Columns that make up the final report
columns_to_keep = ['code_college', 'category', 'gender', 'max_air', 'max_quota']
final_df = max_air_df[columns_to_keep]

# Write the report to CSV, then display it
final_df.to_csv('output_Max.csv', index=False)
print(final_df)