In [None]:
!pip install pdfplumber openpyxl

In [1]:
import re
import pdfplumber
import pandas as pd
from collections import namedtuple

# Record type for one parsed table row; the field order is the CSV column order.
Line = namedtuple('Line', 'sr_no air neet_roll_no cet_form_no first_name middle_name last_name gender category quota code_college')

# Expected row layout: four whitespace-separated integers, a free-form name,
# a gender flag (M or F), two single-token fields, then the rest of the line.
# NOTE(review): category and quota are each captured as ONE token, so rows whose
# text has a different number of tokens there (e.g. "Choice Not Available")
# end up spread across category / quota / code_college. Later cells compensate
# by re-merging quota and code_college.
line_re = re.compile(r'^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.*?)\s+(M|F)\s+(\S+)\s+(\S+)\s+(.*)$')

file = '5_6170461119094723429.pdf'


def split_name(name):
    """Split a full name into ``(first_name, middle_name, last_name)``.

    * exactly three words -> first, middle, last
    * any other non-zero word count -> the first word is the first name, the
      remaining words (possibly none) are joined into the last name and the
      middle name is left empty
    * no words at all -> three empty strings (previously an IndexError)
    """
    parts = name.split()
    if not parts:
        return '', '', ''
    if len(parts) == 3:
        return parts[0], parts[1], parts[2]
    return parts[0], '', ' '.join(parts[1:])


def parse_line(line):
    """Return a ``Line`` for a text line that matches ``line_re``, else ``None``."""
    match = line_re.match(line)
    if match is None:
        return None
    sr_no, air, neet_roll_no, cet_form_no, name, gender, category, quota, code_college = match.groups()
    first_name, middle_name, last_name = split_name(name)
    return Line(sr_no, air, neet_roll_no, cet_form_no, first_name, middle_name, last_name, gender, category, quota, code_college)


lines = []

with pdfplumber.open(file) as pdf:
    for page in pdf.pages:
        # A page without extractable text may yield None or an empty string;
        # treat both as "no lines" instead of failing on .split().
        text = page.extract_text() or ''
        for line in text.split('\n'):
            row = parse_line(line)
            if row is not None:
                lines.append(row)

# Build the DataFrame with explicit column names so that an empty result still
# produces a CSV with a header row that can be read back.
df = pd.DataFrame(lines, columns=list(Line._fields))

# Save the DataFrame to a CSV file
df.to_csv('output.csv', index=False)

In [3]:
# Display the parsed rows as the cell output to eyeball the parsing result.
df

Unnamed: 0,sr_no,air,neet_roll_no,cet_form_no,first_name,middle_name,last_name,gender,category,quota,code_college
0,1,80209,3109180336,245006685,BHALERAO,OMKAR,PRABHAKAR,M,SEBC,OPEN,6124:SKN PT COL PUNE (Canc./NJ.)
1,2,86214,2001760049,245032931,SAPNA,,BIPIN BAHAR UPADHYAY,F,OPEN,(W),6102:GS PT MUMBAI (Canc./NJ.)
2,3,101486,3110010639,245062924,ADITI,,SHAH,F,OPEN,6102:GS,PT MUMBAI (Ret.)
3,4,119465,3126030086,245045952,PATLE,DIYA,CHAITANYASHWAR,F,OBC,OPEN,(W) 6261:B MULAK PT NAGPUR (Canc./NJ.)
4,5,130187,3111120520,245036271,VAIDEHI,UMESH,RAHANGDALE,F,OBC,OPEN,(W) 6261:B MULAK PT NAGPUR (Canc./NJ.)
...,...,...,...,...,...,...,...,...,...,...,...
2494,2495,2318938,3124040430,245057668,MATTE,DHANSHREE,SANJAY,F,OBC,EMOBCW,(EMR) 6246:SSS R PANDAV PT NAGPUR
2495,2496,2322726,3118050038,245034350,VAIBHAVI,VINOD,MORE,F,Choice,Not,Available
2496,2497,2327351,3109440065,245027139,SONALE,SHRAVASTI,SIDDHARTH,F,SC,Choice,Not Available
2497,2498,2327517,3112320154,245040142,PATIL,PARAS,NARAYANRAO,M,Choice,Not,Available


In [5]:
import pandas as pd
import re

# Re-read the parsed rows from disk, then build 'college_quota_mrge' by joining
# the 'quota' and 'code_college' text with a single space (missing values are
# treated as empty strings). 'code_college' is dropped once it has been merged.
df = (
    pd.read_csv('output.csv')
    .assign(
        college_quota_mrge=lambda frame: frame['quota'].fillna('') + ' ' + frame['code_college'].fillna('')
    )
    .drop(columns=['code_college'])
)

print(df)

# Persist the merged table for the following steps.
df.to_csv('updated_file.csv', index=False)

      sr_no      air  neet_roll_no  cet_form_no first_name middle_name  \
0         1    80209    3109180336    245006685   BHALERAO       OMKAR   
1         2    86214    2001760049    245032931      SAPNA         NaN   
2         3   101486    3110010639    245062924      ADITI         NaN   
3         4   119465    3126030086    245045952      PATLE        DIYA   
4         5   130187    3111120520    245036271    VAIDEHI       UMESH   
...     ...      ...           ...          ...        ...         ...   
2494   2495  2318938    3124040430    245057668      MATTE   DHANSHREE   
2495   2496  2322726    3118050038    245034350   VAIBHAVI       VINOD   
2496   2497  2327351    3109440065    245027139     SONALE   SHRAVASTI   
2497   2498  2327517    3112320154    245040142      PATIL       PARAS   
2498   2499  2330126    3109010579    245054760    GAIKWAD        NEHA   

                 last_name gender category    quota  \
0                PRABHAKAR      M     SEBC     OPEN   
1

In [7]:
def extract_before_4_digits(text):
    """Return the part of ``text`` that precedes the first run of four digits.

    Parameters
    ----------
    text : str
        Value to scan. Non-string values (for example NaN or None coming from
        missing cells) are returned unchanged instead of raising TypeError.

    Returns
    -------
    The text leading up to the first position where four consecutive digits
    start, with any trailing whitespace before the digits kept. If no such
    digits exist, ``text`` itself is returned.
    """
    if not isinstance(text, str):
        return text
    # Lazy prefix + lookahead: stop right before the first four consecutive digits.
    match = re.search(r'(.*?)(?=\d{4})', text)
    return match.group(0) if match else text

# Store, for every row, the text of 'college_quota_mrge' that comes before the
# first four consecutive digits in a new column named 'new_column'.
df['new_column'] = df['college_quota_mrge'].map(extract_before_4_digits)

print(df)

# Overwrite the intermediate CSV with the extra column included.
df.to_csv('updated_file.csv', index=False)

      sr_no      air  neet_roll_no  cet_form_no first_name middle_name  \
0         1    80209    3109180336    245006685   BHALERAO       OMKAR   
1         2    86214    2001760049    245032931      SAPNA         NaN   
2         3   101486    3110010639    245062924      ADITI         NaN   
3         4   119465    3126030086    245045952      PATLE        DIYA   
4         5   130187    3111120520    245036271    VAIDEHI       UMESH   
...     ...      ...           ...          ...        ...         ...   
2494   2495  2318938    3124040430    245057668      MATTE   DHANSHREE   
2495   2496  2322726    3118050038    245034350   VAIBHAVI       VINOD   
2496   2497  2327351    3109440065    245027139     SONALE   SHRAVASTI   
2497   2498  2327517    3112320154    245040142      PATIL       PARAS   
2498   2499  2330126    3109010579    245054760    GAIKWAD        NEHA   

                 last_name gender category    quota  \
0                PRABHAKAR      M     SEBC     OPEN   
1

In [9]:
# Trim each 'college_quota_mrge' value so it starts at its first run of four
# digits; values containing no such run become NaN.
df['college_quota_mrge'] = df['college_quota_mrge'].str.extract(r'(\d{4}.*)', expand=False)

# Show the result and overwrite the intermediate CSV.
print(df)
df.to_csv('updated_file.csv', index=False)


      sr_no      air  neet_roll_no  cet_form_no first_name middle_name  \
0         1    80209    3109180336    245006685   BHALERAO       OMKAR   
1         2    86214    2001760049    245032931      SAPNA         NaN   
2         3   101486    3110010639    245062924      ADITI         NaN   
3         4   119465    3126030086    245045952      PATLE        DIYA   
4         5   130187    3111120520    245036271    VAIDEHI       UMESH   
...     ...      ...           ...          ...        ...         ...   
2494   2495  2318938    3124040430    245057668      MATTE   DHANSHREE   
2495   2496  2322726    3118050038    245034350   VAIBHAVI       VINOD   
2496   2497  2327351    3109440065    245027139     SONALE   SHRAVASTI   
2497   2498  2327517    3112320154    245040142      PATIL       PARAS   
2498   2499  2330126    3109010579    245054760    GAIKWAD        NEHA   

                 last_name gender category    quota  \
0                PRABHAKAR      M     SEBC     OPEN   
1

In [11]:
import pandas as pd
import re

# Read the table to be filtered.
# NOTE(review): no cell visible here writes this file — presumably it is a
# manually renamed/edited copy of 'updated_file.csv'; confirm its origin.
df = pd.read_csv('Allied stray vacancy R-2.csv')

# Keep only the rows whose 'quota' text contains none of the listed phrases.
# The match is case-insensitive, and rows with a missing quota are kept.
df_cleaned = df.loc[
    ~df['quota'].str.contains(
        'disqualified|Choice Not Available|by State-MBBS|by MCC|Not Available|by DGHS|Choice',
        case=False,
        na=False,
    )
]

# Show the remaining rows and write them to their own CSV.
print(df_cleaned)
df_cleaned.to_csv('updated_file1.csv', index=False)

      sr_no      air  neet_roll_no  cet_form_no first_name middle_name  \
0         1    80209    3109180336    245006685   BHALERAO       OMKAR   
1         2    86214    2001760049    245032931      SAPNA         NaN   
2         3   101486    3110010639    245062924      ADITI         NaN   
3         4   119465    3126030086    245045952      PATLE        DIYA   
4         5   130187    3111120520    245036271    VAIDEHI       UMESH   
...     ...      ...           ...          ...        ...         ...   
2490   2491  2308276    3107070455    245061989    MAHANOR      ANURAG   
2491   2492  2309532    3129050391    245020637      MANSI      SANDIP   
2493   2494  2310590    3121010063    245017112       IMAD       NAWAB   
2494   2495  2318938    3124040430    245057668      MATTE   DHANSHREE   
2498   2499  2330126    3109010579    245054760    GAIKWAD        NEHA   

                 last_name gender category                 quota  \
0                PRABHAKAR      M     SEBC 