In [1]:
import os
import re
import requests
import pandas as pd
import pypdf
from pypdf import PdfReader, PdfWriter
import ipywidgets as widgets
from IPython.display import HTML
from io import StringIO

In [2]:
def print_df(df):
    """Show a DataFrame as an HTML table inside a fixed-height, scrollable widget.

    :param df: The pandas DataFrame to render.
    """
    # Local import so the helper also works where ``display`` has not been
    # injected into builtins by the IPython kernel.
    from IPython.display import display

    # ``overflow`` is the Layout property that enables scrollbars; the original
    # ``scroll`` key is not a Layout trait and was ignored, so tall tables
    # were never confined to the 500px box.
    a3 = widgets.HTML(
        value=df.to_html(notebook=True),
        layout=widgets.Layout(overflow="auto", height="500px"),
    )
    display(a3)

In [3]:
def download_pdf_file(url: str) -> bool:
    """Download PDF from given URL into ``<cwd>/centre_results/``.

    The file is stored under the last path component of ``url``. The target
    directory is created if it does not exist yet.

    :param url: The url of the PDF file to be downloaded
    :return: True if PDF file was successfully downloaded, otherwise False
        (non-200 response or a network-level failure).
    """
    # isolate PDF filename from URL
    pdf_file_name = os.path.basename(url)
    target_dir = os.path.join(os.getcwd(), "centre_results")
    filepath = os.path.join(target_dir, pdf_file_name)

    try:
        # The whole body is read via ``response.content`` below, so streaming
        # is not needed; a timeout keeps one stalled request from hanging a
        # long batch of downloads forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f'Uh oh! Could not download {pdf_file_name},')
        print(f'Request failed: {exc}')
        return False

    if response.status_code != 200:
        print(f'Uh oh! Could not download {pdf_file_name},')
        print(f'HTTP response status code: {response.status_code}')
        return False

    # Make sure the destination exists before opening the file for writing.
    os.makedirs(target_dir, exist_ok=True)
    print(filepath)
    with open(filepath, 'wb') as pdf_object:
        pdf_object.write(response.content)
    print(f'{pdf_file_name} was successfully saved!')
    return True

In [4]:
def save_data_for_one_centre(centre_code=110101):
    """Download the NEET 2024 result PDF for a single centre.

    :param centre_code: Centre number used to build the PDF file name.
    :return: The value returned by ``download_pdf_file`` (True on success,
        False otherwise).
    """
    url = "https://neetfs.ntaonline.in/NEET_2024_Result/" + str(centre_code) + ".pdf"
    print("downloading data for centre code " + str(centre_code))
    downloaded = download_pdf_file(url)
    # Only claim success when the download actually succeeded; the failure
    # case is already reported by download_pdf_file itself.
    if downloaded:
        print("file downloaded")
    return downloaded

In [5]:
# NOTE: this cell repeats the definition from the previous cell; being the
# later one, this is the definition that is in effect for the cells below.
def save_data_for_one_centre(centre_code=110101):
    """Download the NEET 2024 result PDF for a single centre.

    :param centre_code: Centre number used to build the PDF file name.
    :return: The value returned by ``download_pdf_file`` (True on success,
        False otherwise).
    """
    url = "https://neetfs.ntaonline.in/NEET_2024_Result/" + str(centre_code) + ".pdf"
    print("downloading data for centre code " + str(centre_code))
    downloaded = download_pdf_file(url)
    # Only claim success when the download actually succeeded; the failure
    # case is already reported by download_pdf_file itself.
    if downloaded:
        print("file downloaded")
    return downloaded

In [6]:
def read_html_with_read_html(file_path):
    """Load the first HTML table found at ``file_path`` into a DataFrame.

    The path is echoed, the first five rows of the table are printed, and the
    table is returned. If pandas raises ``ValueError`` (for example because
    the document contains no table) the error is printed and ``None`` is
    returned instead.

    :param file_path: Path or URL handed straight to ``pd.read_html``.
    :return: The first parsed table, or ``None`` on ``ValueError``.
    """
    print(file_path)
    try:
        # pd.read_html yields one DataFrame per table; only the first is used.
        tables = pd.read_html(file_path)
        first_table = tables[0]
        print(first_table.head())
        return first_table
    except ValueError as err:
        print("Error:", err)
        print("No tables found in the HTML file. Please check the structure of your HTML.")
        return None
   

def read_html_file_n(n=1, html_dir="/home/randhir/Desktop/DS_Projects/neet_result_analysis/HTML/neet_scam_data_result"):
    """Read the saved scorecard page ``Common_Scorecard_{n}.html``.

    :param n: Number of the saved page to load.
    :param html_dir: Directory holding the saved ``Common_Scorecard_*.html``
        files. Defaults to the original author's local folder; pass your own
        location to run the notebook on another machine.
    :return: Whatever ``read_html_with_read_html`` returns for that file.
    """
    html_path = os.path.join(html_dir, f"Common_Scorecard_{n}.html")
    return read_html_with_read_html(html_path)

In [7]:
# Load the saved scorecard pages numbered 1 through 10 and keep the result of
# each read in a list, one entry per page.
# NOTE(review): read_html_with_read_html returns None when it hits a
# ValueError, so this list can contain None entries — confirm that is intended.
centre_serial_df = []

for i in range(1,11):
    filei = read_html_file_n(i)
    centre_serial_df.append(filei)
    # print(f"i = {i}, ispresent = {filei[filei['CENTER NO.']==460712]}")

/home/randhir/Desktop/DS_Projects/neet_result_analysis/HTML/neet_scam_data_result/Common_Scorecard_1.html
   Sr.No.                      CENTER STATE CENTER CITY  \
0       1  ANDAMAN AND NICOBAR ISLANDS (UT)  PORT BLAIR   
1       2  ANDAMAN AND NICOBAR ISLANDS (UT)  PORT BLAIR   
2       3  ANDAMAN AND NICOBAR ISLANDS (UT)  PORT BLAIR   
3       4                    ANDHRA PRADESH      GUNTUR   
4       5                    ANDHRA PRADESH      GUNTUR   

                                CENTER NAME  CENTER NO.   View Result  
0                  KENDRIYA VIDYALAYA NO. 1      110101  View Details  
1   DR B R AMBEDKAR INSTITUTE OF TECHNOLOGY      110102  View Details  
2  GOVERNMENT MODEL SENIOR SECONDARY SCHOOL      110103  View Details  
3       LITTLE FLOWER ENGLISH MEDIUM SCHOOL      120101  View Details  
4        CHALAPATHI INSTITUTE OF TECHNOLOGY      120102  View Details  
/home/randhir/Desktop/DS_Projects/neet_result_analysis/HTML/neet_scam_data_result/Common_Scorecard_2.html
 

In [8]:
# Stack the per-page tables into one frame and switch to snake_case column names.
column_names = {
    'Sr.No.': 'centre_serial_no',
    'CENTER STATE': 'state',
    'CENTER CITY': 'city',
    'CENTER NAME': 'center_name',
    'CENTER NO.': 'center_no',
}
all_centres = pd.concat(centre_serial_df).rename(columns=column_names)

In [9]:
# Persist the combined centre list as a gzip-compressed parquet file in the
# current working directory.
all_centres.to_parquet(
    "all_centres.parquet",
    compression="gzip",
)

In [10]:
# Sanity check: the number of non-null centre numbers should equal the number
# of distinct centre numbers if no centre is listed twice.
center_no_column = all_centres['center_no']
print(f"no of rows in all_centres = {center_no_column.count()}")
print(f"no of unique centres = {center_no_column.nunique()}")

no of rows in all_centres = 4750
no of unique centres = 4750


In [11]:
# Plain Python list of every centre number, in table order.
centre_codes = all_centres['center_no'].to_list()

In [12]:
# Echo the list of centre codes as this cell's output.
centre_codes

[110101,
 110102,
 110103,
 120101,
 120102,
 120103,
 120104,
 120105,
 120106,
 120107,
 120108,
 120201,
 120202,
 120203,
 120204,
 120205,
 120206,
 120207,
 120301,
 120302,
 120303,
 120304,
 120305,
 120401,
 120402,
 120403,
 120404,
 120405,
 120406,
 120407,
 120408,
 120409,
 120501,
 120502,
 120503,
 120504,
 120505,
 120506,
 120507,
 120508,
 120509,
 120510,
 120511,
 120512,
 120513,
 120514,
 120515,
 120516,
 120517,
 120518,
 120519,
 122301,
 122302,
 122401,
 122402,
 122403,
 122404,
 122405,
 122406,
 122407,
 122408,
 122501,
 122502,
 122503,
 122504,
 122505,
 122701,
 122801,
 122802,
 122803,
 122901,
 122902,
 122903,
 122904,
 122905,
 120520,
 120521,
 120522,
 120523,
 120524,
 120525,
 120526,
 120601,
 120602,
 120603,
 120604,
 120605,
 120606,
 120607,
 120608,
 120609,
 120610,
 120611,
 120612,
 120613,
 120701,
 120702,
 120801,
 120802,
 120901,
 121001,
 121201,
 121202,
 121203,
 121204,
 121301,
 121401,
 121501,
 121502,
 121503,
 121601,
 

In [15]:
# Download the result PDF for every centre whose file is not on disk yet.
# The check looks in <cwd>/centre_results, which is the directory
# download_pdf_file writes to, instead of a machine-specific absolute path;
# otherwise the check and the download could disagree about where files live.
for cc in centre_codes:
    fp = os.path.join(os.getcwd(), "centre_results", f"{cc}.pdf")
    if not os.path.isfile(fp):
        print(f"file doesnt exist for centre code : {cc}")
        save_data_for_one_centre(cc)

In [None]:
def parse_center_result(cc = 121202, verbose = False):
    """Parse (serial number, marks) pairs from a centre's result PDF.

    Reads ``centre_results/{cc}.pdf`` (relative to the current working
    directory), extracts the text of the first page and walks the text lying
    between each pair of consecutive ``Srlno. Marks`` headings. The
    space-separated integers found there are consumed two at a time as
    ``(roll_no, marks)``.

    :param cc: Centre code; selects the PDF and is stored in ``center_no``.
    :param verbose: When True, print the heading offsets and the extracted
        page text (debugging aid). Off by default so that parsing thousands
        of centres does not flood the notebook output.
    :return: DataFrame with columns ``roll_no``, ``marks``, ``center_no`` and
        ``roll_no_cc`` (``cc * 10000 + roll_no``).
    :raises ValueError: If a non-empty token between two headings is not an
        integer.
    """
    heading_pattern = 'Srlno. Marks'
    # The heading is 12 characters long; parsing starts 15 characters after
    # the start of each heading, i.e. 3 characters past its end.
    # NOTE(review): presumably skips separator characters that follow the
    # heading in the extracted text — confirm against a real PDF.
    payload_offset = 15
    # Multiplier used to combine the centre code and the serial number into a
    # single id. NOTE(review): assumes serial numbers stay below 10000.
    roll_no_multiplier = 10000

    pdf_path = f"centre_results/{cc}.pdf"
    reader = PdfReader(pdf_path)

    # NOTE(review): only the first page is read; if a centre's PDF spans
    # several pages the remaining pages are ignored — confirm.
    texts = reader.pages[0].extract_text()

    start_idx = [m.start() for m in re.finditer(heading_pattern, texts)]
    if verbose:
        print("start_idx: ", start_idx)
        print("texts: ", texts)

    results_cc = []
    # Each segment runs from one heading to the next.
    # NOTE(review): the text after the LAST heading is never parsed, so any
    # rows listed there are dropped — confirm this is intended.
    for idx, idx_1 in zip(start_idx, start_idx[1:]):
        segment = texts[idx + payload_offset:idx_1]
        numbers = [int(tok) for tok in segment.split(" ") if tok != ""]
        # Pair the numbers as (roll_no, marks); a trailing unpaired number
        # within a segment is discarded.
        results_cc.extend(zip(numbers[0::2], numbers[1::2]))

    df = pd.DataFrame(results_cc, columns=['roll_no', 'marks'])
    df['center_no'] = cc
    df['roll_no_cc'] = cc * roll_no_multiplier + df['roll_no']
    return df