In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
# import time
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
# Apply matplotlib's built-in 'ggplot' style sheet to any figures drawn later.
plt.style.use('ggplot')


In [2]:
# Let pandas show up to 200 columns before it truncates a wide frame.
pd.set_option('display.max_columns', 200)

In [3]:
# Start a Selenium-controlled Firefox session; it is shut down with
# driver.quit() once the result page's HTML has been captured.
driver = webdriver.Firefox()

In [4]:
# Maps the element ID of each filter dropdown on the results page to the
# option value that should be selected in it. A later cell walks this dict in
# insertion order and applies one selection per entry.
# NOTE(review): "6" is presumably the counselling round and "ALL" the
# no-filter option -- confirm against the page's <option> values.
dd_id = {
    "ctl00_ContentPlaceHolder1_ddlroundno":"6",
    "ctl00_ContentPlaceHolder1_ddlInstype":"ALL",
    "ctl00_ContentPlaceHolder1_ddlInstitute":"ALL",
    "ctl00_ContentPlaceHolder1_ddlBranch":"ALL",
    "ctl00_ContentPlaceHolder1_ddlSeattype":"ALL"
}

In [5]:
def dropdown(id):
    """Return a ``Select`` wrapper for the element whose HTML id is ``id``.

    Uses the notebook-level ``wait`` to block until the element is present in
    the DOM, forces its CSS ``display`` to ``block`` through JavaScript run on
    the notebook-level ``driver``, and wraps the element in ``Select``.

    Parameters
    ----------
    id : str
        HTML ``id`` attribute of the dropdown element.

    Returns
    -------
    Select
        Wrapper through which an option can be chosen.
    """
    locator = (By.ID, id)
    element = wait.until(EC.presence_of_element_located(locator))
    # Presumably the native <select> is hidden by the page's styling; it is
    # un-hidden here so Selenium may interact with it -- TODO confirm.
    reveal_script = "arguments[0].style.display = 'block';"
    driver.execute_script(reveal_script, element)
    return Select(element)

In [6]:
# Load the JoSAA seat-allotment result page in the browser session.
driver.get("https://josaa.admissions.nic.in/Applicant/seatallotmentresult/currentorcr.aspx")
# Shared explicit wait (20-second timeout) used by dropdown() and by the
# element lookups in the following cells.
wait = WebDriverWait(driver, 20)

In [7]:
# Apply every configured filter: each dropdown is looked up afresh by ID and
# then set to its configured option value.
for element_id, option_value in dd_id.items():
    dropdown(element_id).select_by_value(option_value)

In [8]:
# Wait until the submit button is visible and enabled before clicking it;
# mere presence in the DOM does not guarantee that a click can be delivered.
submit = wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ContentPlaceHolder1_btnSubmit")))
submit.click()

In [9]:
# Block until the results grid exists in the DOM, so the snapshot below cannot
# capture the page before the post-submit content has been rendered.
wait.until(EC.presence_of_element_located((By.ID, "ctl00_ContentPlaceHolder1_GridView1")))
# Parse a snapshot of the rendered page with the built-in HTML parser.
soup = bs(driver.page_source, "html.parser")
# The HTML now lives in `soup`; the browser session is no longer needed.
driver.quit()

In [10]:
# Locate the results grid by its element ID (find() returns None when no such
# table is in the parsed page).
table = soup.find('table', attrs={'id':'ctl00_ContentPlaceHolder1_GridView1'})

In [11]:
# Column names: the whitespace-stripped text of every <th> cell in the table.
headers = [th_cell.text.strip() for th_cell in table.find_all('th')]

In [12]:
# One list of stripped cell texts per <tr>. A row that contains no <td> cells
# contributes an empty list, so every <tr> in the table yields exactly one entry.
data = [
    [td_cell.text.strip() for td_cell in tr_row.find_all('td')]
    for tr_row in table.find_all('tr')
]

In [13]:
# Assemble the scraped rows into a DataFrame labelled with the scraped headers.
df = pd.DataFrame(data, columns=headers)

In [14]:
# (rows, columns) of the freshly scraped table.
df.shape

(10366, 7)

In [15]:
# Inspect the first ten rows. Row 0 comes out empty in the saved output --
# presumably the header <tr>, which holds <th> rather than <td> cells (confirm).
df.head(10)

Unnamed: 0,Institute,Academic Program Name,Quota,Seat Type,Gender,Opening Rank,Closing Rank
0,,,,,,,
1,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Gender-Neutral,12511.0,14997.0
2,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Female-only (including Supernumerary),18989.0,21229.0
3,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Gender-Neutral,1852.0,2341.0
4,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Female-only (including Supernumerary),3354.0,3354.0
5,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OBC-NCL,Gender-Neutral,4326.0,5274.0
6,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OBC-NCL,Female-only (including Supernumerary),5905.0,7192.0
7,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,SC,Gender-Neutral,1941.0,2799.0
8,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,SC,Female-only (including Supernumerary),3985.0,3985.0
9,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,ST,Gender-Neutral,798.0,858.0


In [16]:
# Column labels as taken from the table's <th> cells.
df.columns

Index(['Institute', 'Academic Program Name', 'Quota', 'Seat Type', 'Gender',
       'Opening Rank', 'Closing Rank'],
      dtype='object')

In [17]:
# Give two columns shorter names for the rest of the notebook.
df = df.rename(columns={'Academic Program Name': 'Branch', 'Seat Type': 'Category'})

In [18]:
# Column dtypes after the rename; every scraped value is still text here.
df.dtypes

Institute       object
Branch          object
Quota           object
Category        object
Gender          object
Opening Rank    object
Closing Rank    object
dtype: object

In [19]:
# Summary of the text columns (count / unique / top / freq per column).
df.describe()

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
count,10365,10365,10365,10365,10365,10365,10365
unique,119,211,6,10,2,7750,8120
top,"National Institute of Technology, Rourkela","Computer Science and Engineering (4 Years, Bac...",AI,OPEN,Gender-Neutral,42,138
freq,389,1412,4548,2097,6468,8,7


In [20]:
# Look at the leading rows again before the empty first row is removed.
df.head()

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
0,,,,,,,
1,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Gender-Neutral,12511.0,14997.0
2,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Female-only (including Supernumerary),18989.0,21229.0
3,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Gender-Neutral,1852.0,2341.0
4,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Female-only (including Supernumerary),3354.0,3354.0


In [21]:
# Remove rows in which every column is null (the empty leading row produced by
# the scrape) and renumber the index from 0. Filtering by content rather than
# by position keeps this cell idempotent: re-running it can never discard a
# real data row.
df = df.dropna(how='all').reset_index(drop=True)

In [22]:
# Confirm that the frame now starts with a real data row.
df.head()

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
0,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Gender-Neutral,12511,14997
1,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Female-only (including Supernumerary),18989,21229
2,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Gender-Neutral,1852,2341
3,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Female-only (including Supernumerary),3354,3354
4,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OBC-NCL,Gender-Neutral,4326,5274


In [23]:
# Row/column count after the empty row has been removed.
df.shape

(10365, 7)

In [24]:
# List every row that contains at least one null value.
df[df.isnull().any(axis=1)]

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank


In [25]:
# List rows that exactly repeat an earlier row across all columns.
df.loc[df.duplicated()]

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank


In [26]:
# Show rows whose Opening Rank or Closing Rank is not a plain run of digits.
# The pattern is a raw string so that `\d` reaches the regex engine as written
# instead of being parsed as an (invalid) string escape sequence.
df[~df['Opening Rank'].str.match(r'^\d+$') | ~df['Closing Rank'].str.match(r'^\d+$')]

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
67,Indian Institute of Technology Bhubaneswar,"Electrical Engineering (4 Years, Bachelor of T...",AI,OPEN (PwD),Gender-Neutral,150P,150P
79,Indian Institute of Technology Bhubaneswar,Electronics and Communication Engineering (4 Y...,AI,OPEN (PwD),Female-only (including Supernumerary),202P,202P
84,Indian Institute of Technology Bhubaneswar,Electronics and Communication Engineering (4 Y...,AI,OBC-NCL (PwD),Gender-Neutral,21P,21P
137,Indian Institute of Technology Bhubaneswar,Metallurgical and Materials Engineering (5 Yea...,AI,ST,Female-only (including Supernumerary),101P,101P
146,Indian Institute of Technology Bombay,"Aerospace Engineering (4 Years, Bachelor of Te...",AI,OBC-NCL (PwD),Gender-Neutral,74P,74P
...,...,...,...,...,...,...,...
2984,Indian Institute of Technology Dharwad,"Electrical Engineering (4 Years, Bachelor of T...",AI,OPEN (PwD),Gender-Neutral,104P,104P
3008,Indian Institute of Technology Dharwad,"Interdisciplinary Sciences (5 Years, Bachelor ...",AI,ST,Female-only (including Supernumerary),638P,638P
3011,Indian Institute of Technology Dharwad,"Mathematics and Computing (4 Years, Bachelor o...",AI,OPEN (PwD),Gender-Neutral,155P,155P
5945,National Institute of Technology Sikkim,"Civil Engineering (4 Years, Bachelor of Techno...",HS,OPEN,Gender-Neutral,573994,1005918.0


In [27]:
# Set aside the rows whose Opening Rank or Closing Rank is not purely digits.
# - Raw-string pattern: `\d` is passed to the regex engine as written rather
#   than being parsed as an (invalid) string escape sequence.
# - .copy(): prepdf is modified in place later on, so it is made an explicit,
#   independent frame instead of an implicit slice of df.
# prepdf keeps df's row labels, which a later cell uses to drop these rows.
prepdf = df[~df['Opening Rank'].str.match(r'^\d+$') | ~df['Closing Rank'].str.match(r'^\d+$')].copy()

In [28]:
# Display the rows that were set aside for having a non-numeric rank.
prepdf

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
67,Indian Institute of Technology Bhubaneswar,"Electrical Engineering (4 Years, Bachelor of T...",AI,OPEN (PwD),Gender-Neutral,150P,150P
79,Indian Institute of Technology Bhubaneswar,Electronics and Communication Engineering (4 Y...,AI,OPEN (PwD),Female-only (including Supernumerary),202P,202P
84,Indian Institute of Technology Bhubaneswar,Electronics and Communication Engineering (4 Y...,AI,OBC-NCL (PwD),Gender-Neutral,21P,21P
137,Indian Institute of Technology Bhubaneswar,Metallurgical and Materials Engineering (5 Yea...,AI,ST,Female-only (including Supernumerary),101P,101P
146,Indian Institute of Technology Bombay,"Aerospace Engineering (4 Years, Bachelor of Te...",AI,OBC-NCL (PwD),Gender-Neutral,74P,74P
...,...,...,...,...,...,...,...
2984,Indian Institute of Technology Dharwad,"Electrical Engineering (4 Years, Bachelor of T...",AI,OPEN (PwD),Gender-Neutral,104P,104P
3008,Indian Institute of Technology Dharwad,"Interdisciplinary Sciences (5 Years, Bachelor ...",AI,ST,Female-only (including Supernumerary),638P,638P
3011,Indian Institute of Technology Dharwad,"Mathematics and Computing (4 Years, Bachelor o...",AI,OPEN (PwD),Gender-Neutral,155P,155P
5945,National Institute of Technology Sikkim,"Civil Engineering (4 Years, Bachelor of Techno...",HS,OPEN,Gender-Neutral,573994,1005918.0


In [29]:
# Keep an independent snapshot of the table before the non-numeric rank rows
# are removed from df.
df_orig = df.copy()

In [30]:
# Display the snapshot.
df_orig

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
0,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Gender-Neutral,12511,14997
1,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Female-only (including Supernumerary),18989,21229
2,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Gender-Neutral,1852,2341
3,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Female-only (including Supernumerary),3354,3354
4,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OBC-NCL,Gender-Neutral,4326,5274
...,...,...,...,...,...,...,...
10360,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,OPEN,Gender-Neutral,86070,101502
10361,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,EWS,Gender-Neutral,16842,17273
10362,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,OBC-NCL,Gender-Neutral,31415,33784
10363,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,SC,Gender-Neutral,11685,15633


In [31]:
# Remove the set-aside rows from df by row label. This relies on prepdf still
# carrying the labels it inherited from df, so it must run before prepdf's
# index is reset.
df = df.drop(index=prepdf.index)

In [32]:
# Display df after the drop; the index keeps its old labels, so it has gaps.
df

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
0,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Gender-Neutral,12511,14997
1,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Female-only (including Supernumerary),18989,21229
2,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Gender-Neutral,1852,2341
3,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Female-only (including Supernumerary),3354,3354
4,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OBC-NCL,Gender-Neutral,4326,5274
...,...,...,...,...,...,...,...
10360,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,OPEN,Gender-Neutral,86070,101502
10361,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,EWS,Gender-Neutral,16842,17273
10362,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,OBC-NCL,Gender-Neutral,31415,33784
10363,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,SC,Gender-Neutral,11685,15633


In [33]:
# Sanity check: no remaining row should have a non-digit Opening Rank.
# Raw-string pattern so that `\d` is not parsed as an invalid string escape.
df[~df['Opening Rank'].str.match(r'^\d+$')]

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank


In [34]:
# Column dtypes immediately before the rank columns are converted.
df.dtypes

Institute       object
Branch          object
Quota           object
Category        object
Gender          object
Opening Rank    object
Closing Rank    object
dtype: object

In [35]:
# Convert both rank columns from digit strings to integers.
for rank_col in ('Opening Rank', 'Closing Rank'):
    df[rank_col] = df[rank_col].astype(str).astype(int)

In [36]:
# Verify that both rank columns are now integer typed.
df.dtypes

Institute       object
Branch          object
Quota           object
Category        object
Gender          object
Opening Rank     int64
Closing Rank     int64
dtype: object

In [37]:
# Display df after the type conversion.
df

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
0,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Gender-Neutral,12511,14997
1,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Female-only (including Supernumerary),18989,21229
2,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Gender-Neutral,1852,2341
3,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Female-only (including Supernumerary),3354,3354
4,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OBC-NCL,Gender-Neutral,4326,5274
...,...,...,...,...,...,...,...
10360,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,OPEN,Gender-Neutral,86070,101502
10361,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,EWS,Gender-Neutral,16842,17273
10362,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,OBC-NCL,Gender-Neutral,31415,33784
10363,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,SC,Gender-Neutral,11685,15633


In [38]:
# Renumber the rows 0..n-1, discarding the gapped labels left by the drop.
df = df.reset_index(drop=True)

In [39]:
# Display df with its renumbered index.
df

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
0,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Gender-Neutral,12511,14997
1,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Female-only (including Supernumerary),18989,21229
2,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Gender-Neutral,1852,2341
3,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Female-only (including Supernumerary),3354,3354
4,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OBC-NCL,Gender-Neutral,4326,5274
...,...,...,...,...,...,...,...
10229,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,OPEN,Gender-Neutral,86070,101502
10230,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,EWS,Gender-Neutral,16842,17273
10231,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,OBC-NCL,Gender-Neutral,31415,33784
10232,"Indian Institute of Handloom Technology, Salem","Handloom and Textile Technology (4 Years, Bach...",AI,SC,Gender-Neutral,11685,15633


In [42]:
# Numeric summary of the two rank columns.
df.describe()

Unnamed: 0,Opening Rank,Closing Rank
count,10234.0,10234.0
mean,11801.206859,14611.250049
std,29086.56179,38867.614505
min,1.0,1.0
25%,1457.75,1754.5
50%,4386.0,5297.0
75%,11767.5,14064.5
max,834722.0,953240.0


In [43]:
# Write the cleaned, integer-rank table to disk; the path is resolved relative
# to the current working directory.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if downstream readers do not expect it -- confirm.
df.to_csv(r'../data/josaa.csv')

In [40]:
# Renumber the set-aside rows 0..n-1. After this, prepdf's labels no longer
# correspond to rows of df.
# NOTE(review): this cell's execution count (40) is lower than that of the
# cells placed before it (42, 43), so the saved outputs were not produced in
# top-to-bottom order.
prepdf = prepdf.reset_index(drop=True)

In [41]:
# Display the set-aside rows with their renumbered index.
prepdf

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
0,Indian Institute of Technology Bhubaneswar,"Electrical Engineering (4 Years, Bachelor of T...",AI,OPEN (PwD),Gender-Neutral,150P,150P
1,Indian Institute of Technology Bhubaneswar,Electronics and Communication Engineering (4 Y...,AI,OPEN (PwD),Female-only (including Supernumerary),202P,202P
2,Indian Institute of Technology Bhubaneswar,Electronics and Communication Engineering (4 Y...,AI,OBC-NCL (PwD),Gender-Neutral,21P,21P
3,Indian Institute of Technology Bhubaneswar,Metallurgical and Materials Engineering (5 Yea...,AI,ST,Female-only (including Supernumerary),101P,101P
4,Indian Institute of Technology Bombay,"Aerospace Engineering (4 Years, Bachelor of Te...",AI,OBC-NCL (PwD),Gender-Neutral,74P,74P
...,...,...,...,...,...,...,...
126,Indian Institute of Technology Dharwad,"Electrical Engineering (4 Years, Bachelor of T...",AI,OPEN (PwD),Gender-Neutral,104P,104P
127,Indian Institute of Technology Dharwad,"Interdisciplinary Sciences (5 Years, Bachelor ...",AI,ST,Female-only (including Supernumerary),638P,638P
128,Indian Institute of Technology Dharwad,"Mathematics and Computing (4 Years, Bachelor o...",AI,OPEN (PwD),Gender-Neutral,155P,155P
129,National Institute of Technology Sikkim,"Civil Engineering (4 Years, Bachelor of Techno...",HS,OPEN,Gender-Neutral,573994,1005918.0


In [65]:
# Inspect the Opening Rank values of the set-aside rows.
# NOTE(review): the saved output reports 119 rows, while the previous display
# of prepdf shows labels up to 129 -- it looks as if prepdf was filtered by
# cells that are no longer in the notebook; confirm and restore that step.
prepdf['Opening Rank']

0      150P
1      202P
2       21P
3      101P
4       74P
       ... 
114    662P
115    316P
116    104P
117    638P
118    155P
Name: Opening Rank, Length: 119, dtype: object

In [66]:
# Display the set-aside rows again.
prepdf

Unnamed: 0,Institute,Branch,Quota,Category,Gender,Opening Rank,Closing Rank
0,Indian Institute of Technology Bhubaneswar,"Electrical Engineering (4 Years, Bachelor of T...",AI,OPEN (PwD),Gender-Neutral,150P,150P
1,Indian Institute of Technology Bhubaneswar,Electronics and Communication Engineering (4 Y...,AI,OPEN (PwD),Female-only (including Supernumerary),202P,202P
2,Indian Institute of Technology Bhubaneswar,Electronics and Communication Engineering (4 Y...,AI,OBC-NCL (PwD),Gender-Neutral,21P,21P
3,Indian Institute of Technology Bhubaneswar,Metallurgical and Materials Engineering (5 Yea...,AI,ST,Female-only (including Supernumerary),101P,101P
4,Indian Institute of Technology Bombay,"Aerospace Engineering (4 Years, Bachelor of Te...",AI,OBC-NCL (PwD),Gender-Neutral,74P,74P
...,...,...,...,...,...,...,...
114,Indian Institute of Technology Jammu,"Materials Engineering (4 Years, Bachelor of Te...",AI,ST,Female-only (including Supernumerary),662P,662P
115,Indian Institute of Technology Jammu,"Mechanical Engineering (4 Years, Bachelor of T...",AI,ST,Female-only (including Supernumerary),316P,316P
116,Indian Institute of Technology Dharwad,"Electrical Engineering (4 Years, Bachelor of T...",AI,OPEN (PwD),Gender-Neutral,104P,104P
117,Indian Institute of Technology Dharwad,"Interdisciplinary Sciences (5 Years, Bachelor ...",AI,ST,Female-only (including Supernumerary),638P,638P
