### importing important packages that we needed

In [1]:
import requests
from bs4 import BeautifulSoup

#### fetching url by request

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_population'
response = requests.get(url)
print(response)

<Response [200]>


#### converting the fetched response into html_text

In [3]:
html_text = response.text

#### creating soup of data to access the data by html tags and classes

In [4]:
soup = BeautifulSoup(html_text , 'lxml')

#### Now with help of soup we can access the html tags by using find or find_all method

In [5]:
data = soup.find_all('table')[1]

In [6]:
# here we got our table we can check it by printing data this will contains all html data of the 
# table we want 

In [7]:
raw_headers = data.find_all('th')[1:]    # here we accsing the headers of table        

In [8]:
cleaned_headers = []
for header in raw_headers:
    cleaned_headers.append(header.text.strip())
    
print(cleaned_headers)

['Rank', 'State or Union Territory', 'Population[18][19]', '%India', 'Growth', 'Rural pop.', '%rural', 'Urban pop.', '%urban', 'Density[a]', 'Sex ratio', 'Lok Sabha seats', 'Rajya Sabha seats']


### now its time to get information of table 

In [9]:
raw_info = data.find_all('tr')[2:]    # index 2 because the data starts from the 3 tr in table 

In [10]:
clean_info = []
for row in raw_info:
    row_data = row.find_all('td')
    individual_info = [data.text.strip() for data in row_data]
    individual_info[0] = (individual_info[0].split('('))[0]          # this line is to remove '1(s1)'
    clean_info.append(individual_info)                               #    s1 from rank of all states

In [11]:
print(clean_info)

[['1 ', 'Uttar Pradesh', '199,812,341', '16.51%', '20.2%', '155,317,278', '77.73%', '44,495,063', '22.27%', '828', '912', '80', '31'], ['2 ', 'Maharashtra', '112,374,333', '9.28%', '16%', '61,556,074', '54.78%', '50,818,259', '45.22%', '365', '929', '48', '19'], ['3 ', 'Bihar', '104,099,452', '8.6%', '25.4%', '92,341,436', '88.71%', '11,758,016', '11.29%', '1102', '918', '40', '16'], ['4 ', 'West Bengal', '91,276,115', '7.54%', '13.8%', '62,183,113', '68.13%', '29,093,002', '31.87%', '1029', '953', '42', '16'], ['5 ', 'Madhya Pradesh', '72,626,809', '6%', '20.3%', '52,557,404', '72.37%', '20,069,405', '27.63%', '236', '931', '29', '11'], ['6 ', 'Tamil Nadu', '72,147,030', '5.96%', '15.6%', '37,229,590', '51.6%', '34,917,440', '48.4%', '555', '996', '39', '18'], ['7 ', 'Rajasthan', '68,548,437', '5.66%', '21.3%', '51,500,352', '75.13%', '17,048,085', '24.87%', '201', '928', '25', '10'], ['8 ', 'Karnataka', '61,095,297', '5.05%', '15.6%', '37,469,335', '61.33%', '23,625,962', '38.67%', '

#  Now we have both headers and data also Now we create pandas data frame to store our data in proper way 

In [12]:
import pandas as pd 

### creating data frame by headers 

In [13]:
df = pd.DataFrame(columns = cleaned_headers )
df

Unnamed: 0,Rank,State or Union Territory,Population[18][19],%India,Growth,Rural pop.,%rural,Urban pop.,%urban,Density[a],Sex ratio,Lok Sabha seats,Rajya Sabha seats


### now we will insert data in our data frame 

In [14]:
for row in range(len(clean_info)):
    df.loc[row] = clean_info[row]
    

In [15]:
df

Unnamed: 0,Rank,State or Union Territory,Population[18][19],%India,Growth,Rural pop.,%rural,Urban pop.,%urban,Density[a],Sex ratio,Lok Sabha seats,Rajya Sabha seats
0,1,Uttar Pradesh,199812341,16.51%,20.2%,155317278,77.73%,44495063,22.27%,828.0,912,80,31
1,2,Maharashtra,112374333,9.28%,16%,61556074,54.78%,50818259,45.22%,365.0,929,48,19
2,3,Bihar,104099452,8.6%,25.4%,92341436,88.71%,11758016,11.29%,1102.0,918,40,16
3,4,West Bengal,91276115,7.54%,13.8%,62183113,68.13%,29093002,31.87%,1029.0,953,42,16
4,5,Madhya Pradesh,72626809,6%,20.3%,52557404,72.37%,20069405,27.63%,236.0,931,29,11
5,6,Tamil Nadu,72147030,5.96%,15.6%,37229590,51.6%,34917440,48.4%,555.0,996,39,18
6,7,Rajasthan,68548437,5.66%,21.3%,51500352,75.13%,17048085,24.87%,201.0,928,25,10
7,8,Karnataka,61095297,5.05%,15.6%,37469335,61.33%,23625962,38.67%,319.0,973,28,12
8,9,Gujarat,60439692,4.99%,19.3%,34694609,57.4%,25745083,42.6%,308.0,919,26,11
9,10,Andhra Pradesh,"49,577,103[b]",4.1%,11.0%,34966693,70.53%,14610410,29.47%,303.0,993,25,11


# Here is our cleaned data and stored in pandas Data Frame 

## Now we will store it in csv file 

In [20]:
df.to_csv(r"S:\Scraped_and_cleaned_data.csv" , index = False)

In [17]:
# from the above command it will go in S drive's Scraped_and_cleaned_data and save it in csv format

In [19]:
df.to_json(r"S:\Scraped_and_cleaned_data.json" )