# Get the links to all constellation wiki pages from https://en.wikipedia.org/wiki/Lists_of_stars_by_constellation

In [None]:
import requests
from pandas.io.html import read_html
import pandas as pd
import re
import numpy as np
# import only the html module from lxml
from lxml import html 
from urllib.error import HTTPError

# Column labels of the output table. The same strings are also used as the
# row labels looked up in each star's infobox, so they must stay exactly as
# written (the \xa0 escapes are non-breaking spaces -- presumably to match
# the infobox labels verbatim; verify against the scraped pages).
final = [
    "Name",
    "Constellation",
    # Position and brightness
    "Right ascension",
    "Declination",
    "Apparent\xa0magnitude\xa0(V)",
    # Spectral characteristics
    "Spectral\xa0type",
    "U−B color index",
    "B−V color index",
    "V−R color index",
    "R−I color index",
    "Variable\xa0type",
    # Astrometry
    "Radial velocity (Rv)",
    "Proper motion (μ)",
    "Parallax (π)",
    "Distance",
    "Absolute\xa0magnitude\xa0(MV)",
    # Physical details
    "Mass",
    "Radius",
    "Luminosity",
    "Surface gravity (log\xa0g)",
    "Temperature",
    "Metallicity [Fe/H]",
    "Rotational velocity (v\xa0sin\xa0i)",
    "Age",
    # Orbit
    "Period (P)",
    "Semi-major axis (a)",
    "Eccentricity (e)",
    "Inclination (i)",
    "Longitude of the node (Ω)",
    "Periastron epoch (T)",
    "Argument of periastron (ω)(secondary)",
]

# Index page that links to one star-list page per constellation.
url = "https://en.wikipedia.org/wiki/Lists_of_stars_by_constellation"



def get_constellation_links(url):
    """Collect absolute URLs of the per-constellation star-list pages.

    Downloads ``url`` and, for each of the first four ``td`` cells in the
    first row of every table on the page, gathers the anchors found under
    ``ul/li``.

    Args:
        url: Address of the index page to scrape.

    Returns:
        list[str]: Absolute URLs, grouped column by column (cells 1 to 4).

    Raises:
        requests.RequestException: If the download fails or exceeds the
            timeout.
    """
    # Local import keeps this function self-contained within the notebook.
    from urllib.parse import urljoin

    # A timeout stops a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    source_code = html.fromstring(response.content)

    const_links = []
    for column in range(1, 5):
        column_path = '//table/tbody/tr[1]/td[' + str(column) + ']/ul/li/a'
        for anchor in source_code.xpath(column_path):
            href = anchor.get('href')
            if not href:
                # An anchor without a target would otherwise raise TypeError
                # when joined to the site root.
                continue
            # urljoin yields the same result as plain concatenation for
            # site-relative hrefs and also copes with absolute ones.
            const_links.append(urljoin("https://en.wikipedia.org", href))

    return const_links







# Each star has a unique HIP ID under the HIP column. The position of the HIP column varies from page to page, so the code below determines its column number.

In [None]:
def get_HIP_col_no(const):
    """Return the 1-based position of the ``HIP`` header cell on a page.

    Downloads ``const`` and walks, in document order, every ``th`` element
    found inside tables directly under the ``mw-content-text`` container.

    Args:
        const: URL of a constellation star-list page.

    Returns:
        The 1-based count of header cells up to and including the first one
        whose stripped text is exactly ``HIP``, or ``None`` when no such
        header cell exists.
    """
    header_xpath = "//div[@id='mw-content-text']/div/table//th"
    page = requests.get(const)
    document = html.fromstring(page.content)
    header_cells = document.xpath(header_xpath)

    for position, header_cell in enumerate(header_cells, start=1):
        if header_cell.text_content().strip() == 'HIP':
            return position
    return None

# Iterate over the constellation links to collect the wiki page links of the stars in each constellation, along with each star's unique HIP ID

In [None]:
def get_star_links(const_links):
    """Collect star page links, page titles and HIP ids from constellation pages.

    For every constellation page the function looks at each table row
    position and takes the first anchor found in a ``td`` cell at that
    position as the star link. The text of the cell in the HIP column (as
    located by ``get_HIP_col_no``) is stored under the anchor's ``title``
    attribute.

    Args:
        const_links: Iterable of constellation page URLs.

    Returns:
        tuple: ``(star_links, title_page, title_HIP)`` where

        * ``star_links`` is a list of the ``href`` values of the star anchors
          (site-relative, as found in the page),
        * ``title_page`` is a list holding the heading of each constellation
          page, one entry per page,
        * ``title_HIP`` maps an anchor's ``title`` attribute to the text of
          the HIP cell at the same row position.

    Raises:
        requests.RequestException: If a download fails or exceeds the
            timeout.
    """
    title_page = []
    star_links = []
    title_HIP = {}

    for const in const_links:
        col_no = get_HIP_col_no(const)
        # A timeout stops a stalled connection from hanging the whole run.
        response = requests.get(const, timeout=30)
        source_code = html.fromstring(response.content)

        # Record the page heading once per page. It used to be appended once
        # per table row, bloating the list with thousands of duplicates.
        headings = source_code.xpath("//*[@class='firstHeading']")
        if headings:
            title_page.append(headings[0].text_content())

        row_count = len(source_code.xpath('//table/tbody/tr'))
        # XPath positions are 1-based: position 0 never matches, and stopping
        # at row_count - 1 would skip the final row position.
        for row in range(1, row_count + 1):
            row_path = '//table/tbody/tr[' + str(row) + ']'
            anchors = source_code.xpath(row_path + '/td/a[1]')
            if not anchors:
                continue

            href = anchors[0].get('href')
            if not href:
                # Without a target there is no page to visit later.
                continue
            star_links.append(href)

            name = anchors[0].get('title')
            print(name)

            if col_no is None:
                # No HIP header on this page, so there is no cell to read.
                continue
            hip_cells = source_code.xpath(row_path + '/td[' + str(col_no) + ']')
            if hip_cells:
                title_HIP[name] = hip_cells[0].text_content()

    return star_links, title_page, title_HIP




# Cell description: calling the functions defined above to get the data

In [None]:
# Step 1: constellation page URLs taken from the index page.
constellation_wiki = get_constellation_links(url)

# Step 2: star hrefs, constellation page titles and the title -> HIP mapping.
stars_wiki, page_titles, hip = get_star_links(constellation_wiki)

# Creating absolute links from the hrefs obtained by the functions above

In [None]:
# Prefix every site-relative href with the Wikipedia host.
stars = ["https://en.wikipedia.org" + href for href in stars_wiki]
stars

# Dividing the list of star links into samples

In [None]:
len(stars)  # notebook display: total number of star links collected

In [None]:
Sample1=stars[0:500]  # first batch; ends at 500 so it meets Sample2 (which starts at index 500) with no gap

In [None]:
Sample2=stars[500:1200]  # star links at indices 500-1199

In [None]:
Sample3=stars[1200:2000]  # indices 1200-1999

In [None]:
Sample4=stars[2000:2800]  # indices 2000-2799

In [None]:
Sample5=stars[2800:3600]  # indices 2800-3599

In [None]:
Sample6=stars[3600:4400]  # indices 3600-4399

In [None]:
Sample7=stars[4400:5200]  # indices 4400-5199

In [None]:
Sample8=stars[5200:6000]  # indices 5200-5999

In [None]:
Sample9=stars[6000:6800]  # indices 6000-6799

In [None]:
Sample10=stars[6800:7600]  # indices 6800-7599

In [None]:
Sample11=stars[7600:8400]  # indices 7600-8399

In [None]:
Sample12=stars[8400:]  # everything from index 8400 to the end of the list
len(Sample12)  # notebook display: size of the final batch

# Creating Blank Data Frame

In [None]:
# Empty template table: no rows yet, one column per label in `final`.
df = pd.DataFrame(data=None, columns=final)
df

# Code to get star details from each link

In [None]:
def _infobox_value(infobox, label):
    """Return the first value stored under ``label`` in ``infobox`` as text.

    Returns an empty string when the label is absent, the row is empty, or
    the stored value is not a string (for example NaN).
    """
    try:
        cell = infobox.xs(label).values[0]
    except (KeyError, IndexError):
        return ""
    if isinstance(cell, np.ndarray):
        # A repeated label makes ``xs`` return several rows, so ``values[0]``
        # is a whole row; take its first entry, if it has one.
        if len(cell) == 0:
            return ""
        cell = cell[0]
    if not isinstance(cell, str):
        return ""
    # Round-trip through UTF-8, dropping anything that cannot be encoded.
    return cell.encode("utf-8", "ignore").decode()


def get_star_details(stars,df):
    """Scrape the infobox of each star page into rows appended to ``df``.

    For every URL the page is downloaded once. Its first heading becomes the
    ``Name`` value, and every other label in the module-level ``final`` list
    is looked up in the first table with class ``infobox``. Non-breaking
    spaces are replaced by plain spaces and ``[n]`` reference markers are
    removed from all values.

    Pages are skipped when the download fails, the response status is not
    OK, no heading is found, the heading is one of the module-level
    ``page_titles``, or no infobox table can be parsed.

    Args:
        stars: Iterable of star page URLs.
        df: DataFrame whose columns correspond one-to-one to ``final``. It is
            not modified.

    Returns:
        pandas.DataFrame: ``df`` itself when nothing was scraped; otherwise a
        new frame holding the rows of ``df`` followed by one row per scraped
        page, with a fresh 0..n-1 index.
    """
    # Local import keeps this function self-contained within the notebook.
    import io

    # Build the lookup set once instead of scanning a list for every page.
    skip_titles = set(page_titles)
    rows = []

    for links in stars:
        try:
            # A timeout stops one stalled page from hanging the whole batch.
            response = requests.get(links, timeout=30)
        except requests.RequestException as exc:
            print("Skipping " + str(links) + ": " + str(exc))
            continue
        if not response.ok:
            # Error responses were previously skipped via HTTPError raised by
            # the second download inside read_html; keep skipping them.
            print("Skipping " + str(links) + ": HTTP " + str(response.status_code))
            continue

        source_code = html.fromstring(response.content)
        # Jump to the page heading element.
        headings = source_code.xpath("//*[@class='firstHeading']")
        if not headings:
            continue
        title = headings[0].text_content()
        print(title)
        if title in skip_titles:
            continue

        try:
            # Parse the HTML already downloaded rather than fetching the same
            # page a second time by URL.
            infoboxes = read_html(io.StringIO(response.text), index_col=0,
                                  attrs={"class": "infobox"})
        except ValueError:
            # read_html raises ValueError when no matching table is found.
            continue

        values = [_infobox_value(infoboxes[0], label) for label in final]
        # The first column is the star name, taken from the page heading.
        values[0] = title
        rows.append([re.sub(r'\[\d+\]', '', value.replace('\xa0', ' '))
                     for value in values])

    if not rows:
        return df

    # Build all rows in one go: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    scraped = pd.DataFrame(rows, columns=df.columns)
    if df.empty:
        return scraped
    return pd.concat([df, scraped], ignore_index=True)
    

# Getting star details for each sample

In [None]:
stars_data_1=get_star_details(Sample1,df)  # scrape batch 1; df is the blank template frame


In [None]:
stars_data_2=get_star_details(Sample2,df)  # scrape batch 2

In [None]:
stars_data_3=get_star_details(Sample3,df)  # scrape batch 3

In [None]:
stars_data_4=get_star_details(Sample4,df)  # scrape batch 4

In [None]:
stars_data_5=get_star_details(Sample5,df)  # scrape batch 5

In [None]:
stars_data_6=get_star_details(Sample6,df)  # scrape batch 6

In [None]:
stars_data_7=get_star_details(Sample7,df)  # scrape batch 7

In [None]:
stars_data_8=get_star_details(Sample8,df)  # scrape batch 8

In [None]:
stars_data_9=get_star_details(Sample9,df)  # scrape batch 9

In [None]:
stars_data_10=get_star_details(Sample10,df)  # scrape batch 10

In [None]:
stars_data_11=get_star_details(Sample11,df)  # scrape batch 11

In [None]:
stars_data_12=get_star_details(Sample12,df)  # scrape batch 12

# Combining the frames into a list of frames

In [None]:
# One scraped DataFrame per sample batch, kept in batch order.
frames = [
    stars_data_1,
    stars_data_2,
    stars_data_3,
    stars_data_4,
    stars_data_5,
    stars_data_6,
    stars_data_7,
    stars_data_8,
    stars_data_9,
    stars_data_10,
    stars_data_11,
    stars_data_12,
]

# Creating a final DF and putting data to excel

In [None]:
final_data_frame=pd.concat(frames,ignore_index=True)  # stack all batches; ignore_index renumbers the rows from 0

In [None]:
final_data_frame  # notebook display of the combined table

In [None]:
final_data_frame.to_excel("stars_final.xlsx")  # write the combined table to an Excel workbook