In [1]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textwrap import wrap
import re

In [2]:
# URLs
file_path = './data/Industry-Indices.xlsx'
tse_main_url = 'http://www.tsetmc.com/Loader.aspx?ParTree=15'
industries_operation_url = 'http://www.tsetmc.com/Loader.aspx?Partree=15131O'

## Fetching data

In [3]:
res = requests.get(url=tse_main_url)
res.text[:300]

'<!doctype html><html><head><title>.:TSETMC:. :: شرکت مدیریت فناوری بورس تهران</title><script>var LongRunnigPagesSite=\'http://cdn3.tsetmc.com\';function ens(ty,sv){var lv=localStorage.getItem("v_"+ty);var t;if (1==1 || lv!=sv){var oX=new XMLHttpRequest();oX.open(\'GET\', \'tsev2/res/loader.aspx?t=\'+ty+\'&'

In [4]:
def get_and_parse_url(url, params=None):
    """
        get a URL, grab the data, and return a BeautifulSoup object with parsed data
    """
    res = requests.get(url=url, params=params)
    soup = BeautifulSoup(markup=res.text, features='html.parser')
    return soup

In [5]:
soup = get_and_parse_url(url=industries_operation_url)
print(soup.prettify()[:500])

<!DOCTYPE doctype html>
<html>
 <head>
  <title>
   .:TSETMC:. :: برترین گروه های صنعت
  </title>
  <script>
   var LongRunnigPagesSite='http://cdn3.tsetmc.com';function ens(ty,sv){var lv=localStorage.getItem("v_"+ty);var t;if (1==1 || lv!=sv){var oX=new XMLHttpRequest();oX.open('GET', 'tsev2/res/loader.aspx?t='+ty+'&_'+sv,false);oX.send(null);t=oX.responseText;if(t[t.length-1]!=';') return;localStorage.setItem("v_"+ty,sv);localStorage.setItem("t_"+ty,t)}else{t=localStorage.getItem("t_"+ty)}if (


## Searching for required data and preparing them

In [6]:
rows = soup.tbody.find_all(name='tr')
len(rows)

41

In [7]:
print(rows[:2])

[<tr><td>شاخص صنعت</td>
<td><div class="ltr" title="7,650,169,488,403,786">7,650,169.488 B</div></td>
<td>236,046</td>
<td><div class="ltr" title="2,739,895,884">2.740 B</div></td>
<td><div class="ltr" title="10,212,890,935,746">10,212.891 B</div></td>
</tr>, <tr><td>34-خودرو</td>
<td><div class="ltr" title="269,025,262,403,296">269,025.262 B</div></td>
<td>62,425</td>
<td><div class="ltr" title="1,398,654,660">1.399 B</div></td>
<td><div class="ltr" title="2,610,194,121,728">2,610.194 B</div></td>
</tr>]


In [8]:
rows = [r.find_all('td') for r in rows]

In [9]:
# col1: Group
# col2: Market-Value
# col3: Transactions-Number
# col4: Transactions-Volume
# col5: Transactions-Value
rows[1]

[<td>34-خودرو</td>,
 <td><div class="ltr" title="269,025,262,403,296">269,025.262 B</div></td>,
 <td>62,425</td>,
 <td><div class="ltr" title="1,398,654,660">1.399 B</div></td>,
 <td><div class="ltr" title="2,610,194,121,728">2,610.194 B</div></td>]

In [10]:
# get cells string and save them in a list
values = []
for row in rows:
    for col in row:
        values.append(col.string)
        
values[:10]

['شاخص صنعت',
 '7,650,169.488 B',
 '236,046',
 '2.740 B',
 '10,212.891 B',
 '34-خودرو',
 '269,025.262 B',
 '62,425',
 '1.399 B',
 '2,610.194 B']

In [11]:
# split the values into sized 5 chunks to represent each row in a list item
values = [values[i:i+5] for i in range(0, len(values), 5)]
values[:2]

[['شاخص صنعت', '7,650,169.488 B', '236,046', '2.740 B', '10,212.891 B'],
 ['34-خودرو', '269,025.262 B', '62,425', '1.399 B', '2,610.194 B']]

In [12]:
# remove the ',', 'B' and 'M' from the recieved string and convert it to a float number
def purify_number(number):
    number = str(number)
    number = number.split(',')
    number = ''.join(number)
    number = number.strip()

    if 'B' in number:
        number = number.strip('B')
        number = float(number) * 1000000000
    elif 'M' in number:
        number = number.strip('M')
        number = float(number) * 1000000
    else:
        number = float(number)

    return number

In [13]:
a = purify_number('   141.383 B    ')
a

141383000000.0

In [14]:
# cleaning all the numbers
for i in range(len(values)):
    for j in range(1, len(values[0])):
        values[i][j] = purify_number(values[i][j])

In [15]:
# now data is clean and ready to save
values[:3]

[['شاخص صنعت', 7650169488000000.0, 236046.0, 2740000000.0, 10212891000000.0],
 ['34-خودرو', 269025262000000.0, 62425.0, 1399000000.0, 2610194000000.0],
 ['57-بانكها', 644373421000000.0, 50274.0, 1447000000.0, 1194232000000.0]]

## Saving the clean data into excel file

In [16]:
# read the original excel file
df_main = pd.read_excel(file_path)
df_main.head()

Unnamed: 0,CDate,JDate,GroupNo,GroupName,MarketValue,TransactionsCount,TransactionsVol,TransactionsValue


In [17]:
# find the group number using regex
def parse_group_no(text):
    if re.search('\d+', text):
        group_no = re.findall('\d+', text)[0]
    else:
        group_no = '0'
        
    return group_no

**_Set date manually here_**

In [18]:
# prepare a dict of values for creating a DataFrame
CDate = '20190603' * len(values)
CDate = wrap(text=CDate, width=8)
JDate = '13980313' * len(values)
JDate = wrap(text=JDate, width=8)
data = {
    'CDate': CDate,
    'JDate': JDate,
    'GroupNo': [parse_group_no(values[i][0]) for i in range(len(values))],
    'GroupName': [(values[i][0]).encode('utf-8') for i in range(len(values))],
    'MarketValue': [values[i][1] for i in range(len(values))],
    'TransactionsCount': [values[i][2] for i in range(len(values))],
    'TransactionsVol': [values[i][3] for i in range(len(values))],
    'TransactionsValue': [values[i][4] for i in range(len(values))]
}

In [19]:
data

{'CDate': ['20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603',
  '20190603'],
 'JDate': ['13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
  '13980313',
 

In [20]:
# create new DataFrame with recently fetched data
df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,CDate,JDate,GroupNo,GroupName,MarketValue,TransactionsCount,TransactionsVol,TransactionsValue
0,20190603,13980313,0,b'\xd8\xb4\xd8\xa7\xd8\xae\xd8\xb5 \xd8\xb5\xd...,7650169000000000.0,236046.0,2740000000.0,10212890000000.0
1,20190603,13980313,34,b'34-\xd8\xae\xd9\x88\xd8\xaf\xd8\xb1\xd9\x88',269025300000000.0,62425.0,1399000000.0,2610194000000.0
2,20190603,13980313,57,b'57-\xd8\xa8\xd8\xa7\xd9\x86\xd9\x83\xd9\x87\...,644373400000000.0,50274.0,1447000000.0,1194232000000.0
3,20190603,13980313,44,b'44-\xd8\xb4\xd9\x8a\xd9\x85\xd9\x8a\xd8\xa7\...,1900527000000000.0,20390.0,172604000.0,989110000000.0
4,20190603,13980313,27,b'27-\xd9\x81\xd9\x84\xd8\xb2\xd8\xa7\xd8\xaa ...,1389537000000000.0,15898.0,112454000.0,729291000000.0


In [21]:
# read the original excel file
df_main = pd.read_excel(file_path)
df_main.head()

Unnamed: 0,CDate,JDate,GroupNo,GroupName,MarketValue,TransactionsCount,TransactionsVol,TransactionsValue


In [22]:
# combine recently created DataFrame and the original one
df1 = pd.concat([df_main, df], ignore_index=True, sort=False)
df1.head()

Unnamed: 0,CDate,JDate,GroupNo,GroupName,MarketValue,TransactionsCount,TransactionsVol,TransactionsValue
0,20190603,13980313,0,b'\xd8\xb4\xd8\xa7\xd8\xae\xd8\xb5 \xd8\xb5\xd...,7650169000000000.0,236046.0,2740000000.0,10212890000000.0
1,20190603,13980313,34,b'34-\xd8\xae\xd9\x88\xd8\xaf\xd8\xb1\xd9\x88',269025300000000.0,62425.0,1399000000.0,2610194000000.0
2,20190603,13980313,57,b'57-\xd8\xa8\xd8\xa7\xd9\x86\xd9\x83\xd9\x87\...,644373400000000.0,50274.0,1447000000.0,1194232000000.0
3,20190603,13980313,44,b'44-\xd8\xb4\xd9\x8a\xd9\x85\xd9\x8a\xd8\xa7\...,1900527000000000.0,20390.0,172604000.0,989110000000.0
4,20190603,13980313,27,b'27-\xd9\x81\xd9\x84\xd8\xb2\xd8\xa7\xd8\xaa ...,1389537000000000.0,15898.0,112454000.0,729291000000.0


In [23]:
# Save the output to excel file
df1.to_excel(excel_writer=file_path, index=False)