In [1]:
import feedparser
import pandas as pd
from datetime import date, timedelta
from sqlalchemy import create_engine

# SQLite database queried by the pd.read_sql cells below (absolute, machine-specific path).
engine = create_engine("sqlite:///c:\\ruby\\portlt\\db\\development.sqlite3")
# Connection reused by every pd.read_sql call; none of the visible cells closes it.
conlt = engine.connect()

# Output folders: the CSV exports are written to the raw/clean folders and
# mirrored to the three user-profile folders below.
clean_path = "../data/clean/"
raw_path = "../data/raw/"
csv_path = "\\Users\\User\\iCloudDrive\\"
box_path = "\\Users\\User\\Dropbox\\"
one_path = "\\Users\\User\\OneDrive\\Documents\\Data\\"
pdf_path = "../PDF/"

# Show long cell values (the links) and never truncate rows in displayed frames.
pd.set_option('display.max_colwidth', 255)
pd.set_option('display.max_rows',None)
# RSS feed that is parsed into the F45 records.
url = "https://feeds.feedburner.com/Setorth-form45-en"

today = date.today()
# Year stamped on every feed record; a fixed value, not derived from `today`.
year = 2022
# Month/day suffix used in the exported CSV file names.
mmdd_str = today.strftime('%m%d')
mmdd_str

'0514'

In [2]:
# Manual override: pin the run date so that mmdd_str (the suffix of the
# exported CSV file names) becomes '0513' regardless of the actual date.
today = date(2022, 5, 13)
mmdd_str = today.strftime('%m%d')
mmdd_str

'0513'

In [3]:
# Download and parse the F45 feed, then report how many entries it contains.
rss_source = feedparser.parse(url)
entries = rss_source.entries
f45_number = len(entries)
print("Number of F45: ", f45_number)

Number of F45:  240


In [4]:
# Build one record per feed entry: name, year, quarter, link and publication
# text. The key order below fixes the column order of the DataFrame built later.
f45_items = []

# Iterate over the entries themselves rather than range(f45_number), so the
# loop cannot go out of sync with a count left over from an earlier run.
for x, f45_content in enumerate(rss_source.entries):
    # The first space-separated token of the title is the name.
    title_ary = f45_content.title.partition(' ')
    # The quarter is the character right before ' (F45)'. Slicing with [-1:]
    # yields '' instead of raising IndexError when the title has no text after
    # the name; the later int conversion of `quarter` still surfaces such rows.
    qtr_ary = title_ary[2].partition(' (F45)')
    f45_item = {
        'name': title_ary[0].strip(),
        'year': year,
        'quarter': qtr_ary[0][-1:],
        'link': f45_content.link,
        'published': f45_content.published,
    }

    print("\n----------------------------------\n")
    print("F45: " + str(x))
    print("Title: ", f45_item['name'])
    print("Year: ", f45_item['year'])
    print("Quarter: ", f45_item['quarter'])
    print("Link: ", f45_item['link'])
    print("Published: ", f45_item['published'])
    f45_items.append(f45_item)


----------------------------------

F45: 0
Title:  FPI
Year:  2022
Quarter:  1
Link:  https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982171260
Published:  Fri, 13 May 2022 22:12:53 +0700

----------------------------------

F45: 1
Title:  SEAOIL
Year:  2022
Quarter:  1
Link:  https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170940
Published:  Fri, 13 May 2022 21:55:03 +0700

----------------------------------

F45: 2
Title:  SENA
Year:  2022
Quarter:  1
Link:  https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170660
Published:  Fri, 13 May 2022 21:48:56 +0700

----------------------------------

F45: 3
Title:  NSL
Year:  2022
Quarter:  1
Link:  https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170410
Published:  Fri, 13 May 2022 21:42:52 +0700

----------------------------------

F45: 4
Title:  A5
Year:  2022
Quarter:  1
Link:  https://classic.set.or.th/

In [5]:
# Turn the parsed records into a frame and preview it without the link column.
df = pd.DataFrame(data=f45_items)
df.loc[:, ['name','year','quarter','published']]

Unnamed: 0,name,year,quarter,published
0,FPI,2022,1,"Fri, 13 May 2022 22:12:53 +0700"
1,SEAOIL,2022,1,"Fri, 13 May 2022 21:55:03 +0700"
2,SENA,2022,1,"Fri, 13 May 2022 21:48:56 +0700"
3,NSL,2022,1,"Fri, 13 May 2022 21:42:52 +0700"
4,A5,2022,1,"Fri, 13 May 2022 21:35:45 +0700"
5,SENA,2022,1,"Fri, 13 May 2022 21:35:32 +0700"
6,CHO,2022,1,"Fri, 13 May 2022 21:33:58 +0700"
7,TRC,2022,1,"Fri, 13 May 2022 21:30:43 +0700"
8,PSH,2022,1,"Fri, 13 May 2022 21:25:02 +0700"
9,LEO,2022,1,"Fri, 13 May 2022 21:19:06 +0700"


In [6]:
# Inspect column types; `quarter` is still text here and is cast to int in a later cell.
df.dtypes

name         object
year          int64
quarter      object
link         object
published    object
dtype: object

In [37]:
# Rewrite quarter values equal to 'S' as '1', then count rows per (year, quarter).
# NOTE(review): this cell carries execution count 37 although it sits between
# cells 6 and 7, so it was last run out of order. What 'S' stands for is not
# visible here; confirm the mapping to quarter 1.
df.loc[(df.quarter == 'S') ,'quarter'] = '1'
df.groupby(['year','quarter']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,link,published
year,quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022,1,212,212,212
2022,2,4,4,4
2022,3,1,1,1


In [7]:
# Convert the quarter column from text to integers, then count rows per (year, quarter).
df['quarter'] = df['quarter'].astype(int)
df.groupby(by=['year','quarter']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,link,published
year,quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022,1,234,234,234
2022,2,5,5,5
2022,3,1,1,1


In [8]:
# Row/column count before duplicate names are dropped.
df.shape

(240, 5)

In [9]:
# Keep only the first row for each name. Per the original note, the first row
# is the latest published file (presumably the feed lists newest entries
# first; confirm).
df = df.drop_duplicates(subset=['name'], keep='first')
df.shape

(220, 5)

In [10]:
# Write the de-duplicated feed records (without links) to every output folder.
file_name = f'F45-RAW-{mmdd_str}.csv'
raw_file = raw_path + file_name
output_file = csv_path + file_name
box_file = box_path + file_name
one_file = one_path + file_name

raw_export = df[['name','year','quarter','published']]
# Same destinations, in the same order, as the original four to_csv calls.
for target in (output_file, box_file, one_file, raw_file):
    raw_export.to_csv(target, header=True, index=False, sep=',')

In [11]:
# Load the whole exempts table, ordered by name, and show its row count.
sql = '''
SELECT *
FROM exempts
ORDER BY name'''
df_exempts = pd.read_sql(sql, con=conlt)
len(df_exempts)

405

In [12]:
# Outer-join feed records with the exempts table on name; the `_merge`
# indicator column records which side(s) each row came from.
df_merge = df.merge(df_exempts, on='name', how='outer', indicator=True)
df_merge.shape

(520, 7)

### Tickers that will not be entered (present in the exempts table)

In [13]:
# Feed records whose name also appears in the exempts table ('both').
# The integer casts are chained with .astype() so a new frame is produced,
# instead of assigning through attributes on a .loc selection (which can raise
# SettingWithCopyWarning).
in_exempts = df_merge.loc[
    df_merge['_merge'] == 'both',
    ['name','year','quarter','published','link']
].astype({'year': int, 'quarter': int})
# `published` holds text such as "Fri, 13 May 2022 22:12:53 +0700". Sorting the
# raw strings would order rows by weekday name, so sort on the parsed timestamp.
in_exempts.sort_values(
    by='published',
    ascending=False,
    key=lambda col: pd.to_datetime(col, utc=True),
)

Unnamed: 0,name,year,quarter,published,link
0,FPI,2022,1,"Fri, 13 May 2022 22:12:53 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982171260
1,SEAOIL,2022,1,"Fri, 13 May 2022 21:55:03 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170940
5,CHO,2022,1,"Fri, 13 May 2022 21:33:58 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170100
6,TRC,2022,1,"Fri, 13 May 2022 21:30:43 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982169950
9,AU,2022,1,"Fri, 13 May 2022 21:18:56 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982169170
10,META,2022,1,"Fri, 13 May 2022 21:18:27 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982169110
15,ERW,2022,1,"Fri, 13 May 2022 20:41:23 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982166230
20,BJCHI,2022,1,"Fri, 13 May 2022 20:30:28 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982165550
21,TVD,2022,1,"Fri, 13 May 2022 20:25:21 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982165110
24,CGD,2022,1,"Fri, 13 May 2022 20:03:25 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982163560


In [14]:
# Number of feed records that are in the exempts table (row order does not affect the count).
len(in_exempts)

105

In [15]:
# Feed records with no match in the exempts table ('left_only').
# The integer casts are chained with .astype() so a new frame is produced,
# instead of assigning through attributes on a .loc selection (which can raise
# SettingWithCopyWarning).
df_out = df_merge.loc[
    df_merge['_merge'] == 'left_only',
    ['name','year','quarter','published','link']
].astype({'year': int, 'quarter': int})
# `published` is text ("Fri, 13 May 2022 ..."), so sort on the parsed timestamp
# rather than on the string, which would order rows by weekday name.
df_out.sort_values(
    by='published',
    ascending=False,
    key=lambda col: pd.to_datetime(col, utc=True),
)

Unnamed: 0,name,year,quarter,published,link
2,SENA,2022,1,"Fri, 13 May 2022 21:48:56 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170660
3,NSL,2022,1,"Fri, 13 May 2022 21:42:52 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170410
4,A5,2022,1,"Fri, 13 May 2022 21:35:45 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170210
7,PSH,2022,1,"Fri, 13 May 2022 21:25:02 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982169770
8,LEO,2022,1,"Fri, 13 May 2022 21:19:06 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982169190
11,HYDRO,2022,1,"Fri, 13 May 2022 21:12:56 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982168390
12,MORE,2022,1,"Fri, 13 May 2022 21:03:16 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982167870
13,BLA,2022,1,"Fri, 13 May 2022 20:48:47 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982166680
14,PTG,2022,1,"Fri, 13 May 2022 20:44:29 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982166490
16,APP,2022,1,"Fri, 13 May 2022 20:39:09 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982166060


In [16]:
# Disabled manual exclusion of the "SCC" row, kept for reference.
#df_out = df_out.drop(df_out.index[df_out['name'] == "SCC"])
# Number of feed records that are not in the exempts table.
df_out.shape[0]

115

In [17]:
# Load the whole tickers table, ordered by name.
sql = '''
SELECT *
FROM tickers
ORDER BY name'''
df_tickers = pd.read_sql(sql, con=conlt)
df_tickers.shape

(403, 9)

In [18]:
# Outer-join the non-exempt feed records with the tickers table on name.
df_merge2 = df_out.merge(df_tickers, on='name', how='outer', indicator=True)
df_merge2.shape

(467, 14)

### F45 entries with no matching ticker record

In [19]:
# Feed records that have no row in the tickers table.
no_ticker = df_merge2['_merge'] == 'left_only'
df_merge2.loc[no_ticker, ['name','year','quarter','published','link']]

Unnamed: 0,name,year,quarter,published,link
1,NSL,2022.0,1.0,"Fri, 13 May 2022 21:42:52 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170410
2,A5,2022.0,1.0,"Fri, 13 May 2022 21:35:45 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170210
4,LEO,2022.0,1.0,"Fri, 13 May 2022 21:19:06 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982169190
5,HYDRO,2022.0,1.0,"Fri, 13 May 2022 21:12:56 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982168390
6,MORE,2022.0,1.0,"Fri, 13 May 2022 21:03:16 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982167870
9,APP,2022.0,1.0,"Fri, 13 May 2022 20:39:09 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982166060
10,SMD,2022.0,1.0,"Fri, 13 May 2022 20:38:21 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982166000
12,AIMCG,2022.0,1.0,"Fri, 13 May 2022 20:32:10 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982165730
14,TNPF,2022.0,1.0,"Fri, 13 May 2022 20:13:46 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982164310
16,JDF,2022.0,1.0,"Fri, 13 May 2022 19:57:27 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982163170


In [20]:
# Size of the "no ticker record" selection, including the ticker-side columns.
no_ticker = df_merge2['_merge'] == 'left_only'
df_merge2.loc[
    no_ticker,
    ['name','year','quarter','published','link','id','market']
].shape

(64, 7)

### F45 entries with a matching ticker record

In [21]:
# Feed records that do have a row in the tickers table, with its id and market.
has_ticker = df_merge2['_merge'] == 'both'
df_out2 = df_merge2.loc[
    has_ticker,
    ['name','year','quarter','published','link','id','market']
]
df_out2

Unnamed: 0,name,year,quarter,published,link,id,market
0,SENA,2022.0,1.0,"Fri, 13 May 2022 21:48:56 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170660,437.0,sSET
3,PSH,2022.0,1.0,"Fri, 13 May 2022 21:25:02 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982169770,377.0,SETTHSI
7,BLA,2022.0,1.0,"Fri, 13 May 2022 20:48:47 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982166680,70.0,SET100 / SETTHSI
8,PTG,2022.0,1.0,"Fri, 13 May 2022 20:44:29 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982166490,381.0,SET100 / SETHD / SETTHSI
11,AIMIRT,2022.0,1.0,"Fri, 13 May 2022 20:36:25 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982165880,669.0,SET
13,JMART,2022.0,1.0,"Fri, 13 May 2022 20:19:34 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982164640,236.0,SET100
15,S11,2022.0,1.0,"Fri, 13 May 2022 19:59:06 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982163230,412.0,sSET
17,POPF,2022.0,1.0,"Fri, 13 May 2022 19:56:41 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982163040,364.0,SET
18,VIBHA,2022.0,1.0,"Fri, 13 May 2022 19:42:27 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982161920,610.0,SETWB
22,EA,2022.0,1.0,"Fri, 13 May 2022 19:22:49 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982159440,148.0,SET50 / SETTHSI


In [22]:
# Keep only the rows that carry a year value.
df_out2 = df_out2.loc[df_out2['year'].notna()]
df_out2.shape

(51, 7)

In [23]:
# Cast year and quarter back to integers (the outer merge displayed them as
# floats). A chained .astype() returns a new frame, which avoids the
# SettingWithCopyWarning that column assignment on a filtered slice can raise.
df_out2 = df_out2.astype({'year': int, 'quarter': int})
df_out2.shape

(51, 7)

In [24]:
# Export the records that have a ticker row, newest first, to every output folder.
file_name = 'F45-CLEAN-' + mmdd_str + '.csv'
clean_file = clean_path + file_name
output_file = csv_path + file_name
box_file = box_path + file_name
one_file = one_path + file_name

# `published` is text such as "Fri, 13 May 2022 22:12:53 +0700"; sorting the raw
# strings orders rows by weekday name, so sort on the parsed timestamp instead.
clean_export = df_out2[['name','year','quarter','published','market']].sort_values(
    by='published',
    ascending=False,
    key=lambda col: pd.to_datetime(col, utc=True),
)
# Build the sorted frame once and write it to the same four destinations, in
# the same order, as the original copy-pasted to_csv calls.
for target in (output_file, clean_file, box_file, one_file):
    clean_export.to_csv(target, header=True, index=False, sep=',')

In [25]:
# Load the epss rows for the configured year. The year is bound as a query
# parameter (sqlite "?" placeholder) instead of being hard-coded as 2022, so
# this query stays consistent with the `year` stamped on the feed records that
# are merged with it in the next cell.
sql = '''
SELECT * 
FROM epss
WHERE year = ?'''
df_epss = pd.read_sql(sql, conlt, params=(year,))
df_epss.shape

(165, 14)

In [26]:
# Outer-join the exportable records with the epss rows on (name, year, quarter).
df_merge3 = df_out2.merge(
    df_epss,
    on=['name','year','quarter'],
    how='outer',
    indicator=True,
)
df_merge3.shape

(199, 19)

### Already entered: display profit amount and EPS to check against the new F45

In [27]:
# Records that already have an epss row for the same name, year and quarter.
already_entered = df_merge3['_merge'] == 'both'
df_merge3.loc[already_entered]

Unnamed: 0,name,year,quarter,published,link,id_x,market,id_y,q_amt,y_amt,aq_amt,ay_amt,q_eps,y_eps,aq_eps,ay_eps,ticker_id,publish_date,_merge
0,SENA,2022,1,"Fri, 13 May 2022 21:48:56 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982170660,437.0,sSET,21577.0,336344.0,229318.0,336344.0,229318.0,0.234,0.1607,0.234,0.1607,437.0,2022-05-13,both
10,KTB,2022,1,"Fri, 13 May 2022 18:57:57 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982154040,258.0,SET50 / SETHD / SETTHSI,21423.0,8780344.0,5578436.0,8780344.0,5578436.0,0.63,0.4,0.63,0.4,258.0,2022-04-21,both
13,TPIPL,2022,1,"Fri, 13 May 2022 18:27:06 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982147790,559.0,SET,21578.0,1800127.0,1164481.0,1800127.0,1164481.0,0.095,0.061,0.095,0.061,559.0,2022-05-13,both
15,SCB,2022,1,"Fri, 13 May 2022 18:21:54 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982146710,426.0,SET50 / SETCLMV / SETTHSI,21425.0,10192863.0,10087799.0,10192863.0,10087799.0,3.0,2.97,3.0,2.97,426.0,2022-04-21,both
21,DIF,2022,1,"Fri, 13 May 2022 17:55:58 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982138410,140.0,SET,21575.0,2894820.0,2957126.0,2894820.0,2957126.0,0.2723,0.2781,0.2723,0.2781,140.0,2022-05-13,both
22,BCH,2022,1,"Fri, 13 May 2022 17:50:03 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982136540,51.0,SET100 / SETWB,21574.0,2028333.0,323774.0,2028333.0,323774.0,0.81,0.13,0.81,0.13,51.0,2022-05-13,both
23,KTC,2022,1,"Fri, 13 May 2022 17:46:42 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982135770,259.0,SET50 / SETTHSI,21420.0,1747272.0,1633806.0,1747272.0,1633806.0,0.68,0.63,0.68,0.63,259.0,2022-04-20,both
31,HREIT,2022,1,"Fri, 13 May 2022 17:16:34 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982123460,211.0,SET,21576.0,136025.0,250168.0,136025.0,250168.0,0.1619,0.2978,0.1619,0.2978,211.0,2022-05-13,both
34,TISCO,2022,1,"Fri, 13 May 2022 17:01:53 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982114040,531.0,SET50 / SETHD / SETTHSI,21415.0,1795491.0,1763627.0,1795491.0,1763627.0,2.24,2.2,2.24,2.2,531.0,2022-04-18,both
36,JMT,2022,1,"Fri, 13 May 2022 13:22:55 +0700",https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982074050,237.0,SET100,21570.0,366955.0,282826.0,366955.0,282826.0,0.27,0.28,0.27,0.28,237.0,2022-05-13,both


In [28]:
# Size of the "already has an epss row" selection.
already_entered = df_merge3['_merge'] == 'both'
df_merge3.loc[already_entered].shape

(17, 19)

In [29]:
# Records with no epss row yet; work on an independent copy from here on.
not_entered = df_merge3['_merge'] == 'left_only'
df_inp2 = df_merge3.loc[not_entered]
df_inp3 = df_inp2.copy()
df_inp3.shape

(34, 19)

In [30]:
# Render each pending record as "<name> <year> <quarter> <link>", ordered by name.
for col in ('year', 'quarter'):
    # Text concatenation below needs string columns.
    df_inp3[col] = df_inp3[col].astype(str)
final = df_inp3.sort_values(by='name', ascending=True)
final_str = final['name'] + ' ' + final['year'] + ' ' + final['quarter'] + ' ' + final['link']
final_str

4     AIMIRT 2022 1 https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982165880
26    ALUCON 2022 1 https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982127910
14     AMATA 2022 1 https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982146840
32       ASP 2022 2 https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982116920
33        BA 2022 1 https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982115270
46    BEAUTY 2022 1 https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982040010
28       BEC 2022 1 https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982126480
48     BGRIM 2022 1 https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982037200
35       BKI 2022 1 https://classic.set.or.th/set/newsdetails.do?language=en&country=US&newsId=16523982114020
2        B