In [2]:
import pandas as pd
import boto3, pymysql
from collections import Counter
from konlpy.tag import Okt

### 환경변수 불러오기
from config import *

In [3]:
def mysql_conn(HOST, USER, PASSWORD, DB, SQL, CHARSET="utf8"):
    """Run one SQL statement against MySQL and return every fetched row.

    Args:
        HOST: Database host passed to ``pymysql.connect``.
        USER: Database user name.
        PASSWORD: Password for ``USER``.
        DB: Name of the database to select.
        SQL: Statement handed unchanged to ``cursor.execute``.
        CHARSET: Connection character set (defaults to ``"utf8"``).

    Returns:
        The value of ``cursor.fetchall()``, or ``None`` when a MySQL error is
        raised after the connection was opened (the error is printed).

    Raises:
        Anything raised by ``pymysql.connect`` itself, because the connection
        is opened before the ``try`` block.
    """
    conn = pymysql.connect(
        host=HOST, user=USER, password=PASSWORD, db=DB, charset=CHARSET
    )
    try:
        # The ``with`` block already closes the cursor, so no explicit
        # ``cur.close()`` is needed (it used to fail with an unbound ``cur``
        # whenever ``conn.cursor()`` itself raised).
        with conn.cursor() as cur:
            cur.execute(SQL)
            return cur.fetchall()
    except pymysql.MySQLError as exc:
        # Best-effort behavior is kept (print and return None), but only for
        # database errors, and the actual cause is now reported.
        print(f"Not connected: {exc}")
        return None
    finally:
        conn.close()


def s3_conn(
    AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION, BUCKET_NAME, Key
):
    """Download the text of each S3 object whose key starts with ``"news"``.

    Args:
        AWS_ACCESS_KEY_ID: Access key id given to ``boto3.client``.
        AWS_SECRET_ACCESS_KEY: Secret access key given to ``boto3.client``.
        AWS_DEFAULT_REGION: Region name given to ``boto3.client``.
        BUCKET_NAME: Bucket every object is read from.
        Key: Iterable of object keys; entries may be non-strings (e.g. ``None``).

    Returns:
        A list with one entry per element of ``Key``, in order: the decoded
        object body for keys starting with ``"news"``, otherwise the
        placeholder string ``"None"``.
    """
    s3_client = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name=AWS_DEFAULT_REGION,
    )
    contents = []

    for object_key in Key:
        # Non-string keys (None / NaN) get the same placeholder as keys
        # outside the "news" prefix instead of crashing on ``.startswith``.
        if isinstance(object_key, str) and object_key.startswith("news"):
            body = s3_client.get_object(Bucket=BUCKET_NAME, Key=object_key)["Body"]
            contents.append(body.read().decode())
        else:
            contents.append("None")
    return contents


def wc(data, num=100):
    """Count the nouns in ``data`` and return the most frequent ones.

    Args:
        data: Text handed to ``Okt().nouns`` for noun extraction.
        num: How many of the most common nouns to keep (default 100).

    Returns:
        A list of ``{"tag": noun, "count": occurrences}`` dicts in the order
        produced by ``Counter.most_common(num)``.
    """
    tagger = Okt()
    frequencies = Counter(tagger.nouns(data))
    return [
        {"tag": word, "count": occurrences}
        for word, occurrences in frequencies.most_common(num)
    ]

In [4]:
# Load the daily_update rows; each `news_summary` value is used as an S3 object key.
columns = ["dailydate", "ticker", "name", "news_summary"]
SQL = f"select {', '.join(columns)} from daily_update"
result = mysql_conn(HOST, USER, PASSWORD, DB, SQL=SQL)
df = pd.DataFrame(result, columns=columns)

# Fetch the text behind every key (rows without a "news" key get the "None" placeholder).
contents = s3_conn(
    AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY,
    AWS_DEFAULT_REGION,
    BUCKET_NAME,
    df["news_summary"],
)
df["contents"] = contents

# Join with a space instead of Series.sum(): sum() glues documents together with
# no separator (fusing the last word of one with the first word of the next) and
# returns the integer 0 for an empty frame, which wc() cannot tokenize.
json_data = wc(" ".join(df["contents"]))

In [24]:
# Inline version of the noun count done by wc(), keeping the top 50 (noun, count) pairs.
# Documents are joined with a space: Series.sum() would concatenate them with no
# separator (fusing words at document boundaries) and yields 0 on an empty column.
data = " ".join(df["contents"])

okt = Okt()
noun = okt.nouns(data)
counter = Counter(noun)
top_num_noun = counter.most_common(50)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
# One list entry per downloaded document, skipping the "None" placeholders.
# Series.tolist() replaces the former append-"tf-idf"/sum/split round trip, which
# split any document that itself contained "tf-idf" and failed on an empty selection.
text = df.loc[df["contents"] != "None", "contents"].tolist()

In [67]:
tfidf_vectorizer = TfidfVectorizer() # declare the TF-IDF vectorizer object (default settings)

In [72]:
# Fit the vectorizer on the document list, then show the learned vocabulary as
# (term, column index) pairs ordered by term. fit() returns the vectorizer itself,
# so the call can be chained.
sorted(tfidf_vectorizer.fit(text).vocabulary_.items())

[('00', 0),
 ('000개의', 1),
 ('035', 2),
 ('073주를', 3),
 ('10', 4),
 ('100', 5),
 ('1048억9천만', 6),
 ('10b5', 7),
 ('10년', 8),
 ('10년간', 9),
 ('10만', 10),
 ('10억', 11),
 ('11명의', 12),
 ('12년', 13),
 ('13', 14),
 ('13일', 15),
 ('14년', 16),
 ('15', 17),
 ('15년', 18),
 ('15명을', 19),
 ('18명', 20),
 ('18세', 21),
 ('194', 22),
 ('1년간', 23),
 ('1분기', 24),
 ('1에', 25),
 ('1자리', 26),
 ('20', 27),
 ('2006년', 28),
 ('2018년', 29),
 ('2018년에', 30),
 ('2024에서', 31),
 ('2025년', 32),
 ('20억', 33),
 ('220달러로', 34),
 ('22일', 35),
 ('26명의', 36),
 ('27만', 37),
 ('29', 38),
 ('2년', 39),
 ('2만', 40),
 ('2일', 41),
 ('2조', 42),
 ('2주로', 43),
 ('307', 44),
 ('30만', 45),
 ('31', 46),
 ('310만', 47),
 ('329만', 48),
 ('356', 49),
 ('36', 50),
 ('376주', 51),
 ('3거래일', 52),
 ('3월의', 53),
 ('40', 54),
 ('400에서', 55),
 ('412주', 56),
 ('41배의', 57),
 ('431주로', 58),
 ('433억', 59),
 ('44', 60),
 ('448달러에', 61),
 ('44억', 62),
 ('472', 63),
 ('495달러로', 64),
 ('4만', 65),
 ('4배', 66),
 ('4월에', 67),
 ('4천만', 68),
 ('50', 69),
 (

In [111]:
# Display the fitted vocabulary again as (term, column index) pairs sorted by term.
sorted(tfidf_vectorizer.vocabulary_.items())

[('00', 0),
 ('000개의', 1),
 ('035', 2),
 ('073주를', 3),
 ('10', 4),
 ('100', 5),
 ('1048억9천만', 6),
 ('10b5', 7),
 ('10년', 8),
 ('10년간', 9),
 ('10만', 10),
 ('10억', 11),
 ('11명의', 12),
 ('12년', 13),
 ('13', 14),
 ('13일', 15),
 ('14년', 16),
 ('15', 17),
 ('15년', 18),
 ('15명을', 19),
 ('18명', 20),
 ('18세', 21),
 ('194', 22),
 ('1년간', 23),
 ('1분기', 24),
 ('1에', 25),
 ('1자리', 26),
 ('20', 27),
 ('2006년', 28),
 ('2018년', 29),
 ('2018년에', 30),
 ('2024에서', 31),
 ('2025년', 32),
 ('20억', 33),
 ('220달러로', 34),
 ('22일', 35),
 ('26명의', 36),
 ('27만', 37),
 ('29', 38),
 ('2년', 39),
 ('2만', 40),
 ('2일', 41),
 ('2조', 42),
 ('2주로', 43),
 ('307', 44),
 ('30만', 45),
 ('31', 46),
 ('310만', 47),
 ('329만', 48),
 ('356', 49),
 ('36', 50),
 ('376주', 51),
 ('3거래일', 52),
 ('3월의', 53),
 ('40', 54),
 ('400에서', 55),
 ('412주', 56),
 ('41배의', 57),
 ('431주로', 58),
 ('433억', 59),
 ('44', 60),
 ('448달러에', 61),
 ('44억', 62),
 ('472', 63),
 ('495달러로', 64),
 ('4만', 65),
 ('4배', 66),
 ('4월에', 67),
 ('4천만', 68),
 ('50', 69),
 (

In [108]:
# Assign the feature names in this cell before displaying them: the notebook
# otherwise only assigns `dtm_name` in a later cell, so a fresh
# Restart & Run All would hit a NameError here.
dtm_name = tfidf_vectorizer.get_feature_names_out()
dtm_name

array(['00', '000개의', '035', ..., '훈련하는', '희망하고', '힘입어'], dtype=object)

In [100]:
# Array of vocabulary terms from the fitted vectorizer.
# NOTE(review): presumably position i corresponds to matrix column i — confirm.
dtm_name = tfidf_vectorizer.get_feature_names_out()

<39x1526 sparse matrix of type '<class 'numpy.float64'>'
	with 2125 stored elements in Compressed Sparse Row format>

In [101]:
# Number of distinct terms in the fitted vocabulary.
len(dtm_name)

1526

In [77]:
from scipy.sparse import csr_matrix

In [83]:
# TF-IDF document-term matrix for the documents in `text`.
# NOTE(review): despite the name, TfidfVectorizer.transform is documented to return
# a SciPy sparse matrix rather than a dense array — confirm before using dense-only APIs.
dtm_dense = tfidf_vectorizer.transform(text)

In [81]:
import networkx as nx
import numpy as np

In [85]:
# Term-by-term matrix (terms x terms): the product of the transposed document-term
# matrix with itself. The `@` operator is used because np.dot is not aware of SciPy
# sparse matrices and may give unexpected results or errors on them.
# NOTE(review): despite its name this is a square weight matrix, not a list of edges.
edgelist = dtm_dense.T @ dtm_dense

In [96]:
# Build an undirected graph from the term-by-term matrix; `[:,:]` selects the whole matrix.
# NOTE(review): presumably non-zero diagonal entries become self-loops — confirm whether those are wanted.
ngraph = nx.Graph(edgelist[:,:])