# 논문 네트워크 시각화 프로젝트 전처리 디벨롭

### 문제점
#### 1. mongodb 와 spark 연동 실패 -> spark dataframe이 아닌 pandas dataframe으로 불러와서 변환하는 추가작업이 필요했음

### 1. Mongodb -> Spark 연동 실패

##### 문제점 1. 사용자 계정 및 비밀번호 노출
##### 문제점 2. bson -> pandas DataFrame -> Spark DataFrame 순서로 변환하는 방식으로 진행하여 효율성 감소
##### 문제점 3. 문제점 2의 형식으로 진행시 -> 리스트 형태의 값들이 전부 스트링 형태로 저장되는 문제 발생

In [136]:
import pymongo

# NOTE: the credentials are hard-coded here on purpose -- this cell is the
# "before" example of the credential-exposure problem listed above. Real code
# should load them from an untracked config module or environment variables.
# `MongoClient` is reached through the imported `pymongo` module; the bare
# name was never imported and raised NameError.
client = pymongo.MongoClient("mongodb://root:1234@mongodb:27017/admin")

# List the database names available on the server.
db_names = client.list_database_names()
print("Available databases:", db_names)

# Select the `kci_trained_api` database.
db = client['kci_trained_api']

# List the collections stored in that database.
collection_names = db.list_collection_names()
print("Available collection_names:", collection_names)

Available databases: ['Egg_', 'admin', 'category', 'config', 'kci_AuGraph', 'kci_api', 'kci_author_info', 'kci_ccGraph', 'kci_trained_api', 'local', 'mydb', 'reference_map']
Available collection_names: ['kci_trained_202309', 'kci_trained_202309_dev', 'kci_trained_202310']


In [138]:
def get_kci_data(df_name):
    """Load one collection of ``kci_trained_api`` as a Spark DataFrame.

    The documents are fetched with pymongo, collected into a pandas
    DataFrame, cast to strings and then passed to ``spark.createDataFrame``.
    Relies on the module-level ``client`` (MongoDB client) and ``spark``
    (Spark session) objects.

    Args:
        df_name: Name of the collection inside ``kci_trained_api`` to read.

    Returns:
        A Spark DataFrame with the columns ``articleID``, ``titleEng``,
        ``abstractEng``, ``journalID``, ``pubYear`` and ``refereceTitle``,
        every one of them typed as string.

    Raises:
        KeyError: If one of the selected columns is absent from every
            fetched document (raised by the pandas column selection).
    """
    # Imported locally because pandas is not imported anywhere else in this
    # file; without it the function fails with NameError on `pd`.
    import pandas as pd

    print('Get',df_name,"!")
    columns = ['articleID','titleEng','abstractEng','journalID','pubYear','refereceTitle']

    # Ask MongoDB for the needed fields only, instead of transferring whole
    # documents and discarding the other fields afterwards.
    projection = dict.fromkeys(columns, 1)
    projection['_id'] = 0
    records = list(client["kci_trained_api"][df_name].find({}, projection))

    pandas_df = pd.DataFrame(records)[columns]

    # NOTE: astype(str) gives Spark uniformly typed columns, but it also turns
    # list values into their string representation (the notes above report
    # this for refereceTitle). Callers that need real lists must parse the
    # column back afterwards.
    pandas_df = pandas_df.astype(str)

    return spark.createDataFrame(pandas_df)

# Collection to load from the kci_trained_api database.
df_name = "kci_trained_202309"
# Fetch it through the pymongo -> pandas -> Spark path.
temp = get_kci_data(df_name)
# Print the schema of the resulting Spark DataFrame.
temp.printSchema()

Get kci_trained_202309 !


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


root
 |-- articleID: string (nullable = true)
 |-- titleEng: string (nullable = true)
 |-- abstractEng: string (nullable = true)
 |-- journalID: string (nullable = true)
 |-- pubYear: string (nullable = true)
 |-- refereceTitle: string (nullable = true)



### 문제에 대한 해결

##### 민감정보 노출 -> config.py를 따로 구성하여 코드 상에서 나타나지 않게 함.
##### jars.packages 를 사용하여 연동.

In [139]:
import ast

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType, StructType, StructField

import config

In [140]:
# Build (or reuse) the "kci" Spark session. spark.jars.packages is set to the
# MongoDB Spark connector coordinates so the "mongo" data source used by the
# later cells is available.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .appName("kci")
    .getOrCreate()
)

In [144]:
def parse_list(value):
    """Convert the string form of a Python list back into a real list.

    Intended to be wrapped in a Spark UDF, so it never raises: a single
    malformed value would otherwise fail the whole job.

    Args:
        value: A string such as ``"['a', 'b']"``. ``None``, empty values and
            values that are already a list or tuple are also accepted.

    Returns:
        The parsed list. An empty list is returned when ``value`` is empty,
        cannot be parsed as a Python literal (for example ``'nan'``), or
        parses to something that is not a list or tuple.
    """
    if not value:
        return []
    # Already a sequence (nothing to parse): hand back a plain list.
    if isinstance(value, (list, tuple)):
        return list(value)
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid Python literal; treat it as "no entries".
        return []
    # The UDF is declared as returning an array, so scalars are rejected.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []
    
# Connection URI for the source collection; the account and password are read
# from the local config module so they do not appear in the code.
kci_api_uri = f"mongodb://{config.mongo_user}:{config.mongo_pass}@mongodb:27017/kci_trained_api.kci_trained_202309?authSource=admin"

# Explicit read schema: every field is read as a nullable string.
schema = StructType([
    StructField("articleID", StringType(), True),
    StructField("titleEng", StringType(), True),
    StructField("abstractEng", StringType(), True),
    StructField("journalID", StringType(), True),
    StructField("pubYear", StringType(), True),
    StructField("refereceTitle", StringType(), True),
    StructField("class", StringType(), True),
])

# Fixed: this previously passed the undefined name `kci_api_uri2`, which
# raises NameError; the URI defined above is `kci_api_uri`.
df = (
    spark.read.format("mongo")
    .option("uri", kci_api_uri)
    .schema(schema)
    .load()
)

# String -> list conversion: turn the stringified list stored in
# refereceTitle back into an array<string> column.
parse_list_udf = udf(parse_list, ArrayType(StringType()))
df = df.withColumn("refereceTitle", parse_list_udf(df["refereceTitle"]))

df.printSchema()
df.show()

root
 |-- articleID: string (nullable = true)
 |-- titleEng: string (nullable = true)
 |-- abstractEng: string (nullable = true)
 |-- journalID: string (nullable = true)
 |-- pubYear: string (nullable = true)
 |-- refereceTitle: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- class: string (nullable = true)

+------------+--------------------+--------------------+---------+-------+--------------------+--------------------+
|   articleID|            titleEng|         abstractEng|journalID|pubYear|       refereceTitle|               class|
+------------+--------------------+--------------------+---------+-------+--------------------+--------------------+
|ART001652135|New Online Paymen...|Due to technologi...|   A00398|   2012|[Canada, Australi...|            Security|
|ART001652137|Quantitative Anal...|The performance o...|   A00398|   2012|[3D Game Engine D...|            Hardware|
|ART001652144|An Efficient Cont...|With the developm...|   A00398|   2012|[E

##### 실제 리스트 값의 유지 확인을 위한 저장

In [122]:
# Target of the write: the kci_trained_202309_dev collection, authenticated
# with the credentials from the config module.
save_uri = f"mongodb://{config.mongo_user}:{config.mongo_pass}@mongodb:27017/kci_trained_api.kci_trained_202309_dev?authSource=admin"

# Append the rows of `df` to the target collection through the "mongo" source.
(
    df.write.format("mongo")
    .option("uri", save_uri)
    .mode("append")
    .save()
)

In [159]:
# URI of the kci_trained_202309_dev collection to read back.
kci_api_uri_dev = f"mongodb://{config.mongo_user}:{config.mongo_pass}@mongodb:27017/kci_trained_api.kci_trained_202309_dev?authSource=admin"

# No explicit schema is passed to this read, so the schema printed below is
# whatever the connector derives from the stored documents. For reference, an
# explicit schema would declare refereceTitle as ArrayType(StringType()) and
# the remaining six fields (articleID, titleEng, abstractEng, journalID,
# pubYear, class) as StringType().
df_dev = (
    spark.read.format("mongo")
    .option("uri", kci_api_uri_dev)
    .load()
)

df_dev.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- abstractEng: string (nullable = true)
 |-- articleID: string (nullable = true)
 |-- class: string (nullable = true)
 |-- journalID: string (nullable = true)
 |-- pubYear: string (nullable = true)
 |-- refereceTitle: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- titleEng: string (nullable = true)

