# Load Libraries

In [1]:
import xml.etree.ElementTree as ET
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from scipy.stats import pearsonr
from sklearn import preprocessing

import zipfile
from glob import glob

import seaborn as sns
from sympy import subsets
sns.set()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

#import japanize_matplotlib
import unicodedata


## Set Parameters

In [2]:
# Folder the raw CSV is read from, and folder the merged CSV is read from.
data_dir = "L:\\Miyuki"
output_dir = "L:\\epson_pxm840f\\Shikakku"

In [3]:
# Read the raw export from data_dir; the file is decoded as cp932.
filename = os.path.join(data_dir, "NIKKEI.csv")
df = pd.read_csv(filename, encoding="cp932")

In [4]:
# Preview the first five rows of the raw export.
# NOTE(review): the rendered output includes patient names and birth dates —
# clear the cell output before sharing or committing this notebook.
display(df.head(5))

Unnamed: 0,調剤日,時間,患者No,会員NO,患者名,生年月日,保険種類,保険者番号,被保険者記号,被保険者番号,...,薬歴管理指導料(特例 3月以内 手帳有)（再掲）（点）,薬歴管理指導料(特例 3月以内処方持参以外)（再掲）（点）,薬歴管理指導料(オンライン)（再掲）（点）,調剤感染症対策実施加算（点）,居宅療養（介護）（オンライン）（点）,予防居宅療養（介護）（オンライン）（点）,居宅療養感染症特例加算（点）,在宅患者緊急訪問薬剤管理指導料（電話等又は家族等・臨コ）（点）,在宅患者緊急訪問薬剤管理指導料（対面による場合・臨コ）（点）,服薬情報等提供料１（臨コ）（点）
0,2022/02/01,1008,90007919,,前田　憲正,1942/07/25,公単,,,,...,0,0,0,0,0,0,0,0,0,0
1,2022/02/01,1058,90008892,,田村　正人,1949/11/14,公単,,,,...,0,0,0,0,0,0,0,0,0,0
2,2022/02/01,1058,90008345,,中務　勝史,1945/03/08,公単,,,,...,0,0,0,0,0,0,0,0,0,0
3,2022/02/01,1154,90009391,,中島　近夫,1964/06/14,社保本人,1130012.0,２６０１０８３８,２１７,...,0,0,0,0,0,0,0,0,0,0
4,2022/02/01,1629,90009392,,大嶋　孝聡,1984/01/12,社保本人,6139166.0,４５７,１１１５,...,0,0,0,0,0,0,0,0,0,0


## Select Columns

In [5]:
# Columns kept from the raw export. The order matters: a later cell renames
# the columns by position.
targetCols = [
    # dispensing date and patient identity
    '調剤日', '患者No', '患者名', '生年月日',
    # insurance type, insurer number, insured symbol / number
    '保険種類', '保険者番号', '被保険者記号', '被保険者番号',
    # public-expense payer / recipient numbers (1 and 2)
    '公費負担者番号１', '公費受給番号１', '公費負担者番号２', '公費受給番号２',
    # benefit rate, institution, counts and amounts
    '給付割合', '医療機関', '受付回数', '処方箋枚数', '保険合計(点)', '患者負担金(円)',
]

In [6]:

# Narrow the raw export to the columns named in targetCols. The .copy() gives
# an independent frame, so the column assignments in later cells modify this
# frame rather than a slice of the raw one.
df = df[targetCols].copy()
# Last expression of the cell, so the notebook renders (rows, columns).
df.shape

In [7]:
# Short ASCII names, listed in the same order as targetCols.
newcolname = [
    'chozai', 'id', 'name', 'birth',
    'hoken', 'hokenNo', 'kigo', 'bango',
    'kouhi1', 'kouhi_jyu1', 'kouhi2', 'kouhi_jyu2',
    'kyufu', 'institution', 'accept_counts', 'prescript_counts',
    'total', 'paid',
]

# Positional rename: the i-th entry of newcolname becomes the name of the i-th
# column, so newcolname must have one entry per column, in column order.
df.columns = newcolname

In [8]:
# Spot-check the renamed columns on the first three rows.
display(df.head(3))

Unnamed: 0,chozai,id,name,birth,hoken,hokenNo,kigo,bango,kouhi1,kouhi_jyu1,kouhi2,kouhi_jyu2,kyufu,institution,accept_counts,prescript_counts,total,paid
0,2022/02/01,90007919,前田　憲正,1942/07/25,公単,,,,12133526.0,6006316.0,,,0,田幡医院,1,1,1582,0
1,2022/02/01,90008892,田村　正人,1949/11/14,公単,,,,12133633.0,7494214.0,,,0,東武練馬ｸﾘﾆｯｸ,1,1,291,0
2,2022/02/01,90008345,中務　勝史,1945/03/08,公単,,,,12133526.0,6006910.0,,,0,東武練馬ｸﾘﾆｯｸ,1,1,189,0


In [18]:
# Boolean mask: True where the insurance type (hoken) begins with "社保".
# .str.startswith is vectorised, and na=False maps a missing hoken to False;
# slicing each value with x[:2] raises TypeError when the value is a NaN float.
corp_ins = df["hoken"].str.startswith("社保", na=False)

In [19]:
# Inspect the mask: True where the first two characters of hoken are "社保".
corp_ins

0      False
1      False
2      False
3       True
4       True
       ...  
511    False
512    False
513    False
514    False
515    False
Name: hoken, Length: 516, dtype: bool

## Convert ID columns to strings and replace 0 with a blank

In [8]:
def _id_to_text(column):
    """Render a numeric ID column as text, using a single space for missing IDs.

    Missing values are filled with 0, every value is cast to int (dropping the
    fractional part, e.g. 12133526.0 -> 12133526) and then to str, and finally
    the placeholder "0" is replaced by " ".
    """
    return column.fillna(0).apply(int).apply(str).replace(['0'], ' ')


# Each column is converted from its own values. kouhi_jyu1 must be derived
# from kouhi_jyu1: deriving it from kouhi_jyu2 overwrites the first recipient
# number with the second.
# NOTE: this cell is not re-runnable on an already converted frame, because
# int(' ') raises ValueError; reload df before running it again.
for _col in ('kouhi1', 'kouhi_jyu1', 'kouhi2', 'kouhi_jyu2', 'hokenNo'):
    df[_col] = _id_to_text(df[_col])


In [9]:

def _to_nfkc(column):
    """Return the column's values as NFKC-normalised strings; missing values become ""."""
    as_text = column.fillna("").apply(str)
    return [unicodedata.normalize("NFKC", value) for value in as_text]


# The raw export holds full-width characters in these columns (e.g. "２１７");
# NFKC folds them to their compatibility equivalents.
df['kigo'] = _to_nfkc(df['kigo'])
df['bango'] = _to_nfkc(df['bango'])


In [15]:
#NIKKEI = os.path.join(output_dir,"hoken_NIKKEI.csv")
#df.to_csv(NIKKEI, encoding='cp932', errors='replace', index=False)


0       True
1       True
2       True
3      False
4      False
       ...  
511    False
512    False
513    False
514    False
515    False
Name: kouhi1, Length: 516, dtype: bool

# Load Merged Data

In [12]:
# Replace df with the merged table read from output_dir. merged.csv is not
# written by any of the visible cells, so it must already exist on disk.
filename = os.path.join(output_dir, "merged.csv")
df = pd.read_csv(filename, encoding="cp932")

In [16]:
# List the dtype pandas inferred for each column of merged.csv.
df.dtypes

chozai                    object
id                         int64
name_x                    object
birth                     object
hoken_x                   object
hokenNo                   object
kigo_x                    object
bango_x                  float64
kouhi1                    object
kouhi_jyu1                object
kouhi2                    object
kouhi_jyu2                object
kyufu                      int64
institution               object
conf_date                float64
segment                   object
facial                   float64
validation                object
hoken_y                  float64
hoken_name                object
kigo_y                   float64
bango_y                  float64
eda                      float64
hoken_segment             object
yours                     object
name_y                    object
kana                     float64
sex1                      object
address                   object
postal                    object
hoken_issu