In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.preprocessing import LabelEncoder # for encoding
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler #for standardization

In [None]:
# Read the raw CSV at ./data/Smoking_raw/smoking.csv (relative to the working directory) into `smoking`.
smoking = pd.read_csv(filepath_or_buffer='./data/Smoking_raw/smoking.csv')

In [None]:
# Preview the first five rows of the raw frame.
smoking.head(n=5)

In [None]:
# Working frame without the "ID" column; `smoking` itself is left untouched.
df = smoking.drop(columns="ID")
# Show the first ten rows with an orange background gradient as a visual aid.
df.head(10).style.background_gradient(cmap="Oranges")

In [None]:
# Print a summary of the working frame: column dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Pairwise correlation over the numeric columns only.
# Columns that are still un-encoded at this point (gender and tartar are label-encoded
# further down) would make DataFrame.corr() raise in pandas >= 2.0, so they are
# excluded explicitly; older pandas dropped them silently, giving the same table.
df.select_dtypes(include='number').corr()

In [None]:
# Correlation between basic physical attributes and smoking
# .assign() returns a new frame, so the smoking column is not written into a slice of df
# (avoids SettingWithCopyWarning).
cat1 = df.loc[:, 'gender':'hearing(right)'].assign(smoking=df['smoking'])
# Numeric columns only: 'gender' is not label-encoded yet and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0.
cat1.select_dtypes(include='number').corr()

In [None]:
# Correlation between cardiovascular / respiratory measurements and smoking
# .assign() returns a new frame, so the smoking column is not written into a slice of df
# (avoids SettingWithCopyWarning).
cat2 = df.loc[:, 'systolic':'hemoglobin'].assign(smoking=df['smoking'])
# Restricting to numeric columns is a no-op when every column is numeric and keeps
# .corr() from raising on pandas >= 2.0 if a non-numeric column is present.
cat2.select_dtypes(include='number').corr()

In [None]:
# Correlation between kidney-related measurements and smoking
# .assign() returns a new frame, so the smoking column is not written into a slice of df
# (avoids SettingWithCopyWarning).
cat3 = df.loc[:, 'Urine protein':'serum creatinine'].assign(smoking=df['smoking'])
# Restricting to numeric columns is a no-op when every column is numeric and keeps
# .corr() from raising on pandas >= 2.0 if a non-numeric column is present.
cat3.select_dtypes(include='number').corr()

In [None]:
# Correlation between liver enzyme levels and smoking
# .assign() returns a new frame, so the smoking column is not written into a slice of df
# (avoids SettingWithCopyWarning).
cat4 = df.loc[:, 'AST':'Gtp'].assign(smoking=df['smoking'])
# Restricting to numeric columns is a no-op when every column is numeric and keeps
# .corr() from raising on pandas >= 2.0 if a non-numeric column is present.
cat4.select_dtypes(include='number').corr()

In [None]:
# Correlation between oral health and smoking
# Stored under its own name (cat5) so the liver-enzyme frame held in cat4 is not overwritten.
# .assign() returns a new frame, so the smoking column is not written into a slice of df
# (avoids SettingWithCopyWarning).
cat5 = df.loc[:, 'oral':'tartar'].assign(smoking=df['smoking'])
# Numeric columns only: 'oral' and 'tartar' are not encoded yet and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0.
cat5.select_dtypes(include='number').corr()

In [None]:
 def heatmap(cat1):
     """Draw an annotated correlation heatmap for the numeric columns of ``cat1``.

     Parameters
     ----------
     cat1 : pandas.DataFrame
         Frame whose pairwise column correlations are plotted.

     Returns
     -------
     None
         The heatmap is drawn on a newly created matplotlib figure as a side effect;
         the global matplotlib style is also switched to the seaborn style sheet.
     """
     # The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later removed;
     # prefer the versioned alias and fall back to the old name on older releases.
     # The style is applied before the figure is created so it affects this plot too.
     for style_name in ('seaborn-v0_8', 'seaborn'):
         if style_name in plt.style.available:
             plt.style.use(style_name)
             break
     plt.figure(figsize=[20, 10], clear=True, facecolor="white")
     # Non-numeric columns are dropped first: DataFrame.corr() raises on them in pandas >= 2.0.
     sns.heatmap(cat1.select_dtypes(include='number').corr(), annot=True, square=False,
                 linewidths=3, linecolor="white", cmap="Set2")

In [None]:
# Correlation heatmap for cat2 (the 'systolic'..'hemoglobin' slice plus smoking).
heatmap(cat2)

In [None]:
# Correlation heatmap for cat3 (the 'Urine protein'..'serum creatinine' slice plus smoking).
heatmap(cat3)

1. 범주형 변수 변환
2. 피쳐 스케일링
    - StandardScaler : 서포트 벡터 머신, 로지스틱 회귀, 선형회귀는 가우시안 분포를 가정하고 구현되어 있으므로 정규화가 필요
        - df['Urine protein'].value_counts() 보류 (sulfosalicylic acid precipitation method -> 소변의 탁도와 단백질 함유량 검사, 6단계로 구분함.)
    - MinMaxScaler : 데이터 분포가 가우시안 분포가 아닐 경우
3. 이상치 검출 및 삭제 - 논의 후 결정
    

In [None]:
# Drop the 'oral' column. This cell rebinds df, so a second run would find the column
# already gone; errors='ignore' keeps the cell re-runnable instead of raising KeyError.
df = df.drop(columns='oral', errors='ignore')

In [None]:
# Categorical variable conversion
# The 'oral' feature contains only 'Y' values, so it has been dropped.

df.info()

In [None]:
# Subset holding the 'gender' and 'tartar' columns.
cate_features = df.loc[:, ['gender', 'tartar']]

In [None]:
# Label-encode 'gender': each distinct value is replaced by an integer code.
# fit_transform is called exactly once and its result is written back to the column.
lbe = LabelEncoder()
df["gender"] = lbe.fit_transform(df["gender"])

In [None]:
# Display the frame after encoding 'gender'.
df

In [None]:
# Label-encode 'tartar': each distinct value is replaced by an integer code.
# fit_transform is called exactly once and its result is written back to the column.
lbe = LabelEncoder()
df["tartar"] = lbe.fit_transform(df["tartar"])


In [None]:
# Label encoding is complete; display the resulting frame.
df

In [None]:
# Normalization work: start by plotting the distribution of the 'gender' column.
plt.hist(x=df['gender'])

In [None]:
# One histogram per column of df, 12 bins each, on a 20x20 figure without legends.
df.hist(
    bins=12,
    figsize=(20, 20),
    legend=False,
)

In [None]:
# Frequency of each distinct value in 'hearing(right)'.
df.loc[:, 'hearing(right)'].value_counts()

In [None]:
# Columns kept out of scaling (they are dropped from df before the scaler is fitted).
cate_features = df.loc[
    :,
    ['gender', 'smoking', 'tartar', 'hearing(right)', 'hearing(left)', 'dental caries']
]

In [None]:
# Column labels of the categorical subset.
cate_features.columns

In [None]:
# Every column of df that is not in cate_features; these are the inputs to the scaler.
scaled_features = df.drop(columns=cate_features.columns)
scaled_features

In [None]:
# Sulfosalicylic acid precipitation method -> tests urine turbidity and protein content; graded into 6 levels.
df.loc[:, 'Urine protein'].value_counts()

In [None]:
# Distribution of each to-be-scaled column before scaling (12 bins, 20x20 figure).
scaled_features.hist(
    bins=12,
    figsize=(20, 20),
    legend=False,
)

In [None]:
# Fit a StandardScaler on the selected columns (fit() returns the estimator itself),
# then transform the same columns into a scaled array.
scaler = StandardScaler().fit(scaled_features)
scaled = scaler.transform(scaled_features)

In [None]:
# Column labels of the frame that was passed to the scaler.
scaled_features.columns

In [None]:
# Wrap the scaled array back into a DataFrame. Both the column labels and the row index
# of the source frame are carried over so the result stays aligned with df even if rows
# have been removed (a fresh 0..n-1 index would silently misalign in that case).
test = pd.DataFrame(scaled, columns=scaled_features.columns, index=scaled_features.index)
test

In [None]:
# Distribution of each column after scaling (12 bins, 20x20 figure).
test.hist(
    bins=12,
    figsize=(20, 20),
    legend=False,
)

In [None]:
# NOTE(review): this entire cell is disabled (commented-out) code; the notebook's plan
# lists outlier detection/removal as "to be decided after discussion".
# If it is re-enabled, `Counter` must be imported first (from collections import Counter);
# it is not among the imports visible at the top of the notebook.
# # Outlier detection and removal
# def outlier_detection(df, n, columns):
#     rows = []
#     will_drop_train = []
#     for col in columns:
#         Q1 = np.nanpercentile(df[col], 25)
#         Q3 = np.nanpercentile(df[col], 75)
#         IQR = Q3 - Q1
#         outlier_point = 1.5 * IQR
#         rows.extend(df[(df[col] < Q1 - outlier_point)|(df[col] > Q3 + outlier_point)].index)
#     for r, c in Counter(rows).items():
#         if c >= n: will_drop_train.append(r)
#     return will_drop_train

# will_drop_train = outlier_detection(df, 5, df.select_dtypes(["float", "int"]).columns)
# will_drop_train[0:5]

# df.drop(will_drop_train, inplace = True, axis = 0)