In [2]:
import pandas as pd
import numpy as np
import librosa
import os
import pywt
import sklearn 
import scipy.stats as sp
import random
random.seed(777)
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from tensorflow import keras
from sklearn.model_selection import KFold
import tensorflow as tf
import joblib

### 0. 데이터 불러오기

데이터 파일 경로 상황 

기계

-- data --

In [3]:
from google.colab import drive

# Mount Google Drive into the Colab runtime.
drive.mount('/content/drive')

# Set the data path: project base directory, resolved relative to the current working directory.
StdPath = os.path.join(os.getcwd(), 'drive', 'MyDrive', 'Colab Notebooks', '데이콘', '기계')

Mounted at /content/drive


In [4]:
# Load the train and test metadata tables (comma-separated) from below StdPath.
Train_info, Test_info = (
  pd.read_csv(os.path.join(StdPath, rel_path), sep=',')
  for rel_path in ('data/train.csv', 'data/test.csv')
)

### 1. 데이터 특징 추출

1-1. 음성 길이 측정

In [None]:
# Measure the length in seconds of every training clip, loaded at 16 kHz.
audio_root = os.path.join(StdPath, 'data')
Sec = []
for sample_path in Train_info['SAMPLE_PATH']:
  # The first character of SAMPLE_PATH is dropped before appending it to the data directory.
  data, SR = librosa.load(audio_root + sample_path[1:], sr=16000)
  # duration = number of samples / sampling rate
  Sec.append(data.shape[0] / float(SR))
Sec_info = pd.DataFrame(Sec)
Sec_info.describe()

1-2. 데이터 특징 함수 

Time Domain 통계량 = 15개

(Max,Min,Mean,Rms,Std,Skew,Kurt,CF,IF,SF,Median,Mode,Q1,Q3,Iqr)

Freq Domain = 10 wavelet levels * 15  =  150개

따라서 Feature 개수는 15+150 = 165개이다.

즉, Train Feature DataFrame의 shape은 (1279, 165)가 된다.

In [None]:
def rms(x):
  """Return the root-mean-square of ``x``, i.e. ``sqrt(mean(x**2))``."""
  squared = x ** 2
  mean_square = np.mean(squared)
  return np.sqrt(mean_square)

In [None]:
def Feature_Make(data):
  """Compute 15 summary statistics of a 1-D signal as a single-row DataFrame.

  Columns, in order: Max, Min, Mean, Rms, Std, Skew, Kurt, Cf, If, Sf,
  Median, Mode, q1, q3, Iqr.

  Parameters
  ----------
  data : 1-D numeric array
      Signal (or wavelet coefficient array) to summarise.

  Returns
  -------
  pandas.DataFrame
      One row (index 0) whose columns are the statistic names listed above.
  """
  Max = np.max(data)
  Min = np.min(data)
  Mean = np.mean(data)
  Rms = rms(data)
  Std = np.std(data)
  Skew = sp.skew(data)
  Kurt = sp.kurtosis(data)

  # Ratio features fall back to 0 when their denominator is exactly zero.
  Cf = 0 if Rms == 0 else Max / Rms
  # NOTE(review): If is computed as Rms/Mean and Sf as Max/Mean; if these are
  # meant to be the usual impulse/shape factors, confirm the definitions.
  # They are left unchanged here so that extracted feature values stay the same.
  if Mean == 0:
    If = 0
    Sf = 0
  else:
    If = Rms / Mean
    Sf = Max / Mean

  Median = np.median(data)
  # scipy.stats.mode returns a length-1 array for 1-D input in older SciPy but
  # a scalar in newer releases (keepdims defaults to False), where the former
  # `[0][0]` indexing fails. atleast_1d makes both shapes indexable.
  Mode = np.atleast_1d(sp.mode(data)[0])[0]
  q1 = np.quantile(data, 0.25)
  q3 = np.quantile(data, 0.75)
  Iqr = q3 - q1

  names = ['Max','Min','Mean','Rms','Std','Skew','Kurt','Cf','If','Sf','Median','Mode','q1','q3','Iqr']
  values = [Max, Min, Mean, Rms, Std, Skew, Kurt, Cf, If, Sf, Median, Mode, q1, q3, Iqr]
  # Build a 15x1 column labelled by statistic name, then flip it into one row.
  return pd.DataFrame(values, index=names).transpose()

In [None]:
def Frequency_Domain(data, wavelet='haar', level=10):
  """Summarise each wavelet sub-band of ``data`` with ``Feature_Make``.

  The signal is decomposed with ``pywt.wavedec``; the first returned array is
  skipped and every remaining coefficient array is reduced to its
  ``Feature_Make`` statistics.

  Parameters
  ----------
  data : 1-D numeric array
      Signal to decompose.
  wavelet : str, optional
      Mother wavelet name passed to ``pywt.wavedec`` (default ``'haar'``).
  level : int, optional
      Decomposition level passed to ``pywt.wavedec`` (default ``10``).

  Returns
  -------
  pandas.DataFrame
      One row with the per-band statistics placed side by side; statistic
      column names repeat once per band.
  """
  # [1:] drops the first array returned by wavedec and keeps the rest.
  Coefficient = pywt.wavedec(data, wavelet, level=level)[1:]
  band_features = [Feature_Make(Coef) for Coef in Coefficient]
  if not band_features:
    # Nothing left after dropping the first array: keep the empty-frame result.
    return pd.DataFrame()
  # One concat over all bands instead of repeatedly extending an empty frame.
  return pd.concat(band_features, axis=1)

In [None]:
def TimeAndFreq(Data):
  """Return one feature row: ``Feature_Make(Data)`` followed by ``Frequency_Domain(Data)``."""
  return pd.concat([Feature_Make(Data), Frequency_Domain(Data)], axis=1)

1-3. Train Feature 추출

In [None]:
# Extract one feature row per training clip, in Train_info order.
audio_root = os.path.join(StdPath, 'data')
train_rows = []
for sample_path in Train_info['SAMPLE_PATH']:
  # The first character of SAMPLE_PATH is dropped before appending it to the data directory.
  data, SR = librosa.load(audio_root + sample_path[1:], sr=16000)
  train_rows.append(TimeAndFreq(data))

# Stack all rows with a single concat: growing a DataFrame inside the loop
# copies every previously collected row on each iteration.
Feature = pd.concat(train_rows, axis=0, ignore_index=True) if train_rows else pd.DataFrame()
# Statistic names repeat across bands, so replace them with positional column ids.
Feature.columns = np.arange(Feature.shape[1])
Train_Feature = Feature

In [None]:
# Save the training features as CSV and display the table shape.
train_feature_csv = os.path.join(StdPath, 'Feature/train.csv')
Train_Feature.to_csv(train_feature_csv, sep=',', index=None, header=None)
Train_Feature.shape

1-4. Test Feature 추출

In [None]:
# Extract one feature row per test clip, in Test_info order.
audio_root = os.path.join(StdPath, 'data')
test_rows = []
for sample_path in Test_info['SAMPLE_PATH']:
  # The first character of SAMPLE_PATH is dropped before appending it to the data directory.
  data, SR = librosa.load(audio_root + sample_path[1:], sr=16000)
  test_rows.append(TimeAndFreq(data))

# Stack all rows with a single concat: growing a DataFrame inside the loop
# copies every previously collected row on each iteration.
Feature = pd.concat(test_rows, axis=0, ignore_index=True) if test_rows else pd.DataFrame()
# Statistic names repeat across bands, so replace them with positional column ids.
Feature.columns = np.arange(Feature.shape[1])
Test_Feature = Feature

In [None]:
# Save the test features as CSV and display the table shape.
test_feature_csv = os.path.join(StdPath, 'Feature/test.csv')
Test_Feature.to_csv(test_feature_csv, sep=',', index=None, header=None)
Test_Feature.shape

(1514, 165)

### 2. 데이터 전처리

In [5]:
### Load the saved feature data - a step taken because of insufficient capacity
Train_Feature, Test_Feature = (
  pd.read_csv(os.path.join(StdPath, rel_path), sep=',', header=None)
  for rel_path in ('Feature/train.csv', 'Feature/test.csv')
)
print(Train_Feature.shape, Test_Feature.shape)

(1279, 165) (1514, 165)


2-1. 데이터를 FAN 모델 종류에 따라서 분류하기

In [6]:
# Split the feature tables by fan type using boolean masks taken from the
# metadata tables. Row i of a feature table is matched to row i of its metadata
# table; a length mismatch raises instead of silently selecting wrong rows, and
# the selection no longer depends on the metadata index being 0..n-1.
train_fan_type = Train_info['FAN_TYPE'].to_numpy()
test_fan_type = Test_info['FAN_TYPE'].to_numpy()
# Train data
Train_Feature_0 = Train_Feature.loc[train_fan_type == 0, :]
Train_Feature_2 = Train_Feature.loc[train_fan_type == 2, :]
# Test data
Test_Feature_0 = Test_Feature.loc[test_fan_type == 0, :]
Test_Feature_2 = Test_Feature.loc[test_fan_type == 2, :]

2-2. Train 데이터 scaling

In [7]:
# Fit one MinMaxScaler per fan type on the training features, then replace the
# training tables with their scaled versions (fresh 0..n-1 row/column labels).
scaler_0 = MinMaxScaler().fit(Train_Feature_0)
scaler_2 = MinMaxScaler().fit(Train_Feature_2)
Train_Feature_0 = pd.DataFrame(scaler_0.transform(Train_Feature_0))
Train_Feature_2 = pd.DataFrame(scaler_2.transform(Train_Feature_2))

2-3.Test 데이터 scaling

In [8]:
# Scale the test features with the scalers fitted on the training data (no refit).
test_scaled_0 = scaler_0.transform(Test_Feature_0)
test_scaled_2 = scaler_2.transform(Test_Feature_2)
Test_Feature_0 = pd.DataFrame(test_scaled_0)
Test_Feature_2 = pd.DataFrame(test_scaled_2)

### 3. 학습 진행

3-1. IsolationForest

In [9]:
# IsolationForest hyper-parameters, shared by both fan-type models.
N = 200
Max = 'auto'
# NOTE(review): presumably an assumed anomaly ratio (70 of 639) - confirm where these numbers come from.
Con = 70/639
Feature = 1.0
if_params = dict(
  n_estimators=N,
  max_samples=Max,
  contamination=Con,
  random_state=777,
  verbose=0,
  max_features=Feature,
)
# One model per fan type, each fitted on that fan type's scaled training features.
model_0_IF = IsolationForest(**if_params).fit(Train_Feature_0)
model_2_IF = IsolationForest(**if_params).fit(Train_Feature_2)
model_2_IF

IsolationForest(contamination=0.10954616588419405, n_estimators=200,
                random_state=777)

3-2. OneClassSVM

In [10]:
# OneClassSVM hyper-parameters, shared by both fan-type models.
Nu = 0.005
Gam = 0.21
T=0.001
svm_params = dict(nu=Nu, kernel="rbf", gamma=Gam, tol=T)
# One model per fan type, each fitted on that fan type's scaled training features.
model_0_SVM = OneClassSVM(**svm_params).fit(Train_Feature_0)
model_2_SVM = OneClassSVM(**svm_params).fit(Train_Feature_2)
model_2_SVM

OneClassSVM(gamma=0.21, nu=0.005)

3-3. LocalOutlierFactor

In [29]:
# LocalOutlierFactor hyper-parameters, shared by both fan-type models.
N = 150
Con = 0.005
# novelty=True is set so the fitted models can later call predict() on the test data.
lof_params = dict(contamination=Con, novelty=True, n_neighbors=N)
# One model per fan type, each fitted on that fan type's scaled training features.
model_0_LOF = LocalOutlierFactor(**lof_params).fit(Train_Feature_0)
model_2_LOF = LocalOutlierFactor(**lof_params).fit(Train_Feature_2)
model_2_LOF

LocalOutlierFactor(contamination=0.005, n_neighbors=150, novelty=True)

### 4. Predict

4-1. Labeling 함수 

In [30]:
# The models output (1: normal, -1: defective); convert to labels (0: normal, 1: defective).
def Labeling(data):
  """Convert model predictions to 0/1 labels in a one-column DataFrame.

  ``1`` becomes ``0`` (normal) and ``-1`` becomes ``1`` (defective); the
  single resulting column is named ``'score'``.
  """
  relabeled = pd.DataFrame(data).replace(1, 0).replace(-1, 1)
  relabeled.columns = ['score']
  return relabeled

4-2. 앙상블 함수 

In [31]:
def Ensemble(pred_1,pred_2,pred_3):
  """Combine three label tables by averaging and thresholding at 0.5.

  Each argument is converted with ``.to_numpy()``; the three arrays are
  averaged element-wise and the result is returned as a DataFrame with a
  single ``'score'`` column holding 1 where the average exceeds 0.5 and 0
  where it is at most 0.5.
  """
  votes = [pred.to_numpy() for pred in (pred_1, pred_2, pred_3)]
  mean_vote = (votes[0] + votes[1] + votes[2]) / 3
  Pred = pd.DataFrame(mean_vote, columns=['score'])
  # The two masks are disjoint, so both can be computed before assigning.
  above = Pred['score'] > 0.5
  at_or_below = Pred['score'] <= 0.5
  Pred.loc[above, 'score'] = 1
  Pred.loc[at_or_below, 'score'] = 0
  return Pred

4-3. test 데이터 예측 진행

In [32]:
# Predict the test set with every model, separately per fan type, and convert
# each model's output to 0/1 labels.
Pred_0_IF = Labeling(model_0_IF.predict(Test_Feature_0))
Pred_2_IF = Labeling(model_2_IF.predict(Test_Feature_2))
Pred_0_SVM = Labeling(model_0_SVM.predict(Test_Feature_0))
Pred_2_SVM = Labeling(model_2_SVM.predict(Test_Feature_2))
# The LOF predictions must come from the LOF models; using the SVM models here
# would count the SVM vote twice and leave the fitted LOF models unused.
Pred_0_LOF = Labeling(model_0_LOF.predict(Test_Feature_0))
Pred_2_LOF = Labeling(model_2_LOF.predict(Test_Feature_2))
# Combine the three detectors per fan type.
Pred_0 = Ensemble(Pred_0_IF,Pred_0_SVM,Pred_0_LOF)
Pred_2 = Ensemble(Pred_2_IF,Pred_2_SVM,Pred_2_LOF)

4-4. 제출 양식에 맞게 하기

In [33]:
def Make_submitFile(pred_0,pred_2):
  """Fill the sample submission table with the per-fan-type predictions.

  Reads ``data/sample_submission.csv`` below ``StdPath`` and, for the rows of
  the global ``Test_info`` whose ``FAN_TYPE`` is 0 (resp. 2), writes the
  values of ``pred_0['score']`` (resp. ``pred_2['score']``) into the
  ``LABEL`` column in order.

  NOTE(review): assumes the submission file's row labels match those of
  Test_info - confirm.
  """
  Final = pd.read_csv(os.path.join(StdPath,'data/sample_submission.csv'),sep=',')
  for fan_type, pred in ((0, pred_0), (2, pred_2)):
    row_labels = Test_info[Test_info.FAN_TYPE==fan_type].index.tolist()
    # The k-th prediction belongs to the k-th Test_info row of this fan type.
    for position, row_label in enumerate(row_labels):
      Final.loc[[row_label],['LABEL']] = pred['score'].iloc[position]
  return Final

In [34]:
# Build the submission table from the two per-fan-type ensemble predictions.
Final = Make_submitFile(pred_0=Pred_0, pred_2=Pred_2)
# Writing the file is left disabled; uncomment the next line to save the submission CSV.
#Final.to_csv(os.path.join(StdPath,'final.csv'),sep=',',index=False)