In [4]:
import os
from datetime import datetime, date
from sklearn import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

## TODO
* log: 어떤 전처리 과정이 일어났는지 기록을 볼 수 있는 log 인스턴스 추가
* 

In [5]:
class Preprocessor:
    """Feature engineering, scaling and windowing for per-channel daily records.

    The helpers operate on a DataFrame holding one row per channel and day.
    Columns read by the code below: ``channel``, ``title``, ``description``,
    ``date``, ``n_comment`` and ``cumul_subs`` (plus whatever is listed as
    non-numeric in ``_fit_transform``).
    """

    def _fit_transform(self, raw):
        """Run the full preprocessing pipeline on a copy of ``raw``.

        Parameters
        ----------
        raw : pandas.DataFrame
            Unprocessed records; it is copied, never modified.

        Returns
        -------
        tuple
            ``(result, scales)`` where ``result`` is the engineered frame with
            every numeric feature min-max scaled and ``scales`` is the
            two-row (``min`` / ``max``) table of the values before scaling.
        """
        result = raw.copy()
        # Columns that must not be scaled.
        features_not_numeric = ['channel', 'title', 'genre','description','date', 'sign_in','is_upload']

        result = self._n_comment_to_float(result)
        result = self._str_to_datetype(result)
        result = self._add_is_upload(result)
        result = self._add_sub_diff(result)
        result = self._add_no_upload_interval(result)
        result = self._add_n_hashtag(result)

        features_numeric = result.drop(features_not_numeric, axis=1).columns.tolist()
        scales = self._get_min_max_values(result, features_numeric)
        # _inverse_scale reads self.Range, which nothing used to assign.
        self.Range = scales
        result = self._scale(result, features_numeric)
        return result, scales

    #ADD OR CONVERTING FEATURES
    ##################################
    def _n_comment_to_float(self, result):
        """Convert ``n_comment`` to float, mapping the "comments disabled" marker to 0.

        The previous implementation assigned back only the rows holding the
        marker, which turned every real comment count into NaN. Real counts
        are now kept; values that cannot be parsed as numbers become NaN.
        """
        disabled = result['n_comment'] == '댓글 사용 중지'
        counts = pd.to_numeric(result['n_comment'], errors='coerce')
        result['n_comment'] = counts.mask(disabled, 0.0).astype(float)
        return result

    def _str_to_datetype(self, result):
        """Make sure ``date`` is a datetime column (CSV loading yields strings)."""
        if not pd.api.types.is_datetime64_any_dtype(result['date']):
            result['date'] = pd.to_datetime(result['date'])
        return result

    @staticmethod
    def _per_channel(frame, func):
        """Apply ``func`` to each channel's rows and stack the results.

        Equivalent to ``groupby('channel').apply(func).reset_index(drop=True)``
        but iterates over the groups explicitly, so ``func`` always receives
        the ``channel`` column regardless of the pandas version.
        """
        pieces = [func(group) for _, group in frame.groupby('channel')]
        if not pieces:
            # Nothing to group: still return a frame carrying the new column(s).
            return func(frame.iloc[0:0])
        return pd.concat(pieces, ignore_index=True)

    def _add_is_upload(self, result):
        """Add ``is_upload``: 1 on rows that have a title, 0 otherwise."""
        return self._per_channel(result, self._get_is_upload)

    def _add_sub_diff(self, result):
        """Add ``sub_diff``: row-to-row change of ``cumul_subs`` within a channel."""
        return self._per_channel(result, self._get_sub_diff)

    def _add_no_upload_interval(self, result):
        """Add ``no_upload_interval``: rows elapsed since the channel's last upload."""
        return self._per_channel(result, self._get_no_upload_interval)

    def _add_n_hashtag(self, result):
        """Add ``n_hashtage``: number of ``#`` characters in the description.

        Rows without a description get 0. The column name (including its
        spelling) is part of the produced schema and is kept as is.
        """
        result['n_hashtage'] = 0
        has_text = result['description'].notnull()
        result.loc[has_text, 'n_hashtage'] = result.loc[has_text, 'description'].apply(lambda text: text.count('#'))
        return result

    @staticmethod
    def _get_is_upload(data):
        """Flag the rows of one channel that carry a title (``is_upload`` = 1)."""
        result = data.reset_index(drop=True)
        result['is_upload'] = result['title'].notnull().astype(int)
        return result

    @staticmethod
    def _get_sub_diff(data):
        """Difference of ``cumul_subs`` to the previous row; the first row is NaN."""
        result = data.reset_index(drop=True)
        result['sub_diff'] = result['cumul_subs'].diff()
        return result

    @staticmethod
    def _get_no_upload_interval(data):
        """Count rows since the most recent upload within one channel.

        Upload rows and every row before the first upload get 0; after an
        upload the counter runs 1, 2, 3, ... until the next upload.
        """
        result = data.reset_index(drop=True)
        gaps = []
        since_upload = None  # None until the first upload has been seen
        for uploaded in (result['is_upload'] == 1).tolist():
            if uploaded:
                since_upload = 0
            elif since_upload is not None:
                since_upload += 1
            gaps.append(0 if since_upload is None else since_upload)
        result['no_upload_interval'] = gaps
        return result

    def _get_min_max_values(self, result, features_numeric):
        """Return a two-row frame (``min`` / ``max``) of the columns before scaling."""
        lows = result[features_numeric].min()
        highs = result[features_numeric].max()
        return pd.DataFrame([lows, highs], columns=features_numeric, index=['min','max'])

    def _scale(self, result, features_numeric):
        """Min-max scale ``features_numeric`` in place and return the frame."""
        scaler = preprocessing.MinMaxScaler()
        scaled = scaler.fit_transform(result[features_numeric])
        # Reuse the frame's own index so the assignment cannot misalign rows.
        result[features_numeric] = pd.DataFrame(scaled, columns=features_numeric, index=result.index)
        return result

    #GET TRAIN DATA
    ##################################
    def _sift(self, data, filter_size):
        """Keep only channels that have more than ``filter_size`` rows.

        NOTE(review): the comparison is strict (``>``) although the original
        docstring said "at least"; kept as is so the selected channels do not
        change - confirm which one is intended.
        """
        counts = data['channel'].value_counts()
        alive = counts[counts > filter_size].index
        return data[data['channel'].isin(alive)].reset_index(drop=True)

    def _extract_train_data(self, filter_size=7, target_size=1, stride=1, drop_features=None, targets=None):
        """Build sliding-window train/label frames for every sufficiently long channel.

        Parameters
        ----------
        filter_size : int
            Number of consecutive rows flattened into one training sample.
        target_size : int
            Number of rows following the window used as labels.
        stride : int
            Step between the starts of consecutive windows.
        drop_features : list, optional
            Columns removed from the flattened training frame.
        targets : list or str, optional
            Label column name(s); defaults to ``['sub_diff']``.

        Returns
        -------
        pandas.Series
            One ``(train, label)`` tuple per channel, indexed 0..n-1.
        """
        # Channels too short for a window plus its labels are removed first.
        data = self._sift(self.result, filter_size + target_size)

        if drop_features is None:
            drop_features = ['date', 'genre','title', 'channel', 'description',	'sign_in', 'current_cumul_view', 'current_n_video', 'current_cumul_subs']
        if targets is None:
            targets = ['sub_diff']

        pairs = [
            self._to_sequential(group, filter_size, target_size, stride, drop_features, targets)
            for _, group in data.groupby('channel')
        ]
        # Fill an object array slot by slot so each tuple is stored as one element.
        slots = np.empty(len(pairs), dtype=object)
        for position, pair in enumerate(pairs):
            slots[position] = pair
        return pd.Series(slots, dtype=object)

    @staticmethod
    def _to_sequential(data, filter_size, target_size, stride, drop_features, targets):
        """Flatten one channel into overlapping (window, label) samples.

        Each sample concatenates ``filter_size`` consecutive rows into a single
        row; its label is the ``targets`` values of the ``target_size`` rows
        that follow. Returns ``(train, label)`` DataFrames.
        """
        if isinstance(targets, str):
            targets = [targets]
        else:
            targets = list(targets)

        data = data.reset_index(drop=True)
        n_windows = (len(data) - filter_size - target_size)//stride + 1

        train_rows, label_rows = [], []
        for window in range(n_windows):
            start = window*stride
            split = start + filter_size
            train_rows.append(data.iloc[start:split].values.reshape(-1))
            label_rows.append(data.iloc[split:split + target_size][targets].values.reshape(-1))

        # Stack once instead of growing an array inside the loop.
        train = np.vstack(train_rows) if train_rows else []
        label = np.vstack(label_rows) if label_rows else []

        train = pd.DataFrame(train, columns=data.columns.tolist()*filter_size)
        label = pd.DataFrame(label, columns=targets*target_size)
        return train.drop(drop_features, axis=1), label

    def _combine(self, result):
        """Concatenate per-channel ``(train, label)`` pairs into two frames."""
        trains = [result[i][0] for i in range(len(result))]
        labels = [result[i][1] for i in range(len(result))]
        return (pd.concat(trains), pd.concat(labels))

    def _split_days(self, pred):
        """Map a multi-day prediction frame back to the original scale.

        Assumes the columns are the target names repeated once per predicted
        day (the layout of the label frames built by ``_to_sequential``).
        The frame is cut into one chunk per day, each chunk is passed through
        ``_inverse_scale`` and the chunks are joined again in order.

        The previous body referenced an undefined ``L``, called
        ``_inverse_scale`` without ``self`` and returned nothing.
        """
        names = list(dict.fromkeys(pred.columns.tolist()))
        width = len(names)
        if width == 0:
            return pred.copy()
        days = pred.shape[1]//width
        chunks = [self._inverse_scale(pred.iloc[:, day*width:(day + 1)*width]) for day in range(days)]
        return pd.concat(chunks, axis=1)

    def _inverse_scale(self, pred):
        """Undo min-max scaling for the columns of ``pred``.

        The min/max table is read from ``self.Range`` and, when that is not
        set, from ``self.scales``.

        Raises
        ------
        AttributeError
            If neither attribute is available.
        """
        table = getattr(self, 'Range', None)
        if table is None:
            table = getattr(self, 'scales', None)
        if table is None:
            raise AttributeError("no scale table available: run _fit_transform first")

        names = pred.columns.tolist()
        low = table.loc['min', names].to_numpy(dtype=float)
        high = table.loc['max', names].to_numpy(dtype=float)
        # Positional broadcast per column, so repeated column names are fine.
        return pred*(high - low) + low

    
class DataLoader(Preprocessor):
    """Read a raw CSV, preprocess it and expose sliding-window train/label sets.

    Attributes
    ----------
    raw : pandas.DataFrame
        The CSV exactly as read from ``path``.
    result : pandas.DataFrame
        The preprocessed, scaled frame returned by ``_fit_transform``.
    scales : pandas.DataFrame
        ``min`` / ``max`` rows of the numeric features before scaling.
    train, label : pandas.DataFrame or None
        Combined windows of all channels; ``None`` until ``get_train_data``
        has been called.
    """

    def __init__(self, path):
        super().__init__()
        self.raw = pd.read_csv(path)
        self.result, self.scales = self._fit_transform(self.raw.copy())
        # Filled by get_train_data(); defined here so the state is explicit.
        self._temp = None
        self.train = None
        self.label = None

    def get_train_data(self, filter_size=7, target_size=1, stride=1, drop_features=None, targets=None):
        """Build the per-channel windows and store them on the instance.

        Sets ``self.train`` and ``self.label`` (all channels combined) and
        keeps the per-channel pairs for ``get_channel``. Returns ``None``.
        """
        self._temp = self._extract_train_data(filter_size, target_size, stride, drop_features, targets)
        self.train, self.label = self._combine(self._temp)

    def get_channel(self, index):
        """Return the ``(train, label)`` frames of the channel at ``index``.

        Raises
        ------
        AttributeError
            If ``get_train_data`` has not been called yet.
        """
        windows = getattr(self, '_temp', None)
        if windows is None:
            raise AttributeError("no windows available: call get_train_data() before get_channel()")
        pair = windows[index]
        return pair[0], pair[1]

In [None]:
class DataLoader(Preprocessor):
    """Read a raw CSV, preprocess it and hand out sliding-window data sets.

    Attributes
    ----------
    raw : pandas.DataFrame
        The CSV exactly as read from ``path``.
    result : pandas.DataFrame
        The preprocessed, scaled frame returned by ``_fit_transform``.
    scales : pandas.DataFrame
        ``min`` / ``max`` rows of the numeric features before scaling.
    train, label : pandas.DataFrame or None
        Combined windows of all channels; ``None`` until data was requested.
    """

    def __init__(self, path):
        super().__init__()
        self.raw = pd.read_csv(path)
        self.result, self.scales = self._fit_transform(self.raw.copy())
        # Filled by get_data(); defined here so the state is explicit.
        self._temp = None
        self.train = None
        self.label = None

    def get_data(self, filter_size=7, target_size=1, stride=1, drop_features=None, targets=None,
                 by:str=None, channels:list=None, option:str='train', drop_id=False, random_state=None):
        '''
        Build sliding-window data according to the given settings.

        As a side effect ``self.train`` / ``self.label`` are refreshed with
        the windows of all channels, and the per-channel pairs are kept for
        ``get_channel``.

        Args
        ---
        filter_size: int, number of consecutive rows flattened into one sample
        target_size: int, number of following rows used as labels
        stride: int, step between the starts of consecutive windows
        drop_features: list, columns removed from the training frame
        targets: list or str, name(s) of the target column(s)
        by: 'channel' or None
           - 'channel': only the channels whose indices are listed in
             ``channels`` are returned
           - None (default): data is built from all channels
        channels: list of channel indices, used only when by='channel'
        option: 'train' or 'test'
           - 'train': the result also carries the target values
        drop_id: bool, accepted but not used by the current implementation
        random_state: int, accepted but not used by the current implementation

        Return
        ---
        dict with key ``data`` and, for option='train', key ``targets``

        Raises
        ---
        ValueError: on an unknown ``by`` / ``option`` value, or when
            by='channel' is requested without ``channels``
        '''
        if by not in (None, 'channel'):
            raise ValueError("by must be None or 'channel'")
        if option not in ('train', 'test'):
            raise ValueError("option must be 'train' or 'test'")
        if by == 'channel' and not channels:
            raise ValueError("channels must list the channel indices when by='channel'")

        self._temp = self._extract_train_data(filter_size, target_size, stride, drop_features, targets)
        self.train, self.label = self._combine(self._temp)

        if by == 'channel':
            picked = [self._temp[index] for index in channels]
            data, labels = self._combine(picked)
        else:
            data, labels = self.train, self.label

        if option == 'train':
            return dict(data=data, targets=labels)
        else:
            return dict(data=data)

    def get_train_data(self, filter_size=7, target_size=1, stride=1, drop_features=None, targets=None):
        """Populate ``self.train`` / ``self.label`` from all channels; returns ``None``.

        Same signature as the ``get_train_data`` of the loader defined in the
        preceding cell, so code written against that loader keeps working.
        """
        self.get_data(filter_size, target_size, stride, drop_features, targets)

    def get_channel(self, index):
        """Return the ``(train, label)`` frames of the channel at ``index``.

        Raises
        ------
        AttributeError
            If no data has been built yet.
        """
        windows = getattr(self, '_temp', None)
        if windows is None:
            raise AttributeError("no windows available: call get_data() before get_channel()")
        pair = windows[index]
        return pair[0], pair[1]

### How to Use
-------------
1. Create a loader
```
data = DataLoader(path)
```
2. Build the train and label sets
```
data.get_train_data(filter_size, target_size, stride, drop_features, targets)
```
<br/><br/>

### NOTE
--------------
```data.train```: Train Set <br/> 
```data.label```: Label Set <br/>
```data.get_channel(index)```: Get the Train Set and Label Set for a specific channel <br/>
```data.scales```: Original Scales

In [6]:
# Machine-specific absolute path to the CSV file handed to the loader; adjust for your environment.
path = '/home/mskang/CapstoneUOS/raw/train_raw_LITE.csv'

In [7]:
# Read and preprocess the CSV; the loader class defined in this notebook is DataLoader.
data = DataLoader(path)

In [12]:
# filter_size=60, target_size=30, stride=60
data.get_train_data(60, 30, 60)

In [13]:
# Fetch the (train, label) frames stored for the channel at index 10.
train, label = data.get_channel(10)

In [14]:
# Display the training frame returned by get_channel.
train

Unnamed: 0,duration,video_n_view,n_comment,n_like,n_dislike,daily_n_view,cumul_view,cumul_subs,is_upload,sub_diff,...,n_comment.1,n_like.1,n_dislike.1,daily_n_view.1,cumul_view.1,cumul_subs.1,is_upload.1,sub_diff.1,no_upload_interval,n_hashtage
0,,,,,,0.450422,0.00538026,0.0240708,0,,...,,,,0.454492,0.0216238,0.0340708,0,0.00999967,0.0,0
1,,,,,,0.451844,0.021939,0.0341593,0,0.00667752,...,,,,0.450422,0.0355262,0.0415044,0,0.00999967,0.0,0
2,,,,,,0.451274,0.0357575,0.0415044,0,0.00335537,...,,,,0.449866,0.0475445,0.0448673,0,0.00335537,0.0529412,0
3,,,,,,0.4499,0.0475738,0.0448673,0,0.00335537,...,,,,0.455364,0.0648272,0.0517699,0,0.0166772,0.00588235,0


In [15]:
# Display the label frame returned by get_channel.
label

Unnamed: 0,sub_diff,sub_diff.1,sub_diff.2,sub_diff.3,sub_diff.4,sub_diff.5,sub_diff.6,sub_diff.7,sub_diff.8,sub_diff.9,...,sub_diff.10,sub_diff.11,sub_diff.12,sub_diff.13,sub_diff.14,sub_diff.15,sub_diff.16,sub_diff.17,sub_diff.18,sub_diff.19
0,0.006678,0.01,0.006678,0.003355,0.01,0.003355,0.006678,0.003355,0.006678,0.02661,...,0.01,0.003355,0.003355,0.033255,0.003355,0.013322,0.003355,0.003355,0.006678,0.016644
1,0.003355,0.006678,0.003355,0.003355,0.01,0.013322,0.006678,0.006644,0.003355,0.006711,...,0.003355,0.003355,0.003355,0.006678,0.003355,0.003355,0.016644,0.003355,0.003355,0.003355
2,0.003355,0.003355,0.016644,0.016644,0.003355,0.003355,0.006678,0.011661,0.011661,0.005016,...,0.003355,0.003355,0.003355,0.011661,0.011661,0.003355,0.005016,0.005016,0.003355,0.003355
3,0.006678,0.003355,0.005016,0.003355,0.005016,0.01,0.01,0.003355,0.003355,0.003355,...,0.004983,0.005016,0.013355,0.003355,0.003355,0.003355,0.003355,0.003355,0.006644,0.010033
