# AI YouTube Consultant: Preprocessor
<br/><br/>
## Manual
* Load and Transform
```
preprocessor = Preprocessor()
data = preprocessor.fit_transform(raw=raw, return_output=True)
```
* Generate train data
```
result = preprocessor.get_train_data(data, filter_size, target_size, stride)
```                  
* TODO: Generate test data


In [3]:
import os
from datetime import datetime, date
from enum import Enum

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn import preprocessing

In [36]:
# Location of the raw LITE training CSV, relative to this notebook.
path = '../../data/train_raw_LITE.csv'
# Load the raw data; later cells pass `raw` to Preprocessor.fit_transform.
raw = pd.read_csv(path)

In [37]:
# Build a preprocessor and run the full feature-engineering + scaling pipeline.
# fit_transform stores its output on `worker.result`; with the default
# return_output=False it returns nothing.
worker = Preprocessor()
worker.fit_transform(raw)

Preprocess finished.


In [40]:
# Build sliding-window samples from worker.result: 7 input rows per window,
# 1 target row, window start advancing by 2 rows.
train = worker.get_train_data(filter_size=7, target_size=1, stride=2)

In [41]:
# Stack the samples of (up to) the first 50 channels into one frame, with the
# label appended as a `target` column.
# The original loop ignored its index and concatenated train[0] fifty times.
frames = []
for i in range(min(50, len(train))):
    features, target = train[i]
    # Copy so the frames held inside `train` are not mutated.
    sample = features.copy()
    sample['target'] = target.values
    frames.append(sample)
# Concatenate once at the end instead of growing `result` inside the loop.
result = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()

In [42]:
# Persist the assembled training table without the row index.
result.to_csv('train_LITE.csv', index=False)

In [35]:
class Preprocessor:
    """Feature engineering, min-max scaling and window extraction for channel data.

    Typical use::

        worker = Preprocessor()
        worker.fit_transform(raw)
        samples = worker.get_train_data(filter_size=7, target_size=1, stride=2)

    Attributes set by ``fit_transform``: ``raw`` (the input frame), ``result``
    (the transformed copy), ``object_fts`` / ``numeric_fts`` (column groups),
    ``meta`` (per-column min/max before scaling) and ``logs``.
    """

    # Value found in the raw `n_comment` column when comments are disabled.
    _COMMENTS_DISABLED = '댓글 사용 중지'

    def __init__(self):
        self.IS_TRANSFORM = False
        # Record of which transformations were applied.
        self.logs = dict(added_features=[], scaling_method=[])

    def fit_transform(self, raw, return_output=False):
        """Run the whole preprocessing pipeline on ``raw``.

        Args:
            raw: input DataFrame; it is copied, not modified.
            return_output: when True, return the transformed frame.

        Returns:
            ``self.result`` if ``return_output`` is True, otherwise None.
        """
        self.raw = raw
        self.result = self.raw.copy()
        self.object_fts = ['channel', 'title', 'genre', 'description', 'date', 'sign_in', 'is_upload']
        # Start from a clean log so repeated calls do not accumulate duplicates.
        self.logs = dict(added_features=[], scaling_method=[])

        # Add / convert features.
        self.__n_comment_to_float()
        self.__str_to_datetype()
        self.__add_is_upload()
        self.__add_sub_diff()
        self.__add_no_upload_interval()
        self.__add_n_hashtag()

        # Every column not listed in object_fts is scaled; remember min/max first
        # so predictions can be mapped back later.
        self.numeric_fts = self.result.drop(self.object_fts, axis=1).columns.tolist()
        self.__get_min_max_values()
        self.__scale()

        # Flag checked by get_train_data.
        self.IS_TRANSFORM = True

        print('Preprocess finished.')
        if return_output:
            return self.result

    # ADD OR CONVERT FEATURES
    ##################################
    def __n_comment_to_float(self):
        """Convert ``n_comment`` to float, mapping the comments-disabled marker to 0."""
        # Work on the instance's own data; the original read a global `raw`.
        counts = self.result['n_comment'].astype(object)
        counts = counts.where(counts != self._COMMENTS_DISABLED, 0)
        self.result['n_comment'] = counts.astype(float)

    def __str_to_datetype(self):
        """Convert ``date`` to datetime when it is not already datetime64[ns]."""
        if not pd.api.types.is_datetime64_ns_dtype(self.result['date']):
            self.result['date'] = pd.to_datetime(self.result['date'])

    def __add_is_upload(self):
        """Add ``is_upload``: 1 on rows that have a title, 0 otherwise."""
        self.result = self.result.groupby('channel').apply(lambda x: self._get_is_upload(x)).reset_index(drop=True)
        self.logs['added_features'].append('is_upload')

    @staticmethod
    def _get_is_upload(data):
        """Return a re-indexed copy of ``data`` with the ``is_upload`` flag added."""
        frame = data.reset_index(drop=True)
        frame['is_upload'] = frame['title'].notnull().astype(int)
        return frame

    def __add_sub_diff(self):
        """Add ``sub_diff``: row-to-row change of ``cumul_subs`` within each channel."""
        self.result = self.result.groupby('channel').apply(lambda x: self._get_sub_diff(x)).reset_index(drop=True)
        self.logs['added_features'].append('sub_diff')

    @staticmethod
    def _get_sub_diff(data):
        """Return a re-indexed copy of ``data`` with ``sub_diff`` added (first row is NaN)."""
        frame = data.reset_index(drop=True)
        frame['sub_diff'] = frame['cumul_subs'] - frame['cumul_subs'].shift()
        return frame

    # Former name of the helper, kept so existing references keep working.
    _whynot = _get_sub_diff

    def __add_no_upload_interval(self):
        """Add ``no_upload_interval``: rows elapsed since the channel's last upload."""
        self.result = self.result.groupby('channel').apply(lambda x: self._get_no_upload_interval(x)).reset_index(drop=True)
        self.logs['added_features'].append('no_upload_interval')

    @staticmethod
    def _get_no_upload_interval(data):
        """Return a re-indexed copy of ``data`` with ``no_upload_interval`` added.

        The value is 0 on upload rows and on rows before the first upload, and
        counts 1, 2, ... on the rows following an upload.
        """
        frame = data.reset_index(drop=True)
        intervals = []
        since_upload = None  # None until the first upload has been seen
        for flag in frame['is_upload']:
            if flag == 1:
                since_upload = 0
            elif since_upload is not None:
                since_upload += 1
            intervals.append(since_upload or 0)
        frame['no_upload_interval'] = intervals
        return frame

    def __add_n_hashtag(self):
        """Add ``n_hashtag``: number of '#' characters in ``description`` (0 when missing)."""
        self.result['n_hashtag'] = self.result['description'].str.count('#').fillna(0).astype(int)
        self.logs['added_features'].append('n_hashtag')

    def __get_min_max_values(self):
        """Store per-column min and max of the numeric features in ``self.meta``."""
        lows = self.result[self.numeric_fts].min()
        highs = self.result[self.numeric_fts].max()
        self.meta = pd.DataFrame([lows, highs], columns=self.numeric_fts, index=['min', 'max'])

    def __scale(self):
        """Scale the numeric features with sklearn's MinMaxScaler."""
        scaler = preprocessing.MinMaxScaler()
        scaled = scaler.fit_transform(self.result[self.numeric_fts])
        # Replace the columns outright so integer columns become float cleanly.
        self.result[self.numeric_fts] = scaled
        self.logs['scaling_method'] = 'minmax'

    # GET TRAIN DATA
    ##################################
    def get_train_data(self, data=None, filter_size=7, target_size=1, stride=1, drop_features=None, label_features=None):
        """Build sliding-window training samples per channel.

        Args:
            data: frame to use; when None, ``self.result`` is used.
            filter_size: number of consecutive rows in each input window.
            target_size: number of rows after the window used as labels.
            stride: number of rows the window start advances between samples.
            drop_features: columns removed from the inputs; when None a built-in
                default list is used.
            label_features: columns extracted as labels; defaults to ``['sub_diff']``.

        Returns:
            The result of ``groupby('channel').apply`` with its index reset; each
            element is the ``(train, label)`` pair produced by ``_to_sequential``.

        Raises:
            NotImplementedError: if ``fit_transform`` has not been run.
        """
        if not self.IS_TRANSFORM:
            raise NotImplementedError("You need to run 'fit_transform' primarily.")

        # Drop channels too short to yield even one (window + target) sample.
        source = self.result if data is None else data
        data = self._sift(source, filter_size + target_size)

        if drop_features is None:
            drop_features = ['date', 'genre', 'title', 'channel', 'description', 'sign_in', 'current_cumul_view', 'current_n_video', 'current_cumul_subs']
        if label_features is None:
            label_features = ['sub_diff']

        result = data.groupby('channel').apply(lambda x: self._to_sequential(x, filter_size, target_size, stride, drop_features, label_features)).reset_index(drop=True)
        return result

    @staticmethod
    def _sift(data, filter_size):
        """Keep only channels that have at least ``filter_size`` rows."""
        counts = data['channel'].value_counts()
        # `>=`: a channel with exactly `filter_size` rows still yields one sample.
        keep = counts[counts >= filter_size].index
        return data[data['channel'].isin(keep)].reset_index(drop=True)

    @staticmethod
    def _to_sequential(data, filter_size, target_size, stride, drop_features, label_features):
        """Flatten one channel into sliding-window rows.

        Each output row of ``train`` is ``filter_size`` consecutive input rows laid
        side by side; the matching ``label`` row holds ``label_features`` of the
        next ``target_size`` rows.

        Returns:
            ``(train, label)`` DataFrames; ``drop_features`` are removed from ``train``.
        """
        frame = data.reset_index(drop=True)
        n_windows = (len(frame) - filter_size - target_size) // stride + 1

        train_rows, label_rows = [], []
        for w in range(n_windows):
            start = w * stride
            split = start + filter_size
            train_rows.append(frame.iloc[start:split].values.reshape(1, -1))
            label_rows.append(frame.iloc[split:split + target_size][label_features].values.reshape(1, -1))

        # Stack once at the end instead of re-copying the array for every window.
        train = pd.DataFrame(np.vstack(train_rows) if train_rows else [], columns=frame.columns.tolist() * filter_size)
        label = pd.DataFrame(np.vstack(label_rows) if label_rows else [], columns=label_features * target_size)
        return train.drop(drop_features, axis=1), label

    # INVERSE SCALE
    ##################################
    def split_days(self, pred):
        """Undo the min-max scaling of a multi-day prediction frame.

        ``pred`` is expected to repeat the same feature columns once per day,
        which is the layout ``_to_sequential`` gives its label frame.

        Returns:
            A new DataFrame with the same column layout in original units.
        """
        features = list(dict.fromkeys(pred.columns))  # unique names, order kept
        if not features:
            return pred.copy()
        width = len(features)
        days = pred.shape[1] // width
        restored = [self._inverse_scale(pred.iloc[:, d * width:(d + 1) * width]) for d in range(days)]
        return pd.concat(restored, axis=1)

    def _inverse_scale(self, pred):
        """Map scaled values in ``pred`` back using the min/max stored in ``self.meta``."""
        idx_list = pred.columns.tolist()
        low = self.meta[idx_list].loc['min']
        high = self.meta[idx_list].loc['max']
        return (high - low) * pred[idx_list] + low

### Execution
------------

In [131]:
# Absolute locations of the LITE sample and the full raw dump on the author's machine.
lite_path = '/home/mskang/CapstoneUOS/raw/train_raw_LITE.csv'
full_path = '/home/mskang/CapstoneUOS/raw/meta_trend_data(201130).csv'
# Read both CSVs in one pass.
lite, full = (pd.read_csv(csv_path) for csv_path in (lite_path, full_path))

In [132]:
# Fresh preprocessor instance for the execution run below.
preprocessor = Preprocessor()

In [133]:
# fit_transform only returns the transformed frame when return_output=True;
# without it `lite_data` would be None.
lite_data = preprocessor.fit_transform(lite, return_output=True)
#full_data = preprocessor.fit_transform(full, return_output=True)

In [135]:
# 30-row input windows, 10 target rows, window start advancing 5 rows at a time.
result = preprocessor.get_train_data(data=lite_data, filter_size=30, target_size=10, stride=5)

### Pending
-------------------

In [None]:
    def merge_in_samedate(self): 
        """Collapse several videos uploaded by one channel on the same date into one row.

        Rewrites ``self.result`` grouped by (channel, date): string columns take the
        group's first value, the per-video metrics get a mean column and a ``_std``
        column, and every remaining column is averaged.
        """
        
        # 1. Extra feature: number of videos uploaded per day (0 on days without an upload).
        df_video_num=self.add_video_num_in_same_days() # Series of per-day upload counts
        
        # 2. Merge every feature to one row per day (mean, plus an added std feature).
        col_str=['sign_in','title','description','genre'] # string features
        col_std_mean=['duration','video_n_view', 'n_comment', 'n_like', 'n_dislike'] # features that get both mean and std
        col_residue=[x for x in list(self.result.columns) if x not in col_str+col_std_mean+['channel','date']] # everything else, e.g. cumulative views, daily views, upload interval
        
        df_str=self.result.groupby(['channel','date'])[col_str].first() # string features follow the first video (may need revisiting)
        df_std=self.result.groupby(['channel','date'])[col_std_mean].std() # standard-deviation features
        df_mean=self.result.groupby(['channel','date'])[col_std_mean].mean() # mean features
        df_residue=self.result.groupby(['channel','date'])[col_residue].mean() # remaining features; presumably identical within a date (e.g. cumulative views) - confirm
        
        df_std.columns=[x+'_std' for x in list(df_std.columns)] # rename to '<feature>_std'
        
        # Column order: strings, video count, then mean/std interleaved per feature, then the rest.
        col_array=col_str+[df_video_num.name]+[*sum(zip(list(df_mean.columns),list(df_std.columns)),())]+col_residue
        self.result=pd.concat([df_str,df_video_num,df_mean,df_std,df_residue],axis=1)
        self.result=self.result[col_array] # apply the column order
        self.result=self.result.reset_index() # bring channel/date back as columns
    
        
    def add_video_num_in_same_days(self):
        """Return a Series named ``video_num_per_day`` indexed by (channel, date).

        The value is the number of rows in the group, forced to 0 when the group's
        first title is missing (no video that day).
        """
        per_day = self.result.groupby(['channel', 'date'])
        # True for groups whose first row carries no title.
        no_video = per_day['title'].apply(lambda titles: titles.values[0]).isnull().values
        counts = per_day.size().mask(no_video, 0)
        return counts.rename('video_num_per_day')

    def add_title_length(self):
        """Add ``title_length``: character count of each title, NaN when the title is missing."""
        # Test for an actual string: an identity check against ``np.nan`` lets
        # ``None`` and other NaN objects through to ``len()`` and raises.
        self.result['title_length'] = self.result['title'].map(
            lambda title: len(title) if isinstance(title, str) else np.nan
        )

    def add_like_per_view(self):
        """Add ``like_per_view``: likes divided by the video's view count."""
        likes = self.result['n_like']
        views = self.result['video_n_view']
        self.result['like_per_view'] = likes.div(views)
        
    def add_dislike_per_view(self):
        """Add ``dislike_per_view``: dislikes divided by the video's view count."""
        dislikes = self.result['n_dislike']
        views = self.result['video_n_view']
        self.result['dislike_per_view'] = dislikes.div(views)
    
    def add_interest_per_view(self):
        """Add ``interest_per_view``: (likes + dislikes + comments) divided by views."""
        frame = self.result
        reactions = frame['n_like'].add(frame['n_dislike']).add(frame['n_comment'])
        frame['interest_per_view'] = reactions.div(frame['video_n_view'])
        

# TESTER
### Inverse Scaler
---------------

In [None]:
# Separate instance used to try out the inverse-scaling helpers.
test = Preprocessor()

In [None]:
# return_output=True is required to get the transformed frame back;
# the default call returns None.
temp_data = test.fit_transform(raw, return_output=True)

In [None]:
# The pre-scaling min/max table is stored as `meta`; Preprocessor has no `Range` attribute.
test.meta

In [None]:
# First row of the first frame in the first entry of `result`.
# NOTE(review): if `result` is the get_train_data output, result[0][0] is the
# feature frame and result[0][1] the label frame - confirm which is intended.
label = result[0][0].iloc[0,:]

In [None]:
# Number of distinct index entries in `label` (the column names of the row's frame).
len(label.index.unique().tolist())