# Summary 
<br/></br>
### How to Use
-------------
0. Load Data
```
raw = pd.read_csv(path)
```
    
1. Define Preprocessor
```
preprocessor = Preprocessor()
```
2. Transform
```
data = preprocessor.fit_transform(raw)
```    
3. Get Train Data and Label
```
result = preprocessor.get_train_data(data, filter_size, target_size, stride)
```                                                   
<br/></br>

### How It Works
---------------
The following image is a diagram of ```get_train_data```, particulary, ```data.groupby('channel').apply```
![get_train_data](./md_img/get_train_data.jpg)

<br/> As an intepretation of the ```result```, refer the image below.</br>
![result](./md_img/result.jpg)

In [1]:
import os
from datetime import datetime, date
from sklearn import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

### Preprocessor
---------------------

In [2]:
class Preprocessor:
    def __init__(self):
        self.IS_TRANSFORM = False

    def fit_transform(self, raw, return_output=True):
        #initialize variables
        self.raw = raw
        self.result = self.raw.copy()
        self.features_not_numeric = ['channel', 'title', 'genre','description','date', 'sign_in','is_upload']

        #adding more features
        self.__n_comment_to_float()
        self.__str_to_datetype()
        self.__add_is_upload()
        self.__add_sub_diff()
        self.__add_no_upload_interval()
        self.__add_n_hashtag()
        
        #saving min, max values & scale numerical features
        self.features_numeric = self.result.drop(self.features_not_numeric, axis=1).columns.tolist()
        self.__get_min_max_values()
        self.__scale()
        
        #flag to notice that the process has completed and return
        self.IS_TRANSFORM = True
        return self.result if return_output else print('preprocess finished.')
    
    
    
    
    #ADD OR CONVERTING FEATURES
    ##################################
    def __n_comment_to_float(self):
        '''n_comment 칼럼을 float type으로 변환하고, 댓글사용중지는 0으로 변환'''
        self.result['n_comment'] = self.result['n_comment'].loc[(self.result['n_comment'] == '댓글 사용 중지')].apply(lambda x: 0)
        self.result['n_comment'] = self.result['n_comment'].astype(float)
        
        
    def __str_to_datetype(self):
        '''csv파일 로드시 date 컬럼이 str 타입으로 읽혀진 경우 이를 datetype으로 변환'''
        if pd.api.types.is_datetime64_ns_dtype(self.result['date']):
            pass
        else:
            self.result['date'] = pd.to_datetime(self.result['date'])


    def __add_is_upload(self):
        '''해당 날짜에 영상 업로드가 발생했는지(1) 하지않았는지(0)를 담은 변수 생성'''
        self.result = self.result.groupby('channel').apply(lambda x: self._get_is_upload(x)).reset_index(drop=True)
    @staticmethod
    def _get_is_upload(data):
        result = data.reset_index(drop=True)
        upload_idx = result[result['title'].notnull()].index.tolist()
        result['is_upload'] = 0
        result.loc[upload_idx, 'is_upload'] = 1
        return result
    
    
    def __add_sub_diff(self):
        '''일간 구독자 변화량 컬럼을 추가하는 함수''' # *(변rud)
        self.result = self.result.groupby('channel').apply(lambda x: self._whynot(x)).reset_index(drop=True)
    @staticmethod
    def _whynot(data):
        result = data.reset_index(drop=True)
        result['sub_diff'] = (result['cumul_subs'] - result['cumul_subs'].shift())
        return result
    
    
    def __add_no_upload_interval(self):
        self.result = self.result.groupby('channel').apply(lambda x: self._get_no_upload_interval(x)).reset_index(drop=True)
    @staticmethod
    def _get_no_upload_interval(data):
        result = data.reset_index(drop=True)
        upload_idx = result[result['is_upload'] == 1].index.tolist()
        temp = [0 for i in range(result.shape[0])]
        for i in range(len(upload_idx)):
            if i == len(upload_idx)-1:
                former = upload_idx[i]
                temp[former+1:] = [i+1 for i in range(len(temp[former+1:]))]
            else:
                former, latter = upload_idx[i], upload_idx[i+1]
                temp[former+1:latter] = [i+1 for i in range(len(temp[former+1:latter]))]
        result['no_upload_interval'] = temp
        return result
    
    
    def __add_n_hashtag(self):
        '''영상별 해시태그 개수를 담은 변수 생성(영상 미업로드시 0)'''
        self.result['n_hashtage'] = 0
        idx = self.result['description'].notnull()
        self.result.loc[idx, 'n_hashtage'] = self.result.loc[idx, 'description'].apply(lambda x: len(x.split('#'))-1)
    
    
    def __get_min_max_values(self):
        '''Saving min and max values prior to scaling'''
        m = (self.result[self.features_numeric]).min()
        M = (self.result[self.features_numeric]).max()
        self.Range = pd.DataFrame([m, M], columns=self.features_numeric, index=['min','max'])

    
    def __scale(self):
        '''Scaling in between 0 to 1'''
        scaler = preprocessing.MinMaxScaler()
        temp = scaler.fit_transform(self.result[self.features_numeric])
        self.result[self.features_numeric] = pd.DataFrame(temp, columns=self.features_numeric)
    
    
    
    
    #GET TRAIN DATA
    ##################################
    def _extract_at_least_filter(self, data, filter_size):
        '''fillter_size 이상인 채널 추출하기'''
        alive_idx = data['channel'].value_counts()>filter_size # filter_size 이상인애만 
        alive_array = alive_idx[alive_idx==True].index #살릴 channel들 array
        return data[data['channel'].isin(alive_array)].reset_index(drop=True)
    
    
    def get_train_data(self, data, filter_size=7, target_size=1, stride=1, drop_features=None, label_features=None):
        #warn in case fit_transform has not yet performed
        if self.IS_TRANSFORM==False:
            raise NotImplementedError("You need to run 'fit_transform' primarily.")
        
        #remove channels with few information with respect to filter_size and target_size to extract
        data = self._extract_at_least_filter(data, filter_size + target_size)
        
        #drop_features: features to drop for latter use
        #label_features: features to extract as labels
        if drop_features is None:
            drop_features = ['date', 'genre','title', 'channel', 'description',	'sign_in', 'current_cumul_view', 'current_n_video', 'current_cumul_subs']
        if label_features is None:
            label_features = ['sub_diff']
        
        #return train, label set wrt groups
        result = data.groupby('channel').apply(lambda x: self._to_sequential(x, filter_size, target_size, stride, drop_features, label_features)).reset_index(drop=True)
        return result
    @staticmethod
    def _to_sequential(data, filter_size, target_size, stride, drop_features, label_features):
        data = data.reset_index(drop=True)
        idx_list = data.index.tolist()
        
        train, label = [],[]
        for i in range((len(idx_list)-filter_size-target_size)//stride +1):
            train_idx = idx_list[i*stride : i*stride + filter_size]
            label_idx = idx_list[i*stride + filter_size : i*stride + filter_size + target_size]
            train_temp = data.loc[train_idx,:].values.reshape(1,-1).flatten()
            label_temp = data.loc[label_idx,label_features].values.reshape(1,-1).flatten()
            
            train = train_temp.copy() if i == 0 else np.vstack([train, train_temp])
            label = label_temp.copy() if i == 0 else np.vstack([label, label_temp])
            
        train = pd.DataFrame(train, columns = data.columns.tolist()*filter_size)
        label = pd.DataFrame(label, columns = label_features*target_size)
        return train.drop(drop_features, axis=1), label
    
    
    
    
    #INVERSE SCALE
    ##################################
    def split_days(self, pred):
        idx_list = pred.columns.tolist()
        days = pred.shape[1]//len(idx_list)
        for i in range(days):
            pred[idx_list].iloc[:,i*L:(i+1)*L].apply(lambda x: _inverse_scale(x))
            
    def _inverse_scale(self, pred):
        idx_list = pred.columns.tolist()
        Min = self.Range[idx_list].loc['min']
        Max = self.Range[idx_list].loc['max']
        return (Max-Min)*pred[idx_list]+Min

### Execution
------------

In [3]:
lite_path = '/home/mskang/CapstoneUOS/raw/train_raw_LITE.csv'
full_path = '/home/mskang/CapstoneUOS/raw/meta_trend_data(201130).csv'
lite = pd.read_csv(lite_path)
full = pd.read_csv(full_path)

In [4]:
preprocessor = Preprocessor()

In [5]:
lite_data = preprocessor.fit_transform(lite)

In [6]:
temp_features = ['date', 'genre','title', 'channel', 'description',	'sign_in', 'current_cumul_view', 'current_n_video', 'current_cumul_subs']

In [7]:
features = lite_data.drop(temp_features, axis=1).columns.tolist()

In [84]:
features = ['duration', 'n_like']

In [78]:
len(features)

12

In [85]:
result = preprocessor.get_train_data(lite_data, 180, 30, 1, label_features=features)

In [87]:
result[3][0] # X

Unnamed: 0,duration,video_n_view,n_comment,n_like,n_dislike,daily_n_view,cumul_view,cumul_subs,is_upload,sub_diff,...,n_comment.1,n_like.1,n_dislike.1,daily_n_view.1,cumul_view.1,cumul_subs.1,is_upload.1,sub_diff.1,no_upload_interval,n_hashtage
0,,,,,,0.449933,0.0093646,0.0099115,0,,...,,0.00584302,0.0128067,0.450461,0.0204615,0.0206195,1,0.00335537,0,0.0342857
1,,,,,,0.449933,0.00939874,0.00999912,0,0.0066443,...,,0.0107806,0.0100293,0.450743,0.0206148,0.020708,1,0.00667752,0,0.0285714
2,,,,,,0.449807,0.00941439,0.00999912,0,0.00335537,...,,,,0.45071,0.0207633,0.0207965,0,0.00667752,0.00588235,0
3,,,,,,0.449851,0.00943646,0.00999912,0,0.00335537,...,,,,0.450227,0.0208407,0.0209735,0,0.00999967,0.0117647,0
4,,,,,,0.449849,0.00945829,0.00999912,0,0.00335537,...,,,,0.450227,0.0209181,0.0209735,0,0.00335537,0.0176471,0
5,,,,,,0.449827,0.00947689,0.00999912,0,0.00335537,...,,0.000244883,0.000308594,0.450162,0.020986,0.0210619,1,0.00667752,0,0.0457143
6,,,,,,0.449832,0.00949613,0.00999912,0,0.00335537,...,,,,0.449956,0.0210235,0.0210619,0,0.00335537,0.00588235,0
7,,,,,,0.449832,0.00951538,0.00999912,0,0.00335537,...,,0.000153764,0.000154297,0.450356,0.0211198,0.0210619,1,0.00335537,0,0.0285714
8,0.0152925,0.000355274,,0.000410037,0.000154297,0.449773,0.00952596,0.0100434,1,0.00501644,...,,0.00284748,0.00848634,0.450356,0.0211198,0.0210619,1,0.00335537,0,0.0285714
9,,,,,,0.449946,0.00956208,0.0100876,0,0.00501644,...,,8.54243e-05,0.000925783,0.450065,0.0211735,0.0212389,1,0.00999967,0,0.0342857


In [86]:
result[3][1] # Label

Unnamed: 0,duration,n_like,duration.1,n_like.1,duration.2,n_like.2,duration.3,n_like.3,duration.4,n_like.4,...,duration.5,n_like.5,duration.6,n_like.6,duration.7,n_like.7,duration.8,n_like.8,duration.9,n_like.9
0,0.017951,0.010781,,,,,,,0.017573,0.000245,...,,,,,,,0.020050,0.002181,0.024709,0.003952
1,,,,,,,0.017573,0.000245,,,...,,,,,0.020050,0.002181,0.024709,0.003952,0.014215,0.001623
2,,,,,0.017573,0.000245,,,0.005261,0.000154,...,,,0.020050,0.002181,0.024709,0.003952,0.014215,0.001623,,
3,,,0.017573,0.000245,,,0.005261,0.000154,0.011277,0.002847,...,0.020050,0.002181,0.024709,0.003952,0.014215,0.001623,,,,
4,0.017573,0.000245,,,0.005261,0.000154,0.011277,0.002847,0.014915,0.000085,...,0.024709,0.003952,0.014215,0.001623,,,,,,
5,,,0.005261,0.000154,0.011277,0.002847,0.014915,0.000085,0.016510,0.003480,...,0.014215,0.001623,,,,,,,,
6,0.005261,0.000154,0.011277,0.002847,0.014915,0.000085,0.016510,0.003480,,,...,,,,,,,,,,
7,0.011277,0.002847,0.014915,0.000085,0.016510,0.003480,,,0.023575,0.016544,...,,,,,,,,,0.016342,0.007204
8,0.014915,0.000085,0.016510,0.003480,,,0.023575,0.016544,0.005289,0.000171,...,,,,,,,0.016342,0.007204,0.014635,0.003218
9,0.016510,0.003480,,,0.023575,0.016544,0.005289,0.000171,0.008717,0.006321,...,,,,,0.016342,0.007204,0.014635,0.003218,0.017069,0.017501


### Pending
-------------------

In [None]:
    def merge_in_samedate(self): 
        "같은 날에 올린 영상 여러개 하루치로 만들기"
        
        # 1. feature 추가 - 하루에 올린 영상 개수 (안 올린날은 0)
        df_video_num=self.add_video_num_in_same_days() #  하루 올린 영상 개수에 대한 series
        
       # 2. 모든 feature에 대해 하루로 합치기  -->  하루에 올린 3개의 영상을 1개의 영상으로 합친다 (평균이용, 분산feature 추가)
        col_str=['sign_in','title','description','genre'] # 문자열 feature들
        col_std_mean=['duration','video_n_view', 'n_comment', 'n_like', 'n_dislike'] # 평균 + 표준편차까지 추가 할 feature
        col_residue=[x for x in list(self.result.columns) if x not in col_str+col_std_mean+['channel','date']] #그 외 ex-누적 조회수, 일일조회수, 업로드 간격 등
        
        df_str=self.result.groupby(['channel','date'])[col_str].first() #문자열 feature들은 첫번째 동영상을 따른다 (수정 필요할 수도)
        df_std=self.result.groupby(['channel','date'])[col_std_mean].std() # 표준편차에 대한 featrue 생성
        df_mean=self.result.groupby(['channel','date'])[col_std_mean].mean() # 평균으로 feature 생성
        df_residue=self.result.groupby(['channel','date'])[col_residue].mean() # 그 외 feature ex)누적조회수는 같은 날짜 내에서 모두 동일
        
        df_std.columns=[x+'_std' for x in list(df_std.columns)] # feature 이름 수정('~~_std')
        
        col_array=col_str+[df_video_num.name]+[*sum(zip(list(df_mean.columns),list(df_std.columns)),())]+col_residue # feature 순서 섞기 + 조정
        self.result=pd.concat([df_str,df_video_num,df_mean,df_std,df_residue],axis=1)
        self.result=self.result[col_array] # 순서 정렬
        self.result=self.result.reset_index() # index 돌려놓기.
    
        
    def add_video_num_in_same_days(self):
        "하루에 올린 영상 개수"
        grouped=self.result.groupby(['channel','date'])
        video_num= grouped.size() # 하루에 올린 영상 개수에 대한 Series, 
        video_num[grouped['title'].apply(lambda x : x.values[0]).isnull().values]=0 #  동영상 없는 채널은 0으로. 
        video_num.name='video_num_per_day'
        return video_num # 하루에 올린 영상 개수에 대한 Series를 return값으로. 

    def add_title_length(self):
        "제목 길이"
        self.result['title_length']=self.result['title'].apply(lambda x : len(x) if x is not np.nan else (np.nan))

    def add_like_per_view(self):
        " 영상 호감도 - 조회수 대비 좋아요 수  - 단순히 좋아하는 사람의 비율 : 구독자 상승은 좋아하는 사람만 많으면 됨"
        self.result['like_per_view']=self.result['n_like']/self.result['video_n_view']
        
    def add_dislike_per_view(self):
        "영상 비호감도 - 조회수 대비 싫어요 수 - 단순히 싫어하는 사람의 비율"
        self.result['dislike_per_view']=self.result['n_dislike']/self.result['video_n_view']
    
    def add_interest_per_view(self):
        " 영상 참여도 - 조회수 대비 (좋아요+싫어요+댓글) 수 - 영상에 관심이 있는 정도"
        self.result['interest_per_view']=(self.result['n_like']+self.result['n_dislike']+self.result['n_comment'])/self.result['video_n_view']
        

# TESTER
### Inverse Scaler
---------------
Ignore this block if you are not me!

In [None]:
test = Preprocessor()

In [None]:
temp_data = test.fit_transform(raw)

In [None]:
test.Range

In [None]:
label = result[0][0].iloc[0,:]

In [None]:
len(label.index.unique().tolist())

# BUG #1
-------------

In [15]:
def test(data, filter_size, target_size, stride, drop_features, label_features):
        data = data.reset_index(drop=True)
        idx_list = data.index.tolist()
        
        train, label = [],[]
        for i in range((len(idx_list)-filter_size-target_size)//stride +1):
            train_idx = idx_list[i*stride : i*stride + filter_size]
            label_idx = idx_list[i*stride + filter_size : i*stride + filter_size + target_size]
            train_temp = data.loc[train_idx,:].values.reshape(1,-1).flatten()
            label_temp = data.loc[label_idx,label_features].values.reshape(1,-1).flatten()
            
            train = train_temp.copy() if i == 0 else np.vstack([train, train_temp])
            label = label_temp.copy() if i == 0 else np.vstack([label, label_temp])
            
        train = pd.DataFrame(train, columns = data.columns.tolist()*filter_size)
        label = pd.DataFrame(label, columns = label_features*target_size)
        return train.drop(drop_features, axis=1), label
    
drop_features = ['date', 'genre','title', 'channel', 'description',	'sign_in', 'current_cumul_view', 'current_n_video', 'current_cumul_subs']
label_features = ['sub_diff']

In [11]:
full_data = preprocessor.fit_transform(full)

In [12]:
channel_list = full_data.channel.unique()

In [16]:
bug_list = []
for i, name in enumerate(channel_list):
    try:
        train, label = test(full_data.loc[full_data.channel==name], 30,1,5, drop_features, label_features)
    except:
        bug_list.append(i)
        continue

In [17]:
bug_list

[11, 401, 652]

In [21]:
bug1 = full_data.loc[full_data.channel==channel_list[bug_list[0]]]
bug2 = full_data.loc[full_data.channel==channel_list[bug_list[1]]]
bug3 = full_data.loc[full_data.channel==channel_list[bug_list[2]]]

In [25]:
print(f'BUG1:{bug1.shape}\nBUG2:{bug2.shape}\nBUG3:{bug3.shape}')

BUG1:(35, 21)
BUG2:(31, 21)
BUG3:(31, 21)


In [52]:
normal = full_data.loc[full_data.channel==channel_list[31]]

In [26]:
test(bug1, 30,1,5, drop_features, label_features)

ValueError: Shape of passed values is (1, 630), indices imply (630, 630)

In [66]:
temp = bug1.copy()
#temp = normal.copy()

In [74]:
temp = temp.reset_index(drop=True)
idx_list = temp.index.tolist()

train, label = [],[]
for i in range((len(idx_list)-30-1)//5 +1):
    train_idx = idx_list[i*5 : i*5 + 30]
    label_idx = idx_list[i*5 + 30 : i*5 + 30 + 1]
    train_temp = temp.loc[train_idx,:].values.flatten().reshape(1,-1)
    label_temp = temp.loc[label_idx,label_features].values.flatten().reshape(1,-1)

    train = train_temp.copy() if i == 0 else np.vstack([train, train_temp])
    label = label_temp.copy() if i == 0 else np.vstack([label, label_temp])

train = pd.DataFrame(train, columns = temp.columns.tolist()*30)
#train = pd.DataFrame(train)
label = pd.DataFrame(label, columns = label_features*1)
#train = train.drop(drop_features, axis=1)
drop_features = ['date', 'genre','title', 'channel', 'description',	'sign_in', 'current_cumul_view', 'current_n_video', 'current_cumul_subs']
label_features = ['sub_diff']

In [75]:
train.shape

(1, 630)

In [76]:
train

Unnamed: 0,channel,sign_in,date,title,genre,duration,video_n_view,description,n_comment,n_like,...,daily_n_view,cumul_view,cumul_subs,current_cumul_view,current_n_video,current_cumul_subs,is_upload,sub_diff,no_upload_interval,n_hashtage
0,9SCT -PROD. SALINAS-,2019-8-1,2019-11-30 00:00:00,Amén - Salinas (Prod musical y audiovisual 9sct),Music,0.00193004,6.49892e-07,Letra / salinas\ninstrumental / internet\nprod...,,9.62798e-06,...,0.755431,0.00127991,2.61905e-05,3.79113e-06,0.00163452,8.51502e-06,0,0.162068,0.0830946,0
