In [1]:
import pandas as pd
import numpy as np
import os
import time
import xgboost
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from preprocessing import Nielsen

In [2]:
# Location of the data: the 'nielsen_pickle_data(180110)' folder that sits
# three directory levels above the current working directory.
load_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))),
    'nielsen_pickle_data(180110)',
)
final_file_name = 'final_data_171215'

# Seeds
split_seed = 0  # random_state for the row shuffle in split_dataset()
state_seed = 0

# Boundaries used by data_preprocessing() to bucket the target into classes.
devide_points_list = [5986.0, 14399.8, 40391.9]

In [45]:
def split_dataset(datatable, split_point_list, seed=None):
    """Shuffle the rows of ``datatable`` and cut them into consecutive pieces.

    Parameters
    ----------
    datatable : pandas.DataFrame
        Table whose rows are shuffled and then split.
    split_point_list : sequence of int
        Row positions (in the shuffled table) at which to cut, e.g.
        ``[6, 8]`` yields rows ``[0:6]``, ``[6:8]`` and ``[8:]``.
    seed : int, optional
        ``random_state`` for the shuffle. When omitted, the module-level
        ``split_seed`` is used, as before.

    Returns
    -------
    list of pandas.DataFrame
        ``len(split_point_list) + 1`` pieces, in order.
    """
    random_state = split_seed if seed is None else seed
    shuffled = datatable.sample(frac=1, random_state=random_state)
    # Slice positionally instead of handing the DataFrame to np.split:
    # np.split relies on DataFrame.swapaxes, which recent pandas releases
    # deprecate. The slices are the same ones np.split would produce.
    bounds = [0] + list(split_point_list) + [len(shuffled)]
    return [shuffled.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

def print_duration(start):
    """Print how many seconds have passed since ``start``.

    ``start`` is a timestamp in the same units as ``time.time()``.
    """
    elapsed = time.time() - start
    print('{}s 걸림'.format(elapsed))

def make_result_table(datatable, model):
    """Build a table of identifying columns, the target and the model's prediction.

    Parameters
    ----------
    datatable : pandas.DataFrame
        Table whose columns from position 4 onwards are the model features and
        which contains the '일자', '프로그램명', '프로그램편성시작시간' and '목표' columns.
    model : object
        Fitted estimator exposing ``predict(X)``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four columns above plus '예측' (the predictions).
        ``datatable`` itself is not modified.
    """
    id_columns = ['일자', '프로그램명', '프로그램편성시작시간', '목표']
    # Select first, then copy: only the four needed columns are duplicated.
    sub_table = datatable[id_columns].copy()
    # Take the feature block with iloc before converting to an array so the
    # model gets the same kind of matrix data_preprocessing() trains it on;
    # `datatable.values[:, 4:]` would be an object array because of the
    # string column among the first four.
    sub_table['예측'] = model.predict(datatable.iloc[:, 4:].values)
    return sub_table

def make_no_channel_data(data):
    """Return ``data`` restricted to columns without channel/weight markers.

    A column is dropped when its name contains any of the substrings
    '채널', '2_' or '가중치'. The remaining columns keep their original order.
    """
    excluded_markers = ('채널', '2_', '가중치')
    kept_columns = [
        name
        for name in data.columns
        if not any(marker in name for marker in excluded_markers)
    ]
    return data[kept_columns]

def result_table_error_rate(data, rate = 0.2):
    """Share of rows whose prediction is within ``rate`` (relative) of the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the numeric columns '목표' (target) and '예측' (prediction).
        A '차이정도' column holding ``|(목표 - 예측) / 목표|`` is added to it
        in place, as before.
    rate : float, default 0.2
        Strict upper bound on the relative error for a row to count as a hit.

    Returns
    -------
    float
        Fraction of rows with relative error below ``rate``; NaN when ``data``
        has no rows (previously a ZeroDivisionError).
    """
    # Column-wise arithmetic replaces the former row-by-row apply; the formula
    # is unchanged. Rows with a zero target give inf/NaN and never count as hits.
    relative_error = ((data['목표'] - data['예측']) / data['목표']).abs()
    data['차이정도'] = relative_error
    if len(data) == 0:
        return float('nan')
    return int((relative_error < rate).sum()) / len(data)

def change_list_to_tuple(li):
    """Turn a list of cut points into consecutive ``(low, high)`` ranges.

    A leading 0 is added when the first cut point is not 0 (or the list is
    empty), and 100000000000000 is appended as the upper sentinel.

    Parameters
    ----------
    li : list of numbers
        Ascending cut points. The list is NOT modified: the function used to
        insert/append into the caller's list, so repeated calls with the same
        (e.g. module-level) list kept growing it.

    Returns
    -------
    list of tuple
        ``[(b0, b1), (b1, b2), ...]`` over the padded boundaries.
    """
    bounds = list(li)  # work on a copy so the caller's list stays untouched
    if not bounds or bounds[0] != 0:
        bounds.insert(0, 0)
    bounds.append(100000000000000)
    return list(zip(bounds[:-1], bounds[1:]))

def change2Class(x, tup):
    """Map a value to the index of the ``(low, high]`` range that contains it.

    ``tup`` is a sequence of ``(low, high)`` pairs. The index of the last pair
    satisfying ``low < x <= high`` is returned; if no pair matches, 0 is
    returned.
    """
    matches = [position for position, (low, high) in enumerate(tup) if low < x <= high]
    return matches[-1] if matches else 0


def train_regression_model(string, model, dictionary):
    """Fit ``model`` on the 'train' split and print the MSE of every split.

    Parameters
    ----------
    string : str
        Label printed in the header line.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    dictionary : dict
        Maps a split name to an ``(X, y)`` pair; must contain 'train'.

    Returns
    -------
    object
        The same ``model``, now fitted.
    """
    print('{}(regression)'.format(string))
    train_split = dictionary['train']
    model.fit(train_split[0], train_split[1])
    for split_name, split in dictionary.items():
        mse = mean_squared_error(split[1], model.predict(split[0]))
        print("{}      :  {:,}".format(split_name, mse))
    return model


def train_classification_model(string, model, dictionary):
    """Fit ``model`` on the 'train' split and print the accuracy of every split.

    Parameters
    ----------
    string : str
        Label printed in the header line.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    dictionary : dict
        Maps a split name to an ``(X, y)`` pair; must contain 'train'.

    Returns
    -------
    object
        The same ``model``, now fitted.
    """
    print('{}(classification)'.format(string))
    train_split = dictionary['train']
    model.fit(train_split[0], train_split[1])
    for split_name, split in dictionary.items():
        accuracy = accuracy_score(split[1], model.predict(split[0]))
        print("{}      :  {:,}".format(split_name, accuracy))

    return model

def regression(data, cut_rate=0.2):
    """Train an XGBoost regressor and report how often it lands near the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table, passed to ``data_preprocessing`` with
        ``bool_classification = False``.
    cut_rate : float, default 0.2
        Relative-error threshold handed to ``result_table_error_rate``.

    Returns
    -------
    tuple
        ``(fitted_model, result_table)`` where ``result_table`` is the
        test-split table produced by ``make_result_table`` (it also carries
        the column added by ``result_table_error_rate``).
    """
    # Only the raw test split and the x/y dictionary are needed here.
    _, _, _, test_split, splits = data_preprocessing(data, bool_classification = False)

    ## RandomForest (disabled alternative)
    # regr = RandomForestRegressor(n_estimators=20, random_state=state_seed)
    # regr = train_regression_model('RandomForest', regr, dict)

    # XGBoost
    regressor = xgboost.XGBRegressor(max_depth=10, random_state=0)
    fitted = train_regression_model('XgBoost', regressor, splits)

    # Targets vs. predictions on the test split
    result_table = make_result_table(test_split, regressor)

    hit_rate = result_table_error_rate(result_table, cut_rate)
    print('\n목표값과 +- rate가 {}인 경우, Accuracy: {}'.format(cut_rate, hit_rate))
    return fitted, result_table


def classification(data):
    """Train an XGBoost classifier on the class-bucketed target.

    ``data`` is passed to ``data_preprocessing`` with
    ``bool_classification = True``; the fitted classifier returned by
    ``train_classification_model`` is handed back to the caller.
    """
    # Original note: only checks whether the row is terrestrial ('지상파') or not.

    # Only the x/y dictionary of the three splits is used below.
    _, _, _, _, splits = data_preprocessing(data, bool_classification = True)

    # Disabled alternatives kept for reference. They refer to a
    # `data_dictionary` name that this function does not define.
    ## RandomForest
    # randomForest = RandomForestClassifier(n_estimators=10, random_state=0)
    # randomForest_result = train_classification_model('RandomForest', randomForest, data_dictionary)
    # randomForest_result.predict_proba(data_dictionary['test'][0])

    ## AdaBoost
    # adaboost = AdaBoostClassifier(n_estimators=20, random_state=0)
    # adaboost_result = train_classification_model('Adaboost', adaboost, data_dictionary)
    # adaboost_result.predict_proba(data_dictionary['test'][0])

    ## XGBoost
    classifier = xgboost.XGBClassifier(max_depth=10, random_state=0)
    fitted = train_classification_model('XgBoost', classifier, splits)
    # fitted.predict_proba(data_dictionary['test'][0])

    ## Soft voting over the three models (disabled)
    # voting = VotingClassifier(estimators=[
    #     ('random', randomForest)
    #     , ('ada', adaboost)
    #     , (' xgboost', classifier)
    # ]
    #     , voting='soft')
    # voting_result = train_classification_model('Voting', voting, data_dictionary)
    # voting_result.predict_proba(data_dictionary['test'][0])

    return fitted

def data_preprocessing(datatable, bool_classification=False):
    """Derive the '지상파' flag, drop channel columns and build train/validate/test splits.

    Parameters
    ----------
    datatable : pandas.DataFrame
        Raw table containing '채널_1' .. '채널_5' and '목표'. It is copied, not
        modified.
    bool_classification : bool, default False
        When True, '목표' is replaced by a class index derived from the
        module-level ``devide_points_list``.

    Returns
    -------
    tuple
        ``(no_channel_data, d3_train, d3_validate, d3_test, d3_dict)`` where
        ``d3_dict`` maps 'train' / 'validate' / 'test' to ``(x, y)`` arrays.

    Notes
    -----
    x is taken from column position 4 onwards and y from position 3.
    NOTE(review): this assumes '목표' is the 4th column of the filtered
    table -- confirm against the input file.
    """
    data = datatable.copy()

    # 1 when the five channel columns add up to exactly 1, else 0. Computed
    # column-wise with the same left-to-right sum the row-wise apply used.
    channel_total = data['채널_1'] + data['채널_2'] + data['채널_3'] + data['채널_4'] + data['채널_5']
    data['지상파'] = (channel_total == 1).astype(int)

    # Convert the target into class labels.
    if bool_classification:
        # Pass a copy so the module-level devide_points_list is never
        # modified, no matter how often this function is called.
        class_ranges = change_list_to_tuple(list(devide_points_list))
        data['목표'] = data['목표'].apply(lambda x: change2Class(x, class_ranges))

    no_channel_data = make_no_channel_data(data)

    # Shuffle, then split 60% / 20% / 20% into train / validate / test.
    row_count = len(no_channel_data)
    d3_train, d3_validate, d3_test = split_dataset(no_channel_data,
                                                   [int(.6 * row_count), int(.8 * row_count)])

    named_splits = [('train', d3_train), ('validate', d3_validate), ('test', d3_test)]
    d3_dict = {
        split_name: (split.iloc[:, 4:].values, split.iloc[:, 3].values)
        for split_name, split in named_splits
    }

    return no_channel_data, d3_train, d3_validate, d3_test, d3_dict

def change_features(prog_from, prog_to, data_no_channel):
    """Copy selected feature values from one row into a copy of another.

    The positions to copy are those of the columns in
    ``data_no_channel.columns`` whose name contains '가중치', '성별' or '연령'.

    Parameters
    ----------
    prog_from : array-like supporting ``.copy()`` and integer indexing
        Row supplying the values.
    prog_to : array-like supporting ``.copy()`` and integer indexing
        Row receiving the values; it is copied first and left untouched.
    data_no_channel : object with a ``columns`` attribute
        Provides the column names that index positions refer to.

    Returns
    -------
    The modified copy of ``prog_to``.
    """
    source = prog_from.copy()
    target = prog_to.copy()
    keywords = ('가중치', '성별', '연령')
    positions = [
        position
        for position, name in enumerate(data_no_channel.columns)
        if any(keyword in name for keyword in keywords)
    ]
    for position in positions:
        target[position] = source[position]
    return target

In [46]:
# Load the dataset: '<final_file_name>.csv' under load_path, read as cp949.
print('0. data 로드')
data = pd.read_csv(os.path.join(load_path, '{}.csv'.format(final_file_name)), engine='python', encoding='cp949')
# XGBoost gave the best performance, so only XGBoost is used.
# regression() returns the fitted model and its test-split result table.
print('1. regression')
reg_result, reg_result_table = regression(data, 0.2)
# classification() returns the fitted classifier.
print('2. classification')
cl_result = classification(data)

0. data 로드
1. regression
XgBoost(regression)
train      :  342,423,665.41295725
validate      :  1,567,003,610.9663174
test      :  1,535,932,658.2591827

목표값과 +- rate가 0.2인 경우, Accuracy: 0.2456987466662018
///////////////////
2. classification
XgBoost(classification)
train      :  0.716770965886311
validate      :  0.5453135077657887
test      :  0.5403803580455663


예측에 사용할 데이터와 모델 준비 (bool_classification = True 이면 classification, False 이면 regression)

In [64]:
# True -> use the classification target/model, False -> the regression ones.
bool_classification = True
no_channel_data, train, validate, test, dictionary = data_preprocessing(data, bool_classification = bool_classification)
# Pick the already-fitted model that matches the chosen mode.
model = cl_result if bool_classification else reg_result

In [65]:
# Program '백종원의3대천왕': rows dated 20170408 with 지상파 == 1
sbs_a = no_channel_data.loc[
    (no_channel_data['일자'] == 20170408)
    & (no_channel_data['지상파'] == 1)
    & (no_channel_data['프로그램명'] == '백종원의3대천왕')
].values
print('{} 개'.format(sbs_a.shape[0]))
# First four columns of the first match, then the prediction from the rest.
print(sbs_a[0, :4])
print(model.predict(sbs_a[0, 4:].reshape(1, -1)))
if bool_classification:
    print(model.predict_proba(sbs_a[0, 4:].reshape(1, -1)))

1 개
[20170408 '백종원의3대천왕' 71185 3]
[3]
[[  2.66055322e-05   2.61076857e-05   7.22554978e-05   9.99875069e-01]]


In [59]:
# Program '미운우리새끼다시쓰는육아일기': rows dated 20170407 with 지상파 == 1
sbs_b = no_channel_data.loc[
    (no_channel_data['일자'] == 20170407)
    & (no_channel_data['지상파'] == 1)
    & (no_channel_data['프로그램명'] == '미운우리새끼다시쓰는육아일기')
].values
print('{} 개'.format(sbs_b.shape[0]))
# First four columns of the first match, then the prediction from the rest.
print(sbs_b[0, :4])
print(model.predict(sbs_b[0, 4:].reshape(1, -1)))
if bool_classification:
    print(model.predict_proba(sbs_b[0, 4:].reshape(1, -1)))

1 개
[20170407 '미운우리새끼다시쓰는육아일기' 90081 1000196.0000000001]
[ 930544.375]


In [66]:
# Program 'K팝스타더라스트찬스': rows dated 20170409 with 지상파 == 1
sbs_c = no_channel_data.loc[
    (no_channel_data['일자'] == 20170409)
    & (no_channel_data['지상파'] == 1)
    & (no_channel_data['프로그램명'] == 'K팝스타더라스트찬스')
].values
print('{} 개'.format(sbs_c.shape[0]))
# Only the first match is inspected, even when several rows qualify.
print(sbs_c[0, :4])
print(model.predict(sbs_c[0, 4:].reshape(1, -1)))
if bool_classification:
    print(model.predict_proba(sbs_c[0, 4:].reshape(1, -1)))

2 개
[20170409 'K팝스타더라스트찬스' 83536 3]
[3]
[[  3.69283553e-05   3.30287294e-05   5.44007744e-05   9.99875665e-01]]


sbs_a(3대천왕)를 sbs_b(미운우리새끼) 시간대로 옮긴 경우

In [67]:
# Take sbs_b's row and overwrite the columns change_features() selects
# ('가중치' / '성별' / '연령') with sbs_a's values, then predict on the result.
# The column names come from no_channel_data: the previously used
# `cl_no_channel_data` is not defined anywhere in this notebook, so the cell
# failed with NameError on a fresh kernel. Only `.columns` is read from it.
result_1 = change_features(sbs_a[0], sbs_b[0], no_channel_data)
print(model.predict(result_1[4:].reshape(1,-1)))
if bool_classification:
    print(model.predict_proba(result_1[4:].reshape(1,-1)))

[3]
[[  2.63467227e-05   3.14399949e-05   7.77135720e-05   9.99864459e-01]]


sbs_b(미운우리새끼)를 sbs_c(K팝스타) 시간대로 옮긴 경우

In [68]:
# Take sbs_c's row and overwrite the columns change_features() selects
# ('가중치' / '성별' / '연령') with sbs_b's values, then predict on the result.
# The column names come from no_channel_data: the previously used
# `cl_no_channel_data` is not defined anywhere in this notebook, so the cell
# failed with NameError on a fresh kernel. Only `.columns` is read from it.
result_2 = change_features(sbs_b[0], sbs_c[0], no_channel_data)
print(model.predict(result_2[4:].reshape(1,-1)))
if bool_classification:
    print(model.predict_proba(result_2[4:].reshape(1,-1)))

[3]
[[  3.85473541e-05   2.86694940e-05   7.03256883e-05   9.99862432e-01]]
