# 2 Feature Engineering

## 2-1 根據基本總分產生的初步TCRI等級

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [2]:
df = pd.read_csv("data/TCRI_2008_2017_prob.csv", encoding="utf-8")
df["c_0"] = df["c_0"].astype(float)  # 讀取後將欄位弄成float
df["c_1"] = df["c_1"].astype(float)
df["c_2"] = df["c_2"].astype(float)
df["c_3"] = df["c_3"].astype(float)
df["c_4"] = df["c_4"].astype(float)
df["TCRI"] = df["TCRI"].astype(float)
print(df.shape)
df.tail(2)

(2288, 20)


Unnamed: 0,公司,年月,TCRI,總分,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,平均收款天增減,平均售貨天增減,借款依存度%,營收,TSE產業別,c_0,c_1,c_2,c_3,c_4
2286,9943 好樂迪,2017/12/29,5.0,711.0,5.2,0,0.0,15.93,17.41,6.53,1.0,0.0,0.0,29.0,16,0.0,0.0,0.0,0.0,0.0
2287,9946 三發地產,2017/12/29,7.0,243.0,-8.71,1,7.49,3.83,20.42,2.51,9.0,1437.0,102.0,17.0,14,1.0,1.4331459999999998e-20,1.467837e-13,2.940595e-25,2.578725e-17


In [3]:
def transformBasicTCRILevel(value):
    level = 0
    if value < 165.0:
        level = 9
    elif value < 250.0:
        level = 8
    elif value < 335.0:
        level = 7
    elif value < 420.0:
        level = 6
    elif value < 505.0:
        level = 5
    elif value < 590.0:
        level = 4
    elif value < 675.0:
        level = 3
    elif value < 760.0:
        level = 2
    else:
        level = 1
    return level

In [4]:
basicTCRI = []
for score in df['總分']:
    basicTCRI.append(transformBasicTCRILevel(score))

In [5]:
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse = sqrt(mean_squared_error(basicTCRI, df['TCRI']))
print(rmse)

2.3138757483972596


In [6]:
# 用初步等級將總分替換
df.drop(['總分'], axis=1, inplace=True)
df['初步等級'] = basicTCRI
df.head()

Unnamed: 0,公司,年月,TCRI,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,平均收款天增減,平均售貨天增減,借款依存度%,營收,TSE產業別,c_0,c_1,c_2,c_3,c_4,初步等級
0,1101 台泥,2008/12/31,3.0,80.43,0,3.51,2.93,6.64,3.72,-2.0,-2.0,88.0,785.0,1,0.0,0.0,0.0,0.0,0.0,3
1,1201 味全,2008/12/31,5.0,10.75,0,0.54,4.62,4.04,3.27,-2.0,1.0,63.0,202.0,2,0.0,0.0,0.0,0.0,0.0,3
2,1210 大成,2008/12/31,4.0,23.25,0,0.42,5.93,1.96,4.42,-5.0,-9.0,43.0,712.0,2,0.0,0.0,0.0,0.0,0.0,2
3,1215 卜蜂,2008/12/31,4.0,1.77,0,0.27,5.49,2.37,3.73,0.0,-1.0,38.0,154.0,2,0.0,0.0,0.0,0.0,0.0,4
4,1216 統一,2008/12/31,4.0,178.78,0,1.38,4.02,4.95,3.56,-1.0,1.0,125.0,2997.0,2,0.0,0.0,0.0,0.0,0.0,3


## 2-2 門檻等級：規模

In [7]:
thresholdLevel = []
for i in range(len(df)):
    rev = df['營收'].iloc[i]
    level = df['初步等級'].iloc[i]
    threshold = 0
    
    # 營收不到5億，則不列入前5等；電子業門檻較高，門檻為10億
    if df['TSE產業別'].iloc[i] == 28: # 電子零組件業產業別代碼為28
        threshold = 10
    else:
        threshold = 5
    if rev < threshold and level <= 5:
        level = 6
        
    # 營收不到10億，則不列入前4等；電子業門檻較高，門檻為20億
    if df['TSE產業別'].iloc[i] == 28: # 電子零組件業產業別代碼為28
        threshold = 20
    else:
        threshold = 10
    if rev < threshold and level <= 4:
        level = 5
    
    # 營收不到30億，則不列入前3等；電子業門檻較高，門檻為40億
    if df['TSE產業別'].iloc[i] == 28: # 電子零組件業產業別代碼為28
        threshold = 40
    else:
        threshold = 30
    if rev < threshold and level <= 3:
        level = 4
        
    # 營收不到100億，則不列入前2等
    threshold = 100
    if rev < threshold and level <= 2:
        level = 3
        
    # 營收不到200億，則不列入前1等
    threshold = 200
    if rev < threshold and level <= 1:
        level = 2
    
    # 補充電子業門檻較高原因：受景氣影響較深，除非擁有關鍵技術或特殊的利基市場、獨門生意，否則一律適用較嚴格的規模門檻。
    
    thresholdLevel.append(level)

df['門檻等級_規模'] = thresholdLevel
df.head()

Unnamed: 0,公司,年月,TCRI,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,平均收款天增減,...,借款依存度%,營收,TSE產業別,c_0,c_1,c_2,c_3,c_4,初步等級,門檻等級_規模
0,1101 台泥,2008/12/31,3.0,80.43,0,3.51,2.93,6.64,3.72,-2.0,...,88.0,785.0,1,0.0,0.0,0.0,0.0,0.0,3,3
1,1201 味全,2008/12/31,5.0,10.75,0,0.54,4.62,4.04,3.27,-2.0,...,63.0,202.0,2,0.0,0.0,0.0,0.0,0.0,3,3
2,1210 大成,2008/12/31,4.0,23.25,0,0.42,5.93,1.96,4.42,-5.0,...,43.0,712.0,2,0.0,0.0,0.0,0.0,0.0,2,2
3,1215 卜蜂,2008/12/31,4.0,1.77,0,0.27,5.49,2.37,3.73,0.0,...,38.0,154.0,2,0.0,0.0,0.0,0.0,0.0,4,4
4,1216 統一,2008/12/31,4.0,178.78,0,1.38,4.02,4.95,3.56,-1.0,...,125.0,2997.0,2,0.0,0.0,0.0,0.0,0.0,3,3


## 2-2 APL 財務壓力計

### 2-2-1 Assets Management

In [8]:
# # 確認有無離群值
# plt.subplot(1, 2, 1)
# plt.boxplot(df['平均收款天增減'])
# plt.subplot(1, 2, 2)
# plt.boxplot(df['平均售貨天增減'])
# plt.tight_layout()

In [9]:
from scipy import stats
index_temp_1 = df[(np.abs(stats.zscore(df['平均收款天增減'])) > 3)].index
index_temp_2 = df[(np.abs(stats.zscore(df['平均售貨天增減'])) > 3)].index
# display(index_temp_1)
# display(index_temp_2)

# 聯集：讓index不重複
def find_union(lst1, lst2):
    final_list = list(set(lst1) | set(lst2)) 
    return final_list

index_to_drop = find_union(index_temp_1, index_temp_2)

df.drop(index=index_to_drop, axis=1, inplace=True)
df.head(2)

Unnamed: 0,公司,年月,TCRI,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,平均收款天增減,...,借款依存度%,營收,TSE產業別,c_0,c_1,c_2,c_3,c_4,初步等級,門檻等級_規模
0,1101 台泥,2008/12/31,3.0,80.43,0,3.51,2.93,6.64,3.72,-2.0,...,88.0,785.0,1,0.0,0.0,0.0,0.0,0.0,3,3
1,1201 味全,2008/12/31,5.0,10.75,0,0.54,4.62,4.04,3.27,-2.0,...,63.0,202.0,2,0.0,0.0,0.0,0.0,0.0,3,3


In [10]:
# plt.subplot(1, 2, 1)
# plt.boxplot(df['平均收款天增減'])
# plt.subplot(1, 2, 2)
# plt.boxplot(df['平均售貨天增減'])
# plt.tight_layout()

In [11]:
# A1. 平均收帳天數比前一期慢 >= 20 天
# df[df['平均收款天增減'] >= 20]
A1 = []
for d in df['平均收款天增減']:
    if d >= 20:
        A1.append(int(d / 20))
    else:
        A1.append(0)
df['A1'] = A1

In [12]:
# A3. 平均售貨天數比前一期慢 >= 20 天
A3 = []
for d in df['平均售貨天增減']:
    if d >= 20:
        A3.append(int(d / 20))
    else:
        A3.append(0)
df['A3'] = A3

### 2-2-2 Profitability

In [13]:
# P1. 當期的總資產報酬率(ROA)不敷借款息
# df[df['資產報酬%'] - df['借款息%'] <= 0]
P1 = []
for i in range(len(df)):
    if df['資產報酬%'].iloc[i] - df['借款息%'].iloc[i] <= 0:
        P1.append(True)
    else:
        P1.append(False)

In [14]:
# P2. 當期的營業利益率(OP%)不足支應利息支出率(INT%)
# df[df['營業利益%'] - df['利息支出率'] <= 0]
P2 = []
for i in range(len(df)):
    if df['營業利益%'].iloc[i] - df['利息支出率'].iloc[i] <= 0:
        P2.append(True)
    else:
        P2.append(False)

In [15]:
'''
P發生的case
(1) P1且P2
(2) 連續3期P1（先略過）
'''
P_case1 = []
for i in range(len(df)):
    if P1[i] and P2[i]:
        P_case1.append(1)
    else:
        P_case1.append(0)
df['P'] = P_case1
df.head()

Unnamed: 0,公司,年月,TCRI,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,平均收款天增減,...,c_0,c_1,c_2,c_3,c_4,初步等級,門檻等級_規模,A1,A3,P
0,1101 台泥,2008/12/31,3.0,80.43,0,3.51,2.93,6.64,3.72,-2.0,...,0.0,0.0,0.0,0.0,0.0,3,3,0,0,0
1,1201 味全,2008/12/31,5.0,10.75,0,0.54,4.62,4.04,3.27,-2.0,...,0.0,0.0,0.0,0.0,0.0,3,3,0,0,0
2,1210 大成,2008/12/31,4.0,23.25,0,0.42,5.93,1.96,4.42,-5.0,...,0.0,0.0,0.0,0.0,0.0,2,2,0,0,0
3,1215 卜蜂,2008/12/31,4.0,1.77,0,0.27,5.49,2.37,3.73,0.0,...,0.0,0.0,0.0,0.0,0.0,4,4,0,0,0
4,1216 統一,2008/12/31,4.0,178.78,0,1.38,4.02,4.95,3.56,-1.0,...,0.0,0.0,0.0,0.0,0.0,3,3,0,0,0


### 2-2-3 Liquidity

In [16]:
# L1. 借款依存度(DE%)超過100%
# df[df['借款依存度%'] > 100]
L1 = []
for l in df['借款依存度%']:
    if l > 100:
        L1.append(1)
    else:
        L1.append(0)
df['L1'] = L1

In [17]:
# L3. CFO為負，不足資金 < -0.25億
# df[df['CFO'] < -0.25]
L3 = []
for l in df['CFO']:
    if l < 0.25:
        L3.append(1)
    else:
        L3.append(0)
df['L3'] = L3

### 把TCRI等級量化

In [18]:
df["score"] = 0.0
df.loc[df['TCRI'] ==  0.0, 'score'] = 100.0
df.loc[df['TCRI'] ==  9.0, 'score'] = 100.0
df.loc[df['TCRI'] ==  8.0, 'score'] = 200.0
df.loc[df['TCRI'] ==  7.0, 'score'] = 300.0
df.loc[df['TCRI'] ==  6.0, 'score'] = 400.0
df.loc[df['TCRI'] ==  5.0, 'score'] = 500.0
df.loc[df['TCRI'] ==  4.0, 'score'] = 600.0
df.loc[df['TCRI'] ==  3.0, 'score'] = 700.0
df.loc[df['TCRI'] ==  2.0, 'score'] = 800.0
df.loc[df['TCRI'] ==  1.0, 'score'] = 900.0

In [19]:
df.head()

Unnamed: 0,公司,年月,TCRI,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,平均收款天增減,...,c_3,c_4,初步等級,門檻等級_規模,A1,A3,P,L1,L3,score
0,1101 台泥,2008/12/31,3.0,80.43,0,3.51,2.93,6.64,3.72,-2.0,...,0.0,0.0,3,3,0,0,0,0,0,700.0
1,1201 味全,2008/12/31,5.0,10.75,0,0.54,4.62,4.04,3.27,-2.0,...,0.0,0.0,3,3,0,0,0,0,0,500.0
2,1210 大成,2008/12/31,4.0,23.25,0,0.42,5.93,1.96,4.42,-5.0,...,0.0,0.0,2,2,0,0,0,0,0,600.0
3,1215 卜蜂,2008/12/31,4.0,1.77,0,0.27,5.49,2.37,3.73,0.0,...,0.0,0.0,4,4,0,0,0,0,0,600.0
4,1216 統一,2008/12/31,4.0,178.78,0,1.38,4.02,4.95,3.56,-1.0,...,0.0,0.0,3,3,0,0,0,1,0,600.0


# 3 訓練前的準備

## 3-1 Resample imbalanced labels
https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets

In [20]:
df['TCRI'].value_counts()

4.0    509
5.0    440
6.0    398
7.0    270
3.0    206
8.0    168
9.0    102
0.0     80
2.0     45
1.0     40
Name: TCRI, dtype: int64

In [21]:
# df['TCRI'].value_counts().plot(kind='bar')

In [22]:
# Class count
count_TCRI_4 = 2*len(df[df['TCRI'] == 4])


# Divide by class
df_TCRI_0 = df[df['TCRI'] == 0.0]
df_TCRI_1 = df[df['TCRI'] == 1]
df_TCRI_2 = df[df['TCRI'] == 2]
df_TCRI_3 = df[df['TCRI'] == 3]
df_TCRI_4 = df[df['TCRI'] == 4]
df_TCRI_5 = df[df['TCRI'] == 5]
df_TCRI_6 = df[df['TCRI'] == 6]
df_TCRI_7 = df[df['TCRI'] == 7]
df_TCRI_8 = df[df['TCRI'] == 8]
df_TCRI_9 = df[df['TCRI'] == 9]

In [23]:
# 把每個類別都複製到跟4類別一樣多

df_TCRI_0_over = df_TCRI_0.sample(n=count_TCRI_4, replace=True, random_state=2018, )
df_TCRI_1_over = df_TCRI_1.sample(n=count_TCRI_4, replace=True, random_state=2018, )
df_TCRI_2_over = df_TCRI_2.sample(n=count_TCRI_4, replace=True, random_state=2018, )
df_TCRI_3_over = df_TCRI_3.sample(n=count_TCRI_4, replace=True, random_state=2018, )
df_TCRI_4_over = df_TCRI_4.sample(n=count_TCRI_4, replace=True, random_state=2018, )
df_TCRI_5_over = df_TCRI_5.sample(n=count_TCRI_4, replace=True, random_state=2018, )
df_TCRI_6_over = df_TCRI_6.sample(n=count_TCRI_4, replace=True, random_state=2018, )
df_TCRI_7_over = df_TCRI_7.sample(n=count_TCRI_4, replace=True, random_state=2018, )
df_TCRI_8_over = df_TCRI_8.sample(n=count_TCRI_4, replace=True, random_state=2018, )
df_TCRI_9_over = df_TCRI_9.sample(n=count_TCRI_4, replace=True, random_state=2018, )

In [24]:
# 新的複製的data 合併
df_resample = pd.concat([df_TCRI_0_over, df_TCRI_1_over, df_TCRI_2_over, df_TCRI_3_over, df_TCRI_4_over, df_TCRI_5_over, df_TCRI_6_over, df_TCRI_7_over, df_TCRI_8_over, df_TCRI_9_over])
df_resample['TCRI'].value_counts()

9.0    1018
8.0    1018
7.0    1018
6.0    1018
5.0    1018
4.0    1018
3.0    1018
2.0    1018
1.0    1018
0.0    1018
Name: TCRI, dtype: int64

In [25]:
# df_resample['TCRI'].value_counts().plot(kind='bar')

# 4 訓練LinearRegression模型 求coef_

In [26]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, Dropout, Activation, Flatten, InputLayer
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### 沒resample 沒標準化 有c_0 c_1 c_2 c_3 c_4

In [47]:
Y = df['score'].values
X = df.drop(['score', 'TCRI', '公司', '年月'], axis = 1).values

print(X.shape)
print(Y.shape)

(2258, 23)
(2258,)


In [46]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 2.28844318e-02  1.06298644e+01  1.70288770e-01  3.12441942e-01
  1.02897646e-01 -6.88277444e+00  7.38613099e-01  4.99753168e-03
 -3.27501636e-02  1.75883776e-02  7.53962202e-03  6.25391973e-01
 -6.21879084e+00  8.45040940e-01 -2.17218844e+01 -2.17667066e+00
 -1.76267162e+01 -3.96918656e+01 -2.91135976e+01  3.36270736e-01
  1.22814863e+00 -2.20028811e+01 -1.37836596e+01]
CPU times: user 5.86 ms, sys: 373 µs, total: 6.23 ms
Wall time: 2.06 ms


### 沒resample 有標準化 有c_0 c_1 c_2 c_3 c_4

In [29]:
Y = df['score'].values
X = df.drop(['score', 'TCRI', '公司', '年月'], axis = 1).values

# 標準化
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X = minmax_scale.fit_transform(X)

print(X.shape)
print(Y.shape)

(2258, 23)
(2258,)


In [30]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 1.37009780e+02  1.06298644e+01  1.29249177e+01  5.36494058e+01
  1.12680125e+02 -1.54587114e+02  3.36807573e+02  8.60874808e+01
 -1.44624723e+02  2.13470139e+02  2.26188660e-01  1.29524789e+02
 -2.05236324e+02  9.23373048e+01 -1.29622936e+02 -7.37003842e+01
 -1.41013730e+02 -3.17534924e+02 -2.91135976e+02  1.38207273e+02
  1.22814863e+00 -2.20028811e+01 -1.37836596e+01]
CPU times: user 7.97 ms, sys: 864 µs, total: 8.84 ms
Wall time: 1.42 ms


### 有resample 沒標準化 有c_0 c_1 c_2 c_3 c_4

In [31]:
Y = df_resample['score'].values
X = df_resample.drop(['score', 'TCRI', '公司', '年月'], axis = 1).values

print(X.shape)
print(Y.shape)

(10180, 23)
(10180,)


In [32]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 1.11502237e-02 -8.40796146e+00  1.00939492e+00  5.01643534e-01
  8.29877254e-02 -8.02147154e+00  6.52339383e-01  1.05916382e-02
  3.26637418e-03  1.82134347e-02  7.15262655e-01 -1.02988682e-01
 -1.54418152e+01  1.88307867e+00 -2.00349455e+01  1.83070262e+00
 -2.16320306e+00 -7.33439152e+01 -2.32209811e+01  3.86740771e-01
  1.65606357e+01 -1.76852358e+01 -2.60885502e+00]
CPU times: user 19.7 ms, sys: 3.6 ms, total: 23.3 ms
Wall time: 3.92 ms


### 有resample 有標準化 有c_0 c_1 c_2 c_3 c_4

In [33]:
Y = df_resample['score'].values
X = df_resample.drop(['score', 'TCRI', '公司', '年月'], axis = 1).values

# 標準化
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X = minmax_scale.fit_transform(X)

print(X.shape)
print(Y.shape)

(10180, 23)
(10180,)


In [34]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[  66.58199981   -8.40796146   76.61307461   86.13721114   84.97943084
 -180.16225075  297.46675872  182.45155959   14.4243084   221.0564571
   21.45787965  -21.32996246 -509.62019342  205.76329648 -119.5563155
   61.98617428  -17.30562448 -586.75132124 -232.20981126  158.95045676
   16.56063568  -17.68523579   -2.60885502]
CPU times: user 13.5 ms, sys: 754 µs, total: 14.3 ms
Wall time: 2.31 ms


### 沒resample 沒標準化 沒c_0 c_1 c_2 c_3 c_4

In [35]:
Y = df['score'].values
X = df.drop(['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ], axis = 1).values

print(X.shape)
print(Y.shape)

(2258, 18)
(2258,)


In [36]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 2.70170136e-02  1.22036858e+01  2.75892486e-02  3.18932192e-01
  9.08556950e-02 -6.78901129e+00  7.30684945e-01  4.85585436e-03
 -3.40231517e-02  1.70081585e-02  6.78788202e-02 -1.74844630e+01
 -4.02646778e+01 -2.85807934e+01  3.46574387e-01  1.47609731e+00
 -2.20799771e+01 -1.31972614e+01]
CPU times: user 8.36 ms, sys: 0 ns, total: 8.36 ms
Wall time: 1.33 ms


### 沒resample 有標準化 沒c_0 c_1 c_2 c_3 c_4

In [37]:
Y = df['score'].values
X = df.drop(['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ], axis = 1).values

# 標準化
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X = minmax_scale.fit_transform(X)

print(X.shape)
print(Y.shape)

(2258, 18)
(2258,)


In [38]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 161.75167113   12.2036858     2.09402397   54.76384668   99.49334594
 -152.48119366  333.19233472   83.64694725 -150.24623787  206.4280202
    2.03636461 -139.87570431 -322.11742233 -285.8079344   142.44207324
    1.47609731  -22.0799771   -13.19726142]
CPU times: user 7.3 ms, sys: 370 µs, total: 7.67 ms
Wall time: 1.21 ms


### 有resample 沒標準化 沒c_0 c_1 c_2 c_3 c_4

In [39]:
Y = df_resample['score'].values
X = df_resample.drop(['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ], axis = 1).values

print(X.shape)
print(Y.shape)

(10180, 18)
(10180,)


In [40]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 1.47525979e-02 -1.22406761e+01  9.28967768e-01  7.06466824e-01
  7.51550151e-02 -8.11850771e+00  6.62001458e-01  1.01562051e-02
  4.86098072e-03  1.78183453e-02  8.21062001e-01 -8.05504584e-01
 -7.52985855e+01 -2.27241585e+01  4.17611623e-01  1.81349902e+01
 -2.12272724e+01  2.49924141e+00]
CPU times: user 17.4 ms, sys: 0 ns, total: 17.4 ms
Wall time: 2.83 ms


### 有resample 有標準化 沒c_0 c_1 c_2 c_3 c_4

In [41]:
Y = df_resample['score'].values
X = df_resample.drop(['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ], axis = 1).values

# 標準化
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X = minmax_scale.fit_transform(X)

print(X.shape)
print(Y.shape)

(10180, 18)
(10180,)


In [42]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[  88.09307285  -12.24067614   70.50865356  121.30741832   76.95873548
 -182.34168309  301.87266498  174.95078851   21.46609087  216.26125733
   24.63186004   -6.44403668 -602.38868423 -227.24158541  171.63837689
   18.13499015  -21.22727244    2.49924141]
CPU times: user 11.9 ms, sys: 46 µs, total: 12 ms
Wall time: 1.92 ms


In [48]:
df.columns.values

array(['公司', '年月', 'TCRI', 'CFO', 'CFO不一致', '利息支出率', '資產報酬%', '營業利益%',
       '借款息%', '平均收款天增減', '平均售貨天增減', '借款依存度%', '營收', 'TSE產業別', 'c_0',
       'c_1', 'c_2', 'c_3', 'c_4', '初步等級', '門檻等級_規模', 'A1', 'A3', 'P',
       'L1', 'L3', 'score'], dtype=object)

In [49]:
df.drop(['score', 'TCRI', '公司', '年月'], axis = 1).columns.values

array(['CFO', 'CFO不一致', '利息支出率', '資產報酬%', '營業利益%', '借款息%', '平均收款天增減',
       '平均售貨天增減', '借款依存度%', '營收', 'TSE產業別', 'c_0', 'c_1', 'c_2', 'c_3',
       'c_4', '初步等級', '門檻等級_規模', 'A1', 'A3', 'P', 'L1', 'L3'],
      dtype=object)

In [50]:
df.drop(['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ], axis = 1).columns.values

array(['CFO', 'CFO不一致', '利息支出率', '資產報酬%', '營業利益%', '借款息%', '平均收款天增減',
       '平均售貨天增減', '借款依存度%', '營收', 'TSE產業別', '初步等級', '門檻等級_規模', 'A1', 'A3',
       'P', 'L1', 'L3'], dtype=object)