# 2 Feature Engineering

## 2-1 根據基本總分產生的初步TCRI等級

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [2]:
# Load the TCRI dataset, then cast the five c_* columns and the TCRI rating to float.
float_columns = ["c_0", "c_1", "c_2", "c_3", "c_4", "TCRI"]
df = pd.read_csv("data/TCRI_2008_2017_prob.csv", encoding="utf-8").astype(
    {column: float for column in float_columns}
)
print(df.shape)
df.tail(2)

(2288, 20)


Unnamed: 0,公司,年月,TCRI,總分,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,平均收款天增減,平均售貨天增減,借款依存度%,營收,TSE產業別,c_0,c_1,c_2,c_3,c_4
2286,9943 好樂迪,2017/12/29,5.0,711.0,5.2,0,0.0,15.93,17.41,6.53,1.0,0.0,0.0,29.0,16,0.0,0.0,0.0,0.0,0.0
2287,9946 三發地產,2017/12/29,7.0,243.0,-8.71,1,7.49,3.83,20.42,2.51,9.0,1437.0,102.0,17.0,14,1.0,1.4331459999999998e-20,1.467837e-13,2.940595e-25,2.578725e-17


In [3]:
def transformBasicTCRILevel(value):
    """Map a basic total score to a preliminary TCRI level.

    The score is compared against ascending, exclusive upper bounds
    (165, 250, ..., 760, in steps of 85). Scores below 165 give level 9;
    each higher band gives the next lower level number.

    Args:
        value: Numeric total score.

    Returns:
        int: Level from 9 (lowest score band) to 1 (score of 760 or more).
        A value that is not below any bound (including NaN, for which every
        comparison is False) also yields 1.
    """
    # (exclusive upper bound of the band, level assigned to the band)
    score_bands = (
        (165.0, 9),
        (250.0, 8),
        (335.0, 7),
        (420.0, 6),
        (505.0, 5),
        (590.0, 4),
        (675.0, 3),
        (760.0, 2),
    )
    for upper_bound, band_level in score_bands:
        if value < upper_bound:
            return band_level
    return 1

In [4]:
# Preliminary TCRI level for every row, derived from the total-score column.
basicTCRI = [transformBasicTCRILevel(total_score) for total_score in df['總分']]

In [5]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error between the score-derived preliminary levels
# and the TCRI column of the dataset.
mse = mean_squared_error(basicTCRI, df['TCRI'])
rmse = sqrt(mse)
print(rmse)

2.3138757483972596


In [6]:
# Add the preliminary level as a new column. The total-score column is kept,
# because the drop that would replace it is commented out below.
# df.drop(['總分'], axis=1, inplace=True)
df['初步等級'] = basicTCRI
df.head()

Unnamed: 0,公司,年月,TCRI,總分,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,...,平均售貨天增減,借款依存度%,營收,TSE產業別,c_0,c_1,c_2,c_3,c_4,初步等級
0,1101 台泥,2008/12/31,3.0,622.0,80.43,0,3.51,2.93,6.64,3.72,...,-2.0,88.0,785.0,1,0.0,0.0,0.0,0.0,0.0,3
1,1201 味全,2008/12/31,5.0,621.0,10.75,0,0.54,4.62,4.04,3.27,...,1.0,63.0,202.0,2,0.0,0.0,0.0,0.0,0.0,3
2,1210 大成,2008/12/31,4.0,719.0,23.25,0,0.42,5.93,1.96,4.42,...,-9.0,43.0,712.0,2,0.0,0.0,0.0,0.0,0.0,2
3,1215 卜蜂,2008/12/31,4.0,582.0,1.77,0,0.27,5.49,2.37,3.73,...,-1.0,38.0,154.0,2,0.0,0.0,0.0,0.0,0.0,4
4,1216 統一,2008/12/31,4.0,618.0,178.78,0,1.38,4.02,4.95,3.56,...,1.0,125.0,2997.0,2,0.0,0.0,0.0,0.0,0.0,3


## 2-2 門檻等級：規模

In [7]:
# Size gate: a company whose revenue is under a floor cannot hold a level better
# than the rule allows. Rules run in order, each on the level left by the previous one.
# Per the original notes, the floors are in units of 100 million (億), industry code 28
# is the electronic-components industry, and it gets stricter floors on the first three
# rules because it is more exposed to the business cycle (unless a company owns key
# technology, a niche market or an exclusive business).
# Each rule: (floor for other industries, floor for industry 28,
#             best level that gets blocked, level assigned when blocked)
scale_rules = (
    (5, 10, 5, 6),      # revenue under 5 (electronics 10): not within the top 5 levels
    (10, 20, 4, 5),     # revenue under 10 (electronics 20): not within the top 4 levels
    (30, 40, 3, 4),     # revenue under 30 (electronics 40): not within the top 3 levels
    (100, 100, 2, 3),   # revenue under 100: not within the top 2 levels
    (200, 200, 1, 2),   # revenue under 200: not level 1
)
electronics_code = 28

thresholdLevel = []
for rev, industry, level in zip(df['營收'], df['TSE產業別'], df['初步等級']):
    for base_floor, electronics_floor, blocked_level, demoted_level in scale_rules:
        threshold = electronics_floor if industry == electronics_code else base_floor
        if rev < threshold and level <= blocked_level:
            level = demoted_level
    thresholdLevel.append(level)

df['門檻等級_規模'] = thresholdLevel
df.head()

Unnamed: 0,公司,年月,TCRI,總分,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,...,借款依存度%,營收,TSE產業別,c_0,c_1,c_2,c_3,c_4,初步等級,門檻等級_規模
0,1101 台泥,2008/12/31,3.0,622.0,80.43,0,3.51,2.93,6.64,3.72,...,88.0,785.0,1,0.0,0.0,0.0,0.0,0.0,3,3
1,1201 味全,2008/12/31,5.0,621.0,10.75,0,0.54,4.62,4.04,3.27,...,63.0,202.0,2,0.0,0.0,0.0,0.0,0.0,3,3
2,1210 大成,2008/12/31,4.0,719.0,23.25,0,0.42,5.93,1.96,4.42,...,43.0,712.0,2,0.0,0.0,0.0,0.0,0.0,2,2
3,1215 卜蜂,2008/12/31,4.0,582.0,1.77,0,0.27,5.49,2.37,3.73,...,38.0,154.0,2,0.0,0.0,0.0,0.0,0.0,4,4
4,1216 統一,2008/12/31,4.0,618.0,178.78,0,1.38,4.02,4.95,3.56,...,125.0,2997.0,2,0.0,0.0,0.0,0.0,0.0,3,3


## 2-2 APL 財務壓力計

### 2-2-1 Assets Management

In [8]:
# # Check for outliers (disabled; note that `plt` is not imported in the cells shown here)
# plt.subplot(1, 2, 1)
# plt.boxplot(df['平均收款天增減'])
# plt.subplot(1, 2, 2)
# plt.boxplot(df['平均售貨天增減'])
# plt.tight_layout()

In [9]:
from scipy import stats

# Row labels whose z-score has an absolute value above 3, computed separately
# for each of the two "change in days" columns.
collection_abs_z = np.abs(stats.zscore(df['平均收款天增減']))
sales_abs_z = np.abs(stats.zscore(df['平均售貨天增減']))
index_temp_1 = df.index[collection_abs_z > 3]
index_temp_2 = df.index[sales_abs_z > 3]
# display(index_temp_1)
# display(index_temp_2)

# Union of two label collections, so that no label appears twice.
def find_union(lst1, lst2):
    """Return the distinct elements found in either input as a list.

    Args:
        lst1: First iterable of hashable items.
        lst2: Second iterable of hashable items.

    Returns:
        list: Each distinct element once; the order is whatever the set yields.
    """
    combined = set(lst1) | set(lst2)
    return list(combined)

# Drop every row flagged as an outlier in either column.
index_to_drop = find_union(index_temp_1, index_temp_2)

# `index=` selects row labels on its own; no `axis` argument belongs next to it.
# The frame is modified in place and its index is not reset afterwards.
df.drop(index=index_to_drop, inplace=True)
df.head(2)

Unnamed: 0,公司,年月,TCRI,總分,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,...,借款依存度%,營收,TSE產業別,c_0,c_1,c_2,c_3,c_4,初步等級,門檻等級_規模
0,1101 台泥,2008/12/31,3.0,622.0,80.43,0,3.51,2.93,6.64,3.72,...,88.0,785.0,1,0.0,0.0,0.0,0.0,0.0,3,3
1,1201 味全,2008/12/31,5.0,621.0,10.75,0,0.54,4.62,4.04,3.27,...,63.0,202.0,2,0.0,0.0,0.0,0.0,0.0,3,3


In [10]:
# plt.subplot(1, 2, 1)
# plt.boxplot(df['平均收款天增減'])
# plt.subplot(1, 2, 2)
# plt.boxplot(df['平均售貨天增減'])
# plt.tight_layout()

In [11]:
# A1: average collection days slowed by >= 20 days versus the previous period.
# The value is the number of whole 20-day multiples (e.g. 45 -> 2); below 20 it is 0.
# df[df['平均收款天增減'] >= 20]
A1 = [int(days / 20) if days >= 20 else 0 for days in df['平均收款天增減']]
df['A1'] = A1

In [12]:
# A3: average days of sales slowed by >= 20 days versus the previous period.
# The value is the number of whole 20-day multiples; below 20 it is 0.
A3 = [int(days / 20) if days >= 20 else 0 for days in df['平均售貨天增減']]
df['A3'] = A3

### 2-2-2 Profitability

In [13]:
# P1: the period's return on total assets (ROA) does not cover the borrowing rate.
# df[df['資產報酬%'] - df['借款息%'] <= 0]
P1 = [
    bool(roa - borrowing_rate <= 0)
    for roa, borrowing_rate in zip(df['資產報酬%'], df['借款息%'])
]

In [14]:
# P2: the period's operating margin (OP%) does not cover the interest expense rate (INT%).
# df[df['營業利益%'] - df['利息支出率'] <= 0]
P2 = [
    bool(operating_margin - interest_rate <= 0)
    for operating_margin, interest_rate in zip(df['營業利益%'], df['利息支出率'])
]

In [15]:
'''
P發生的case
(1) P1且P2
(2) 連續3期P1（先略過）
'''
# P fires in case (1): P1 and P2 both hold for the row.
# Case (2), three consecutive periods of P1, is skipped for now.
P_case1 = [int(P1[i] and P2[i]) for i in range(len(df))]
df['P'] = P_case1
df.head()

Unnamed: 0,公司,年月,TCRI,總分,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,...,c_0,c_1,c_2,c_3,c_4,初步等級,門檻等級_規模,A1,A3,P
0,1101 台泥,2008/12/31,3.0,622.0,80.43,0,3.51,2.93,6.64,3.72,...,0.0,0.0,0.0,0.0,0.0,3,3,0,0,0
1,1201 味全,2008/12/31,5.0,621.0,10.75,0,0.54,4.62,4.04,3.27,...,0.0,0.0,0.0,0.0,0.0,3,3,0,0,0
2,1210 大成,2008/12/31,4.0,719.0,23.25,0,0.42,5.93,1.96,4.42,...,0.0,0.0,0.0,0.0,0.0,2,2,0,0,0
3,1215 卜蜂,2008/12/31,4.0,582.0,1.77,0,0.27,5.49,2.37,3.73,...,0.0,0.0,0.0,0.0,0.0,4,4,0,0,0
4,1216 統一,2008/12/31,4.0,618.0,178.78,0,1.38,4.02,4.95,3.56,...,0.0,0.0,0.0,0.0,0.0,3,3,0,0,0


### 2-2-3 Liquidity

In [16]:
# L1: borrowing dependency (DE%) above 100%.
# df[df['借款依存度%'] > 100]
L1 = [1 if dependency > 100 else 0 for dependency in df['借款依存度%']]
df['L1'] = L1

In [17]:
# L3: CFO is negative, with a funding shortfall below -0.25 (in 億, per the original note).
# The threshold is negative: only rows with CFO under -0.25 are flagged, which matches
# the reference filter on the next line.
# df[df['CFO'] < -0.25]
L3 = [1 if cfo < -0.25 else 0 for cfo in df['CFO']]
df['L3'] = L3

### 把TCRI等級量化

In [18]:
# Quantify the TCRI rating as a numeric score. Ratings missing from the mapping keep 0.0.
# Rating 0 gets the same score as rating 9.
# NOTE(review): presumably rating 0 marks a defaulted / unrated company - confirm.
score_by_rating = {
    0.0: 110.0,
    9.0: 110.0,
    8.0: 215.0,
    7.0: 300.0,
    6.0: 385.0,
    5.0: 470.0,
    4.0: 555.0,
    3.0: 640.0,
    2.0: 725.0,
    1.0: 810.0,
}
df["score"] = 0.0
for rating, rating_score in score_by_rating.items():
    df.loc[df['TCRI'] == rating, 'score'] = rating_score

### 把初步等級量化

In [19]:
# df["score_first"] = 0.0
# df.loc[df['初步等級'] ==  0.0, 'score_first'] = -50.0
# df.loc[df['初步等級'] ==  9.0, 'score_first'] = 50.0
# df.loc[df['初步等級'] ==  8.0, 'score_first'] = 165.0
# df.loc[df['初步等級'] ==  7.0, 'score_first'] = 300.0
# df.loc[df['初步等級'] ==  6.0, 'score_first'] = 400.0
# df.loc[df['初步等級'] ==  5.0, 'score_first'] = 500.0
# df.loc[df['初步等級'] ==  4.0, 'score_first'] = 600.0
# df.loc[df['初步等級'] ==  3.0, 'score_first'] = 700.0
# df.loc[df['初步等級'] ==  2.0, 'score_first'] = 800.0
# df.loc[df['初步等級'] ==  1.0, 'score_first'] = 900.0

In [20]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,公司,年月,TCRI,總分,CFO,CFO不一致,利息支出率,資產報酬%,營業利益%,借款息%,...,c_3,c_4,初步等級,門檻等級_規模,A1,A3,P,L1,L3,score
0,1101 台泥,2008/12/31,3.0,622.0,80.43,0,3.51,2.93,6.64,3.72,...,0.0,0.0,3,3,0,0,0,0,0,640.0
1,1201 味全,2008/12/31,5.0,621.0,10.75,0,0.54,4.62,4.04,3.27,...,0.0,0.0,3,3,0,0,0,0,0,470.0
2,1210 大成,2008/12/31,4.0,719.0,23.25,0,0.42,5.93,1.96,4.42,...,0.0,0.0,2,2,0,0,0,0,0,555.0
3,1215 卜蜂,2008/12/31,4.0,582.0,1.77,0,0.27,5.49,2.37,3.73,...,0.0,0.0,4,4,0,0,0,0,0,555.0
4,1216 統一,2008/12/31,4.0,618.0,178.78,0,1.38,4.02,4.95,3.56,...,0.0,0.0,3,3,0,0,0,1,0,555.0


### 去掉跟regression無關的欄位

In [21]:
# Remove the columns that are not used as regression inputs:
# the total score, the derived levels and the A/P/L pressure flags.
non_regression_columns = ['總分','CFO不一致','TSE產業別','初步等級','門檻等級_規模', 'A1','A3','P','L1','L3']
df = df.drop(columns=non_regression_columns)
print(df.shape)
df.head()

(2258, 18)


Unnamed: 0,公司,年月,TCRI,CFO,利息支出率,資產報酬%,營業利益%,借款息%,平均收款天增減,平均售貨天增減,借款依存度%,營收,c_0,c_1,c_2,c_3,c_4,score
0,1101 台泥,2008/12/31,3.0,80.43,3.51,2.93,6.64,3.72,-2.0,-2.0,88.0,785.0,0.0,0.0,0.0,0.0,0.0,640.0
1,1201 味全,2008/12/31,5.0,10.75,0.54,4.62,4.04,3.27,-2.0,1.0,63.0,202.0,0.0,0.0,0.0,0.0,0.0,470.0
2,1210 大成,2008/12/31,4.0,23.25,0.42,5.93,1.96,4.42,-5.0,-9.0,43.0,712.0,0.0,0.0,0.0,0.0,0.0,555.0
3,1215 卜蜂,2008/12/31,4.0,1.77,0.27,5.49,2.37,3.73,0.0,-1.0,38.0,154.0,0.0,0.0,0.0,0.0,0.0,555.0
4,1216 統一,2008/12/31,4.0,178.78,1.38,4.02,4.95,3.56,-1.0,1.0,125.0,2997.0,0.0,0.0,0.0,0.0,0.0,555.0


# 3 訓練前的準備

## 3-1 Resample imbalanced labels
https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets

In [22]:
# Number of rows per TCRI rating.
df['TCRI'].value_counts()

4.0    509
5.0    440
6.0    398
7.0    270
3.0    206
8.0    168
9.0    102
0.0     80
2.0     45
1.0     40
Name: TCRI, dtype: int64

In [23]:
# df['TCRI'].value_counts().plot(kind='bar')

In [24]:
# Target size for every class after oversampling: twice the number of rating-4 rows.
count_TCRI_4 = len(df[df['TCRI'] == 4]) * 2


# One sub-frame per TCRI rating, 0 through 9.
(
    df_TCRI_0, df_TCRI_1, df_TCRI_2, df_TCRI_3, df_TCRI_4,
    df_TCRI_5, df_TCRI_6, df_TCRI_7, df_TCRI_8, df_TCRI_9,
) = [df[df['TCRI'] == rating] for rating in range(10)]

In [25]:
# Resample every rating with replacement to count_TCRI_4 rows (twice the rating-4 count),
# using the same fixed seed for each rating.
(
    df_TCRI_0_over, df_TCRI_1_over, df_TCRI_2_over, df_TCRI_3_over, df_TCRI_4_over,
    df_TCRI_5_over, df_TCRI_6_over, df_TCRI_7_over, df_TCRI_8_over, df_TCRI_9_over,
) = [
    rating_frame.sample(n=count_TCRI_4, replace=True, random_state=2018)
    for rating_frame in (
        df_TCRI_0, df_TCRI_1, df_TCRI_2, df_TCRI_3, df_TCRI_4,
        df_TCRI_5, df_TCRI_6, df_TCRI_7, df_TCRI_8, df_TCRI_9,
    )
]

In [26]:
# Stack the resampled per-rating frames into one frame and check the class counts.
oversampled_frames = [
    df_TCRI_0_over, df_TCRI_1_over, df_TCRI_2_over, df_TCRI_3_over, df_TCRI_4_over,
    df_TCRI_5_over, df_TCRI_6_over, df_TCRI_7_over, df_TCRI_8_over, df_TCRI_9_over,
]
df_resample = pd.concat(oversampled_frames)
df_resample['TCRI'].value_counts()

9.0    1018
8.0    1018
7.0    1018
6.0    1018
5.0    1018
4.0    1018
3.0    1018
2.0    1018
1.0    1018
0.0    1018
Name: TCRI, dtype: int64

In [27]:
# df_resample['TCRI'].value_counts().plot(kind='bar')

# 4 訓練LinearRegression模型 求coef_

In [28]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, Dropout, Activation, Flatten, InputLayer
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### 沒resample 沒標準化 有c_0 c_1 c_2 c_3 c_4

In [29]:
# Original rows, unscaled, c_0..c_4 included.
# Features are every column except the target, the rating and the two identifier columns.
feature_frame = df.drop(columns=['score', 'TCRI', '公司', '年月'])
X = feature_frame.values
Y = df['score'].values

print(X.shape)
print(Y.shape)

(2258, 14)
(2258,)


In [30]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 1.72351194e-02 -7.65526790e+00  8.55865506e+00 -1.70242655e-01
 -8.14389690e+00  3.04416790e-01  5.59061301e-03 -1.14394378e-01
  3.79412068e-02 -1.16454615e+00 -8.06993455e+00 -3.37822130e-01
 -2.37317717e+01 -2.53353433e+00]
CPU times: user 9.91 ms, sys: 556 µs, total: 10.5 ms
Wall time: 23.1 ms


### 沒resample 有標準化 有c_0 c_1 c_2 c_3 c_4

In [31]:
# Original rows, min-max scaled, c_0..c_4 included.
feature_frame = df.drop(columns=['score', 'TCRI', '公司', '年月'])
Y = df['score'].values

# Rescale every feature column to the [0, 1] range.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X = minmax_scale.fit_transform(feature_frame.values)

print(X.shape)
print(Y.shape)

(2258, 14)
(2258,)


In [32]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 103.18717673 -581.03483396 1469.60666042 -186.42762448 -182.9119244
  138.81405612   96.30389975 -505.16557246  460.49242722 -241.1888871
 -266.3288965   -36.91369668 -141.6167159   -85.78351192]
CPU times: user 4.5 ms, sys: 2.28 ms, total: 6.78 ms
Wall time: 1.08 ms


### 有resample 沒標準化 有c_0 c_1 c_2 c_3 c_4

In [33]:
# Resampled rows, unscaled, c_0..c_4 included.
feature_frame = df_resample.drop(columns=['score', 'TCRI', '公司', '年月'])
X = feature_frame.values
Y = df_resample['score'].values

print(X.shape)
print(Y.shape)

(10180, 14)
(10180,)


In [34]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[-1.21659837e-02 -7.41564050e+00  1.02015829e+01 -3.26312554e-01
 -1.77589083e+01  2.84423453e-01  9.50423180e-03 -7.50467428e-02
  4.96018258e-02 -4.42785900e-01 -1.72896130e+01 -1.82194250e+00
 -2.38686988e+01 -3.69018683e-01]
CPU times: user 13 ms, sys: 1.18 ms, total: 14.2 ms
Wall time: 2.31 ms


### 有resample 有標準化 有c_0 c_1 c_2 c_3 c_4

In [35]:
# Resampled rows, min-max scaled, c_0..c_4 included.
feature_frame = df_resample.drop(columns=['score', 'TCRI', '公司', '年月'])
Y = df_resample['score'].values

# Rescale every feature column to the [0, 1] range.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X = minmax_scale.fit_transform(feature_frame.values)

print(X.shape)
print(Y.shape)

(10180, 14)
(10180,)


In [36]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ -72.6474686  -562.84711427 1751.71380385 -334.14405554 -398.86508092
  129.69709441  163.71989694 -331.40641629  602.01735966  -91.70528676
 -570.60234046 -199.08297012 -142.43381294  -12.49468708]
CPU times: user 11.6 ms, sys: 0 ns, total: 11.6 ms
Wall time: 1.88 ms


### 沒resample 沒標準化 沒c_0 c_1 c_2 c_3 c_4

In [37]:
# Original rows, unscaled, c_0..c_4 excluded.
excluded_columns = ['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ]
feature_frame = df.drop(columns=excluded_columns)
X = feature_frame.values
Y = df['score'].values

print(X.shape)
print(Y.shape)

(2258, 9)
(2258,)


In [38]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 2.13400183e-02 -7.82950523e+00  8.63288564e+00 -1.84980946e-01
 -7.94946710e+00  3.05559503e-01  5.66512382e-03 -1.16388835e-01
  3.79122196e-02]
CPU times: user 5.57 ms, sys: 376 µs, total: 5.95 ms
Wall time: 955 µs


### 沒resample 有標準化 沒c_0 c_1 c_2 c_3 c_4

In [39]:
# Original rows, min-max scaled, c_0..c_4 excluded.
excluded_columns = ['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ]
feature_frame = df.drop(columns=excluded_columns)
Y = df['score'].values

# Rescale every feature column to the [0, 1] range.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X = minmax_scale.fit_transform(feature_frame.values)

print(X.shape)
print(Y.shape)

(2258, 9)
(2258,)


In [40]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ 127.76332965 -594.25944662 1482.35279242 -202.56708494 -178.54503109
  139.33513337   97.58742294 -513.97309344  460.14060975]
CPU times: user 6.82 ms, sys: 0 ns, total: 6.82 ms
Wall time: 1.1 ms


### 有resample 沒標準化 沒c_0 c_1 c_2 c_3 c_4

In [41]:
# Resampled rows, unscaled, c_0..c_4 excluded.
excluded_columns = ['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ]
feature_frame = df_resample.drop(columns=excluded_columns)
X = feature_frame.values
Y = df_resample['score'].values

print(X.shape)
print(Y.shape)

(10180, 9)
(10180,)


In [42]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[-5.87243467e-03 -7.60302292e+00  1.04254856e+01 -3.48962447e-01
 -1.80289482e+01  3.07994207e-01  9.49480161e-03 -7.59539276e-02
  4.91561183e-02]
CPU times: user 8.87 ms, sys: 0 ns, total: 8.87 ms
Wall time: 1.43 ms


### 有resample 有標準化 沒c_0 c_1 c_2 c_3 c_4

In [43]:
# Resampled rows, min-max scaled, c_0..c_4 excluded.
excluded_columns = ['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ]
feature_frame = df_resample.drop(columns=excluded_columns)
Y = df_resample['score'].values

# Rescale every feature column to the [0, 1] range.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X = minmax_scale.fit_transform(feature_frame.values)

print(X.shape)
print(Y.shape)

(10180, 9)
(10180,)


In [44]:
%%time

from sklearn.linear_model import LinearRegression

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X, Y)

print(model_LinearRegression.coef_)

[ -35.06642147 -577.06943928 1790.16013144 -357.3375459  -404.93017736
  140.44535838  163.5574525  -335.41254421  596.60780771]
CPU times: user 8.66 ms, sys: 64 µs, total: 8.73 ms
Wall time: 1.4 ms


In [45]:
# All column names currently in the frame.
df.columns.values

array(['公司', '年月', 'TCRI', 'CFO', '利息支出率', '資產報酬%', '營業利益%', '借款息%',
       '平均收款天增減', '平均售貨天增減', '借款依存度%', '營收', 'c_0', 'c_1', 'c_2', 'c_3',
       'c_4', 'score'], dtype=object)

In [46]:
# Feature names, in column order, for the runs that keep c_0..c_4
# (same drop list as used to build X in those runs).
df.drop(['score', 'TCRI', '公司', '年月'], axis = 1).columns.values

array(['CFO', '利息支出率', '資產報酬%', '營業利益%', '借款息%', '平均收款天增減', '平均售貨天增減',
       '借款依存度%', '營收', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4'], dtype=object)

In [47]:
# Feature names, in column order, for the runs that exclude c_0..c_4
# (same drop list as used to build X in those runs).
df.drop(['score', 'TCRI', '公司', '年月', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4' ], axis = 1).columns.values

array(['CFO', '利息支出率', '資產報酬%', '營業利益%', '借款息%', '平均收款天增減', '平均售貨天增減',
       '借款依存度%', '營收'], dtype=object)