In [None]:
# 本文件用于把影响Release Note结构的潜在因素（代码行数、文件数量、版本间commit数量、贡献者数量），与结构的形式进行逻辑回归
# 原理参考：https://zhuanlan.zhihu.com/p/74874291
# sklearn接口参考：https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regression#sklearn.linear_model.LogisticRegression
# http://www.statsmodels.org/stable/index.html

In [73]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import math
import seaborn as sns
import statsmodels.api as sma

In [2]:
path = "../Domain_W.csv"
data = csv.reader(open(path))
assert(data)

content_type = ["feature", "fix", "enhancement", "api change", "documentation", "dependency", "test"]
ct2index = {"feature": 18, "fix": 19, "enhancement": 20, "api change": 21, "documentation": 22, "dependency": 23, "test": 24}
project_type_index = 3
factor_type = 

content = []
for i in data:
    content.append(i)

head = content[0]
content = content[1:]

In [58]:
def get_xy(content, xfunc, yfunc):
    x = []
    y = []
    for row in content:
        x.append(xfunc(row)) 
        y.append(yfunc(row))
    return x, y

def Xfunc(row):
    return [math.log(int(row[4])), math.log(int(row[5])), math.log(int(row[6]))]

def Yfunc(row):
    if '列表' in row[11]:
        return 1
    return 0

In [76]:
X, Y = get_xy(content, Xfunc, Yfunc) # x: code line, y: if 结构为模块

max_min = []
for i in range(len(X[0])): # 自变量统一到[0, 1]
    maxi = -1
    mini = 1000000000
    for factor in X:
        maxi = max(maxi, factor[i])
        mini = min(mini, factor[i])
    max_min.append((maxi, mini))
    for index, factor in enumerate(X):
        X[index][i] = (factor[i] - mini) / (maxi - mini)
X = np.array(X)
Y = np.array(Y)
X = sma.add_constant(X)
#sns.scatterplot([x[0] for x in X], Y)

log_reg = LogisticRegression(
    penalty = "l2", # L2正则化
    C = 1.0, # 调整损失函数和L2损失的比重，C越大L2损失的重要程度越小（C乘在损失函数前）
    fit_intercept = True, 
    solver = 'liblinear', 
    max_iter = 100, 
    verbose = 1) # 不知道干什么的
log_reg.fit(X, Y)

print(log_reg.coef_)
print(log_reg.classes_)
score = log_reg.score(X, Y)
print(score)

# 还是直接用statsmodels吧，用sklearn没有最终的参数分析
logit_mod = sma.Logit(Y, X)
res = logit_mod.fit()
res.summary()

[LibLinear][[ 0.1344483  -1.07107674 -0.97020928 -0.58084945]]
[0 1]
0.7272727272727273
Optimization terminated successfully.
         Current function value: 0.485355
         Iterations 6


0,1,2,3
Dep. Variable:,y,No. Observations:,55.0
Model:,Logit,Df Residuals:,51.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 19 Jan 2021",Pseudo R-squ.:,0.1717
Time:,16:34:09,Log-Likelihood:,-26.695
converged:,True,LL-Null:,-32.227
,,LLR p-value:,0.01138

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.7892,1.126,1.590,0.112,-0.417,3.995
x1,-3.5319,3.376,-1.046,0.295,-10.148,3.084
x2,-1.1877,3.413,-0.348,0.728,-7.878,5.502
x3,-1.3791,2.303,-0.599,0.549,-5.893,3.134


In [50]:
for i, j in zip(X, Y):
    print(i, j, log_reg.predict([i]))

[106671] 0 [0]
[2668] 1 [0]
[230196] 0 [0]
[325084] 0 [0]
[19518] 1 [0]
[7859] 1 [0]
[352186] 0 [0]
[1315489] 0 [0]
[427476] 0 [0]
[22531] 0 [0]
[13067] 0 [0]
[207570] 0 [0]
[161099] 0 [0]
[390276] 0 [0]
[40743] 0 [0]
[18424] 1 [0]
[162804] 0 [0]
[11727] 0 [0]
[440598] 0 [0]
[214962] 0 [0]
[295028] 0 [0]
[111021] 0 [0]
[738901] 0 [0]
[1574] 0 [0]
[6182] 1 [0]
[6080619] 0 [0]
[2830565] 0 [0]
[5074] 0 [0]
[10591] 1 [0]
[9643] 0 [0]
[14903] 0 [0]
[124931] 1 [0]
[79216] 0 [0]
[673200] 0 [0]
[35372] 1 [0]
[62190] 0 [0]
[1265444] 0 [0]
[198382] 1 [0]
[955042] 0 [0]
[122500] 1 [0]
[142660] 0 [0]
[145514] 0 [0]
[475348] 0 [0]
[73548] 0 [0]
[421859] 0 [0]
[3352] 1 [0]
[129614] 1 [0]
[41800] 1 [0]
[28112] 1 [0]
[22890] 0 [0]
[159946] 0 [0]
[20924] 0 [0]
[55103] 0 [0]
[22417] 1 [0]
[58906] 0 [0]
