In [None]:
import pandas
import jieba
import re

# 数据集

In [None]:
# Training set: read the CSV and drop the columns the author marked as unneeded
train = (
    pandas.read_csv("./train.csv")
    .drop(columns=["id", "userId", "timestamp", "like"])
)
def clean(content):
    """Strip punctuation from a comment and segment it into space-joined tokens.

    Args:
        content: Raw comment text. Missing values (``None``/``NaN``, as pandas
            yields for empty CSV cells) and other non-string scalars are also
            accepted.

    Returns:
        str: The jieba tokens of the punctuation-free text joined with spaces,
        or an empty string when ``content`` is missing.
    """
    if not isinstance(content, str):
        # re.sub raises TypeError on the float NaN used for empty cells, so
        # map missing values to "" and stringify any other scalar.
        if pandas.isna(content):
            return ""
        content = str(content)
    # Remove every character that is neither a word character nor whitespace.
    content = re.sub(r'[^\w\s]', '', content)
    # join consumes the token generator directly; no intermediate list needed.
    return " ".join(jieba.cut(content))
# Store the tokenized comments in a "clean" column, then preview the first rows
train = train.assign(clean=train["comment"].apply(clean))
train.head()

In [None]:
# Inference set: read the CSV and drop the columns the author marked as unneeded
predict = (
    pandas.read_csv("./test.csv")
    .drop(columns=["id", "userId", "timestamp", "like"])
)
def clean(content):
    """Strip punctuation from a comment and segment it into space-joined tokens.

    NOTE(review): a helper named ``clean`` is also defined in the training-set
    cell; consider keeping a single definition so the two cannot drift apart.

    Args:
        content: Raw comment text. Missing values (``None``/``NaN``, as pandas
            yields for empty CSV cells) and other non-string scalars are also
            accepted.

    Returns:
        str: The jieba tokens of the punctuation-free text joined with spaces,
        or an empty string when ``content`` is missing.
    """
    if not isinstance(content, str):
        # re.sub raises TypeError on the float NaN used for empty cells, so
        # map missing values to "" and stringify any other scalar.
        if pandas.isna(content):
            return ""
        content = str(content)
    # Remove every character that is neither a word character nor whitespace.
    content = re.sub(r'[^\w\s]', '', content)
    # join consumes the token generator directly; no intermediate list needed.
    return " ".join(jieba.cut(content))
# Store the tokenized comments in a "clean" column, then preview the first rows
predict = predict.assign(clean=predict["comment"].apply(clean))
predict.head()

# 训练前处理（划分训练集测试集）

In [None]:
# Extract TF-IDF features from the tokenized comments with scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# The default token_pattern (r"(?u)\b\w\w+\b") only keeps tokens of two or more
# characters, which silently discards the one-character Chinese words produced
# by jieba. Accept tokens of length >= 1 instead.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(train['clean'].values)
y = train['rating'].values
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# NOTE(review): the vectorizer is fitted on all rows before the split, so the
# IDF weights also see the held-out rows — consider fitting on the training part only.
# Alternative: train on the complete data set
# X_train, y_train = X, y

# 训练（多种不同模型；每个单元都会覆盖 `model`，验证时使用最后运行的那个）

In [None]:
# Train a random forest classifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
# mean_squared_error 3.7357142857142858 (value noted by the original author)
model

In [None]:
# Train a support vector machine with a linear kernel
from sklearn.svm import SVC
model = SVC(kernel="linear").fit(X_train, y_train)
# mean_squared_error 2.6714285714285713 (value noted by the original author)
model

In [None]:
# Train a decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# Seed the tree like the other estimators in this notebook: scikit-learn
# permutes the candidate features at every split, so an unseeded tree can
# differ from run to run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
# mean_squared_error 3.8285714285714287 (noted by the original author from an
# unseeded run; the seeded value may differ)

In [None]:
# Train a multi-layer perceptron with three hidden layers of 100 units each
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    max_iter=100,
    random_state=42,
).fit(X_train, y_train)
# mean_squared_error 2.407142857142857 (value noted by the original author)
model

In [None]:
# Train a multinomial naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train, y_train)
# mean_squared_error 2.5642857142857145 (value noted by the original author)
model

# 验证推理结果

In [None]:
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Predict on the held-out split with whichever estimator `model` currently holds
y_pred = model.predict(X_test)
print(model)
# Accuracy: fraction of ratings predicted exactly
print("accuracy_score", accuracy_score(y_test, y_pred))
# R2 score: 1 is a perfect fit; it can drop below 0 for poor predictions
print("r2_score", r2_score(y_test, y_pred))
# Mean squared error between the true and predicted ratings
print("mean_squared_error", mean_squared_error(y_test, y_pred))