In [34]:
from pandas import read_csv


# Read the training CSV (path is relative to the notebook's working directory).
train_csv_path = 'data/train_data.csv'
train_dataset = read_csv(train_csv_path)

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score,f1_score,accuracy_score
import lightgbm as lgb
from lightgbm.callback import early_stopping, log_evaluation, record_evaluation

In [36]:
# Replace every newline inside the 'text' column with a single space.
text_without_newlines = train_dataset['text'].str.replace('\n', ' ')
train_dataset['text'] = text_without_newlines

In [37]:
# Build a class-balanced training frame by down-sampling the larger class.
# The newline replacement on `train_dataset.text` is performed by the previous
# cell, so the duplicate in-place mutation that used to open this cell is gone.

# Split the rows by label value.
class_0 = train_dataset[train_dataset['label'] == 0]
class_1 = train_dataset[train_dataset['label'] == 1]

# Number of samples in each class.
num_class_0 = len(class_0)
num_class_1 = len(class_1)

# Size of the smaller class; the larger class is reduced to this size.
min_num = min(num_class_0, num_class_1)

# Randomly sample the larger class down to `min_num` rows.
# random_state is fixed so the selection is reproducible.
if num_class_0 > min_num:
    class_0 = class_0.sample(n=min_num, random_state=42)
elif num_class_1 > min_num:
    class_1 = class_1.sample(n=min_num, random_state=42)

# Merge both classes, shuffle the row order, and rebuild a fresh 0..n-1 index.
balanced_train_dataset = (
    pd.concat([class_0, class_1])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview of the source frame (not the balanced one).
train_dataset.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,"Hey there! So, you wanna know how people can ...",1
1,1,Do you think the seagoing cowboys is a good jo...,0
2,2,The position of the principal is good because ...,0
3,3,the aurthor suggestion was a very convensing a...,0
4,4,Many people believe that the 'Face on Mars' wa...,0


In [38]:
# Row count per label in the balanced frame.
balanced_train_dataset['label'].value_counts()

label
1    17500
0    17500
Name: count, dtype: int64

In [39]:
# TF-IDF features restricted to 3- to 5-grams, with min_df=5 and max_df=0.8
# as document-frequency cut-offs.
# (A max_features=10000 cap is left disabled.)
tfidf = TfidfVectorizer(min_df=5, max_df=0.8, ngram_range=(3, 5))

# Logistic regression is the active classifier; MultinomialNB() is the
# alternative that is currently switched off.
classifier = LogisticRegression(max_iter=1200)

pipe = make_pipeline(tfidf, classifier)

# Single-candidate grid. The key prefixes are the step names make_pipeline
# derives from the estimator class names.
# Disabled alternatives: multinomialnb__alpha in [0.1, 1, 10] and
# tfidfvectorizer__ngram_range in [(3, 5), (5, 5)].
param_grid = dict(
    logisticregression__C=[100],
    tfidfvectorizer__norm=['l2'],
)

# 5-fold cross-validated search over the grid, logging each fold (verbose=3).
grid = GridSearchCV(pipe, param_grid, cv=5, verbose=3)

train_texts = balanced_train_dataset['text']
train_labels = balanced_train_dataset['label']
grid.fit(train_texts, train_labels)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END logisticregression__C=100, tfidfvectorizer__norm=l2;, score=0.994 total time= 2.0min
[CV 2/5] END logisticregression__C=100, tfidfvectorizer__norm=l2;, score=0.996 total time= 2.5min
[CV 3/5] END logisticregression__C=100, tfidfvectorizer__norm=l2;, score=0.996 total time= 2.0min
[CV 4/5] END logisticregression__C=100, tfidfvectorizer__norm=l2;, score=0.996 total time= 2.1min
[CV 5/5] END logisticregression__C=100, tfidfvectorizer__norm=l2;, score=0.994 total time= 2.2min


In [40]:
# Report the grid-search result and keep a handle on the winning pipeline.
cv_score = grid.best_score_
cv_params = grid.best_params_
print(f'best cross-val-score: {cv_score}')
print(f'best params:\n{cv_params}')
best_model = grid.best_estimator_


best cross-val-score: 0.9950571428571428
best params:
{'logisticregression__C': 100, 'tfidfvectorizer__norm': 'l2'}


In [46]:
# Evaluate the tuned pipeline on a labelled test CSV and report F1 / AUC / accuracy.
# NOTE(review): this path is absolute and spelled 'teset_data.csv', unlike the
# relative 'data/train_data.csv' used for training -- confirm this is intended.
test = read_csv('/data/teset_data.csv', sep=',')

# Look the fitted pipeline up once instead of on every call.
best_model = grid.best_estimator_

# Hard class predictions, stored next to the inputs.
test['generated'] = best_model.predict(test['text'])

# The whole frame (inputs, labels and predictions) is written out.
# Alternative that keeps only the id and prediction columns:
# test[["id", "generated"]].to_csv("submission.csv", index=False)
test.to_csv('submission.csv', index=False)

# Probability of the positive class (label 1). The column is located through
# `classes_` rather than assumed to sit at index 1 of predict_proba's output.
positive_column = list(best_model.classes_).index(1)
probability_predictions = best_model.predict_proba(test['text'])[:, positive_column]

# Metrics against the ground-truth 'label' column.
f1 = f1_score(test['label'], test['generated'])
auc = roc_auc_score(test['label'], probability_predictions)
accuracy = accuracy_score(test['label'], test['generated'])

# Print out the best cross-validation score, best parameters, F1 score, and AUC
print(f'Best cross-validation score: {grid.best_score_}')
print(f'Best parameters:\n{grid.best_params_}')
print(f'F1 score: {f1}')
print(f'AUC value: {auc}')
print(f'accuracy score:{accuracy}')

Best cross-validation score: 0.9950571428571428
Best parameters:
{'logisticregression__C': 100, 'tfidfvectorizer__norm': 'l2'}
F1 score: 0.9038062283737024
AUC value: 0.967595501874219
accuracy score:0.9007142857142857


In [None]:
# Evaluate the tuned pipeline on a second CSV. The file's existing 'generated'
# column is used as the reference labels, exactly as the accuracy computation
# below has always done.
# NOTE(review): this overwrites both `test` and 'submission.csv' from the
# previous evaluation cell -- confirm that is intended.
test = read_csv('../input/567testdata/Mistral7B_CME_v7_15_percent_corruption.csv', sep=',')

best_model = grid.best_estimator_
test['generated_new'] = best_model.predict(test['text'])
# test[["id", "generated"]].to_csv("submission.csv", index=False)
test.to_csv('submission.csv', index=False)
accuracy = accuracy_score(test['generated'], test['generated_new'])
print(f'Best accuracy score:{accuracy}')

# F1 and AUC are computed for THIS dataset. Previously the values printed here
# were leftovers from the preceding cell, i.e. metrics of a different test set.
f1 = f1_score(test['generated'], test['generated_new'])
if test['generated'].nunique() > 1:
    positive_column = list(best_model.classes_).index(1)
    probability_predictions = best_model.predict_proba(test['text'])[:, positive_column]
    auc = roc_auc_score(test['generated'], probability_predictions)
else:
    # roc_auc_score raises when the reference labels contain a single class.
    auc = float('nan')

# Print out the best cross-validation score, best parameters, F1 score, and AUC
print(f'Best cross-validation score: {grid.best_score_}')
print(f'Best parameters:\n{grid.best_params_}')
print(f'F1 score: {f1}')
print(f'AUC value: {auc}')

In [None]:
# Classify one ad-hoc piece of text with the tuned pipeline.
best_model = grid.best_estimator_

# New string to predict
new_string = '''Incorrect Evaluation: The method of evaluation might be incorrect. For example, there might be an issue with how the true labels or predictions are being handled or interpreted in the computation of the metrics.'''

# The raw string goes straight into the pipeline, wrapped in a one-element
# list because predict() expects an iterable of documents.
prediction = best_model.predict([new_string])

# Label 1 is reported as "generated"; anything else as "human written".
if prediction[0] == 1:
    res = "generated"
else:
    res = "human written"
print(f"The predicted class of the new string is: {res}")

In [None]:
from joblib import dump

# Serialise the winning pipeline to disk.
model_path = 'RegressionModel.joblib'
best_model = grid.best_estimator_
dump(best_model, model_path)