In [13]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

SEED = 4
data1 = pd.read_csv("../data_after_cleaning.csv")

X_without_noise = data1.drop(columns=['type'])
y_without_noise = data1['type']

X_train_without_noise, X_test_without_noise, y_train_without_noise, y_test_without_noise = train_test_split(X_without_noise, y_without_noise, test_size=0.4, random_state=SEED)



In [14]:
from sklearn.linear_model import Lasso
from SALib.sample import saltelli
from SALib.analyze import sobol
from SALib.test_functions import Ishigami
import math

# lasso 回归 模型

# 模型参数
ALPHA = 0.0125

# 学习模型
lasso = Lasso(alpha=ALPHA)
lasso.fit(X_train_without_noise, y_train_without_noise)

def classification_func(X):
    return lasso.predict(X)>=1/2

parms = {
    'num_vars' : 8,
    'names' : ['SiO', 'Na2O', 'K2O', 'CaO', 'Al2O3', 'Fe2O3', 'CuO', 'P2O5'],
    'bounds' : [[0,100]] * 8
}

param_values = saltelli.sample(parms, 2**10)
Y = np.zeros([param_values.shape[0]])

for i, X in enumerate(param_values):
    Y[i] = classification_func(np.array([X,]))


Si = sobol.analyze(parms, Y)

result = []

for i in range(8):
    result.append((parms['names'][i], Si['S1'][i]))

result.sort(key=lambda x: -abs(x[1]))
result
for key, value in result:
    print(f'{key} {value:.3f}')

K2O 0.137
Na2O 0.088
CaO 0.085
Al2O3 0.039
CuO 0.032
P2O5 -0.005
SiO 0.002
Fe2O3 0.000


In [21]:
import pandas as pd
import numpy as np

# 制作含有扰动的数据集

SEED = 4
np.random.seed(SEED)

# 噪声设置为 0.1 , x -> (x +- (0~0.1)*x)
NOISE = 0.1

data = pd.read_csv("../data_after_cleaning.csv")
data1 = data['type']
data2 = data.drop(columns=['type'])
data2 = data2.apply(lambda x: x * (1 + (np.random.randn() * 2 - 1) * NOISE))

pd.concat([data1, data2], axis=1).to_csv("../data_after_cleaning_with_noise.csv", index=False)

In [24]:
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

data1 = pd.read_csv("../data_after_cleaning.csv")
data2 = pd.read_csv("../data_after_cleaning_with_noise.csv")

X_without_noise = data1.drop(columns=['type'])
y_without_noise = data1['type']
X_with_noise = data2.drop(columns=['type'])
y_with_noise = data2['type']

X_train_without_noise, X_test_without_noise, y_train_without_noise, y_test_without_noise = train_test_split(X_without_noise, y_without_noise, test_size=0.4, random_state=SEED)
X_train_with_noise, X_test_with_noise, y_train_with_noise, y_test_with_noise = train_test_split(X_with_noise, y_with_noise, test_size=0.4, random_state=SEED)

def noise_test(pred_func, classification_func, model_name):
    cr1 = classification_report(y_test_without_noise, classification_func(X_test_without_noise), output_dict=True)
    cr2 = classification_report(y_test_with_noise, classification_func(X_test_with_noise), output_dict=True)
    # none_noise model_name; class0(未风化): precision recall f1-score; class1(已风化) precision recall f1-score
    print(f"{model_name}\t{cr1['0']['precision']:.2f}\t{cr1['0']['recall']:.2f}\t{cr1['0']['f1-score']:.2f}\t{cr1['1']['precision']:.2f}\t{cr1['1']['recall']:.2f}\t{cr1['1']['f1-score']:.2f}\t")
    print(f"{model_name}\t{cr2['0']['precision']:.2f}\t{cr2['0']['recall']:.2f}\t{cr2['0']['f1-score']:.2f}\t{cr2['1']['precision']:.2f}\t{cr2['1']['recall']:.2f}\t{cr2['1']['f1-score']:.2f}\t")
    

In [25]:
# XGBOOST 决策树模型

from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb=XGBClassifier(max_depth=2)
xgb.fit(X_train_without_noise,y_train_without_noise)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=2, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [47]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

std = StandardScaler()

pca = PCA(n_components=2)
data_train = X_train_without_noise
color = y_train_without_noise
data_after_pca = pca.fit_transform(data_train)
data_after_pca = pd.DataFrame(data_after_pca)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(data_after_pca, color)

def pca_predict(X):
    X = pca.transform(X)
    return knn.predict(pd.DataFrame(X))


In [48]:
noise_test(lasso.predict, classification_func, 'Lasso Regression')
noise_test(xgb.predict, xgb.predict, 'XGBoost')
noise_test(pca_predict, pca_predict, 'PCA+KNN')


Lasso Regression	1.00	1.00	1.00	1.00	1.00	1.00	
Lasso Regression	0.95	1.00	0.97	1.00	0.90	0.95	
XGBoost	1.00	1.00	1.00	1.00	1.00	1.00	
XGBoost	1.00	1.00	1.00	1.00	1.00	1.00	
PCA+KNN	1.00	0.94	0.97	0.91	1.00	0.95	
PCA+KNN	0.90	1.00	0.95	1.00	0.80	0.89	


In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
# from sklearn.preprocessing import StandardScaler,MinMaxScaler

# pca = PCA(n_components=5)
# data_train = data.drop(["type"], axis=1)
# color = data["type"]
# data_after_pca = pca.fit_transform(data_train)
# print(pca.explained_variance_ratio_)
# data_after_pca = pd.DataFrame(data_after_pca)
# print(data_after_pca)
