In [235]:
import pandas as pd
import numpy as np

In [56]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(train.shape)
print(test.shape)

(371, 7)
(529, 6)


In [11]:
# train['train'] = 1
# test['train'] = 0
# data = pd.concat([train, test], axis=0, ignore_index=True)
# print(data.shape)
# display(data.head())

In [21]:
train.color.unique()

array(['clear', 'green', 'black', 'white', 'blue', 'blood'], dtype=object)

In [22]:
train.type.unique()

array(['Ghoul', 'Goblin', 'Ghost'], dtype=object)

In [57]:
type_dict = {'Ghoul':2, 'Goblin':0, 'Ghost':1}
train.loc[:, 'type'] = train['type'].map(type_dict)

In [58]:
train.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0,0.354512,0.350839,0.465761,0.781142,clear,2
1,1,0.57556,0.425868,0.531401,0.439899,green,0
2,2,0.467875,0.35433,0.811616,0.791225,black,2
3,4,0.776652,0.508723,0.636766,0.884464,black,2
4,5,0.566117,0.875862,0.418594,0.636438,green,1


In [59]:
train.type.value_counts()

2    129
0    125
1    117
Name: type, dtype: int64

In [26]:
test.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color
0,3,0.471774,0.387937,0.706087,0.698537,black
1,6,0.427332,0.645024,0.565558,0.451462,white
2,9,0.549602,0.491931,0.660387,0.449809,black
3,10,0.638095,0.682867,0.471409,0.356924,white
4,13,0.361762,0.583997,0.377256,0.276364,black


In [60]:
one_hot_train = pd.get_dummies(train.color, dtype='float')
one_hot_test = pd.get_dummies(test.color, dtype='float')
print(one_hot_train.shape)
print(one_hot_test.shape)

(371, 6)
(529, 6)


In [61]:
train_df = pd.concat([train, one_hot_train], axis=1)
test_df = pd.concat([test, one_hot_test], axis=1)
print(train_df.shape)
print(test_df.shape)

(371, 13)
(529, 12)


In [62]:
train_data = train_df.copy()
test_data = test_df.copy()

In [63]:
train_data = train_data.drop(['id', 'color'], axis=1)
test_data = test_data.drop(['id', 'color'], axis=1)
print(train_data.shape)
print(test_data.shape)

(371, 11)
(529, 10)


In [96]:
from sklearn.model_selection import train_test_split
label = train_data['type'].values
train = train_data.drop(['type'], axis=1)
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3)

print(train_X.shape)
print(valid_X.shape)
print(train_y.shape)
print(valid_y.shape)

(259, 10)
(112, 10)
(259,)
(112,)


# 分类模型
***

### Logistic model

In [204]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

seed = 10
lr = LogisticRegression(random_state=seed, 
                        C=30, 
                        class_weight='balanced', 
                        solver='newton-cg', 
                        multi_class='multinomial')  
lr = lr.fit(train_X, train_y)
lr_pred = lr.predict(valid_X)

print('confusion_matrix--:')
print(confusion_matrix(valid_y, lr_pred))
print('Precision---------:', precision_score(valid_y, lr_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, lr_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, lr_pred, average='micro'))

confusion_matrix--:
[[24  5  6]
 [ 5 33  0]
 [ 8  2 29]]
Precision---------: 0.7678571428571429
Recall------------: 0.7678571428571429
F1_score----------: 0.7678571428571429


### KNN model

In [203]:
from sklearn.neighbors import KNeighborsClassifier

train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(train_X, train_y)
knn = lr.fit(train_X, train_y)
knn_pred = knn.predict(valid_X)

print('confusion_matrix--:')
print(confusion_matrix(valid_y, knn_pred))
print('Precision---------:', precision_score(valid_y, knn_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, knn_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, knn_pred, average='micro'))

confusion_matrix--:
[[24  5  6]
 [ 5 33  0]
 [ 8  2 29]]
Precision---------: 0.7678571428571429
Recall------------: 0.7678571428571429
F1_score----------: 0.7678571428571429


### SVM model

In [202]:
from sklearn.svm import SVC

train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
svc = SVC(C=3, kernel='rbf', random_state=10)
svc = svc.fit(train_X, train_y)
svc_pred = svc.predict(valid_X)

print('confusion_matrix--:')
print(confusion_matrix(valid_y, svc_pred))
print('Precision---------:', precision_score(valid_y, svc_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, svc_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, svc_pred, average='micro'))

confusion_matrix--:
[[24  5  6]
 [ 7 31  0]
 [ 8  3 28]]
Precision---------: 0.7410714285714286
Recall------------: 0.7410714285714286
F1_score----------: 0.7410714285714286


### 随机森林

In [201]:
from sklearn.ensemble import RandomForestClassifier as RFC
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
rfc = RFC(n_estimators=150, max_depth=4, random_state=10)
rfc = rfc.fit(train_X, train_y)
rfc_pred = rfc.predict(valid_X)

print('confusion_matrix--:')
print(confusion_matrix(valid_y, rfc_pred))
print('Precision---------:', precision_score(valid_y, rfc_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, rfc_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, rfc_pred, average='micro'))

confusion_matrix--:
[[25  5  5]
 [ 7 31  0]
 [ 8  2 29]]
Precision---------: 0.7589285714285714
Recall------------: 0.7589285714285714
F1_score----------: 0.7589285714285714


### LightGBM model

In [227]:
import lightgbm as lgb

params = {'boosting_type':'gbdt',
          'num_leaves': 60, 
          'min_data_in_leaf': 30,
          'objective': 'multiclass',
          'num_class': 3,
          'max_depth': -1,
          'learning_rate': 0.06,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.4,	
          "lambda_l2": 0.5,
          "verbosity": -1,
          'metric': 'multi_logloss',
          "random_state": 2022,	
          }
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
tr_data = lgb.Dataset(train_X, label=train_y)
val_data = lgb.Dataset(valid_X, label=valid_y)
num_round = 1000
lgb = lgb.train(params, 
                tr_data,
                num_round,
                valid_sets=[tr_data, val_data],
                verbose_eval=100,
                early_stopping_rounds=200)
y_pred = lgb.predict(valid_X, num_iteration=lgb.best_iteration)
lgb_pred = [list(x).index(max(x)) for x in y_pred]

print('confusion_matrix--:')
print(confusion_matrix(valid_y, lgb_pred))
print('Precision---------:', precision_score(valid_y, lgb_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, lgb_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, lgb_pred, average='micro'))

Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.388456	valid_1's multi_logloss: 0.598804
[200]	training's multi_logloss: 0.316302	valid_1's multi_logloss: 0.614812
Early stopping, best iteration is:
[82]	training's multi_logloss: 0.408648	valid_1's multi_logloss: 0.589863
confusion_matrix--:
[[26  5  4]
 [ 6 32  0]
 [11  2 26]]
Precision---------: 0.75
Recall------------: 0.75
F1_score----------: 0.75




### NN model

In [200]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers

seed = 10
print('seed----------------:', seed)
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)

X_train_NN = (train_X.values).astype('float32') # all pixel values
y_train_NN = train_y.astype('int32')
X_valid_NN = (valid_X.values).astype('float32')
y_valid_NN = valid_y.astype('int32')

one_hot_train_y=to_categorical(train_y)
one_hot_valid_y=to_categorical(valid_y)

#-----------------------------------------构建为网络
np.random.seed(seed)
tf.random.set_seed(seed)

input_shape = X_train_NN.shape[1]
b_size = 500
max_epochs = 200
model = models.Sequential()
model.add(layers.Dense(64,activation='relu',input_shape = (10,)))
# model.add(layers.Dense(128,activation='relu'))
# model.add(layers.Dense(128,activation='relu'))
model.add(layers.Dense(64,activation='relu'))
model.add(layers.Dense(3,activation='softmax'))
print(model.summary())
#------------------------------------------训练模型
model.compile(optimizer='rmsprop', 
              loss="categorical_crossentropy", 
              metrics=['accuracy'])

h = model.fit(X_train_NN, 
              one_hot_train_y, 
              batch_size=b_size, 
              epochs=max_epochs, 
              shuffle=True, 
              verbose=1)
nn_pred = model.predict(X_valid_NN)

NN_pred = []
for v in nn_pred:
    index = np.argmax(v)
    NN_pred.append(index)

print('confusion_matrix--:')
print(confusion_matrix(NN_pred, y_valid_NN))
print('Precision---------:', precision_score(y_valid_NN, NN_pred, average='micro'))
print('Recall------------:', recall_score(y_valid_NN, NN_pred, average='micro'))
print('F1_score----------:', f1_score(y_valid_NN, NN_pred, average='micro'))

seed----------------: 10
Model: "sequential_103"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_344 (Dense)           (None, 64)                704       
                                                                 
 dense_345 (Dense)           (None, 64)                4160      
                                                                 
 dense_346 (Dense)           (None, 3)                 195       
                                                                 
Total params: 5,059
Trainable params: 5,059
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 2

# 自动化寻找最佳模型
***

In [247]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers


seed = 10
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=seed)

def LR():
    seed = 10
    lr = LogisticRegression(random_state=seed, 
                            C=30, 
                            class_weight='balanced', 
                            solver='newton-cg', 
                            multi_class='multinomial')  
    lr = lr.fit(train_X, train_y)
    lr_pred = lr.predict(valid_X)
    
    return [precision_score(valid_y, lr_pred, average='micro'),
            recall_score(valid_y, lr_pred, average='micro'), 
            f1_score(valid_y, lr_pred, average='micro')]

def KNN():
    knn = KNeighborsClassifier(n_neighbors=3)
    knn = knn.fit(train_X, train_y)
    knn = lr.fit(train_X, train_y)
    knn_pred = knn.predict(valid_X)

    return [precision_score(valid_y, knn_pred, average='micro'),
            recall_score(valid_y, knn_pred, average='micro'), 
            f1_score(valid_y, knn_pred, average='micro')]
    
def SVM():
    svc = SVC(C=3, kernel='rbf', random_state=seed)
    svc = svc.fit(train_X, train_y)
    svc_pred = svc.predict(valid_X)

    return [precision_score(valid_y, svc_pred, average='micro'),
            recall_score(valid_y, svc_pred, average='micro'), 
            f1_score(valid_y, svc_pred, average='micro')]

def rfc():
    rfc = RFC(n_estimators=150, max_depth=4, random_state=seed)
    rfc = rfc.fit(train_X, train_y)
    rfc_pred = rfc.predict(valid_X)

    return [precision_score(valid_y, rfc_pred, average='micro'),
            recall_score(valid_y, rfc_pred, average='micro'), 
            f1_score(valid_y, rfc_pred, average='micro')]
    
def lgb_model():
    params = {'boosting_type':'gbdt',
          'num_leaves': 60, 
          'min_data_in_leaf': 30,
          'objective': 'multiclass',
          'num_class': 3,
          'max_depth': -1,
          'learning_rate': 0.06,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.4,	
          "lambda_l2": 0.5,
          "verbosity": -1,				
          'metric': 'multi_logloss',	
          "random_state": 2022,	
          }
    tr_data = lgb.Dataset(train_X, label=train_y)
    val_data = lgb.Dataset(valid_X, label=valid_y)
    num_round = 1000
    lgb_model = lgb.train(params, 
                    tr_data,
                    num_round,
                    valid_sets=[tr_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds=200)
    y_pred = lgb_model.predict(valid_X, num_iteration=lgb_model.best_iteration)
    lgb_pred = [list(x).index(max(x)) for x in y_pred]

    return [precision_score(valid_y, lgb_pred, average='micro'),
            recall_score(valid_y, lgb_pred, average='micro'), 
            f1_score(valid_y, lgb_pred, average='micro')]
    
    
def NN():
    X_train_NN = (train_X.values).astype('float32') # all pixel values
    y_train_NN = train_y.astype('int32')
    X_valid_NN = (valid_X.values).astype('float32')
    y_valid_NN = valid_y.astype('int32')

    one_hot_train_y=to_categorical(train_y)
    one_hot_valid_y=to_categorical(valid_y)

    #-----------------------------------------构建为网络
    np.random.seed(seed)
    tf.random.set_seed(seed)

    input_shape = X_train_NN.shape[1]
    b_size = 500
    max_epochs = 200
    
    model = models.Sequential()
    model.add(layers.Dense(64,activation='relu',input_shape = (10,)))
    model.add(layers.Dense(64,activation='relu'))
    model.add(layers.Dense(3,activation='softmax'))
    print(model.summary())
    #------------------------------------------训练模型
    model.compile(optimizer='rmsprop', 
                  loss="categorical_crossentropy", 
                  metrics=['accuracy'])

    h = model.fit(X_train_NN, 
                  one_hot_train_y, 
                  batch_size=b_size, 
                  epochs=max_epochs, 
                  shuffle=True, 
                  verbose=1)
    nn_pred = model.predict(X_valid_NN)

    NN_pred = []
    for v in nn_pred:
        index = np.argmax(v)
        NN_pred.append(index)

    return [precision_score(valid_y, NN_pred, average='micro'),
            recall_score(valid_y, NN_pred, average='micro'), 
            f1_score(valid_y, NN_pred, average='micro')]


In [249]:
model_dict = {'0':'LR()', 
              '1':'KNN()', 
              '2':'SVM()', 
              '3':'rfc()', 
              '4':'lgb_model()', 
              '5':'NN()'}
MODELS = [LR(), KNN(), SVM(), rfc(), lgb_model(), NN()]
Pre = []
Recall = []
F1_score = []
for MODEL in MODELS:
    clf = MODEL
    Pre.append(clf[0])
    Recall.append(clf[1])
    F1_score.append(clf[2])
Pre_index = np.argmax(Pre)
Recall_index = np.argmax(Recall)
F1_score_index = np.argmax(F1_score)

print('Pre最高的模型是---------：', model_dict[str(Pre_index)])
print('Recall最高的模型是---------：', model_dict[str(Recall_index)])
print('F1_score最高的模型是-------：', model_dict[str(F1_score_index)])

Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.388456	valid_1's multi_logloss: 0.598804
[200]	training's multi_logloss: 0.316302	valid_1's multi_logloss: 0.614812
Early stopping, best iteration is:
[82]	training's multi_logloss: 0.408648	valid_1's multi_logloss: 0.589863
Model: "sequential_108"




_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_359 (Dense)           (None, 64)                704       
                                                                 
 dense_360 (Dense)           (None, 64)                4160      
                                                                 
 dense_361 (Dense)           (None, 3)                 195       
                                                                 
Total params: 5,059
Trainable params: 5,059
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoc

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

seed = 10
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=seed)
mlp = MLPClassifier(hidden_layer_sizes=(128, 30), 
                    activation='relu', 
                    solver='adam', 
                    max_iter=200, 
                    random_state=seed)

X_train_NN = (train_X.values).astype('float32')
y_train_NN = train_y.astype('int32')
X_valid_NN = (valid_X.values).astype('float32')
y_valid_NN = valid_y.astype('int32')

mlp = mlp.fit(X_train_NN, y_train_NN)
y_pred = mlp.predict(X_valid_NN) 

print('confusion_matrix--:')
print(confusion_matrix(y_pred, y_valid_NN))
print('Precision---------:', precision_score(y_valid_NN, y_pred, average='micro'))
print('Recall------------:', recall_score(y_valid_NN, y_pred, average='micro'))
print('F1_score----------:', f1_score(y_valid_NN, y_pred, average='micro'))