In [55]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
import jieba

# 1、读取数据

In [104]:
df = pd.read_csv('./data/train.csv')

In [105]:
tdf = pd.read_csv('./data/dev_id.csv')

In [107]:
all_text = df['question1'].tolist() + df['question2'].tolist() + tdf['question1'].tolist() + tdf['question2'].tolist()

In [108]:
all_text = [' '.join(jieba.lcut(i)) for i in all_text]

# 2、数据向量化

In [109]:
tfidf_obj = TfidfVectorizer()

In [110]:
tfidf_obj.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [111]:
ques1_matrix = tfidf_obj.transform(all_text[:20000])
ques2_matrix = tfidf_obj.transform(all_text[20000:40000])
tques1_matrix = tfidf_obj.transform(all_text[40000:50000])
tques2_matrix = tfidf_obj.transform(all_text[50000:])

In [112]:
ques1_matrix

<20000x9128 sparse matrix of type '<class 'numpy.float64'>'
	with 87194 stored elements in Compressed Sparse Row format>

In [113]:
ques2_matrix

<20000x9128 sparse matrix of type '<class 'numpy.float64'>'
	with 90869 stored elements in Compressed Sparse Row format>

In [114]:
tques1_matrix

<10000x9128 sparse matrix of type '<class 'numpy.float64'>'
	with 45310 stored elements in Compressed Sparse Row format>

In [118]:
tques2_matrix

<10000x9128 sparse matrix of type '<class 'numpy.float64'>'
	with 47609 stored elements in Compressed Sparse Row format>

In [136]:
import numpy as np

In [138]:
feature_matrix = np.concatenate([ques1_matrix.toarray(),ques2_matrix.toarray()],axis=1)

In [140]:
feature_matrix.shape

(20000, 18256)

In [143]:
tfeature_matrix = np.concatenate([tques1_matrix.toarray(),tques2_matrix.toarray()],axis=1)

In [145]:
tfeature_matrix.shape

(10000, 18256)

# 3、特征导入模型

In [146]:
from sklearn.linear_model import LogisticRegression

In [147]:
train_x = feature_matrix[:20000]

In [148]:
train_y = df['label'][:20000].tolist()

In [149]:
test_x = tfeature_matrix[:10000]

In [150]:
lr = LogisticRegression()

# 3.1、模型训练

In [151]:
lr.fit(train_x,train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# 3.2、模型预测

In [155]:
pred_y = lr.predict(test_x)

# 4、模型评估

In [163]:
from sklearn.metrics import f1_score

In [164]:
c = {'label':pred_y}

In [165]:
rdf = pd.DataFrame(c, columns = ['label'])

In [167]:
rdf.to_csv('./data/output/result.csv', sep = ",")

# 验证测试集

In [154]:
f1_score(test_y,pred_y)

ValueError: Found input variables with inconsistent numbers of samples: [4000, 10000]

# 5、提高结果

In [78]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
rf = RandomForestClassifier()

In [80]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [81]:
pred_rf_y = rf.predict(test_x)

In [82]:
f1_score(test_y,pred_rf_y)

0.5866026154256739

# 5.1、参数调优

In [83]:
rf1 = RandomForestClassifier(max_leaf_nodes=3000)

In [84]:
rf1.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=3000,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [85]:
pred_rf_y = rf1.predict(test_x)

In [86]:
f1_score(test_y,pred_rf_y)

0.6081582200247219

In [89]:
subdf = pd.read_csv('./data/sample_submission.csv')

In [90]:
c = {'id':subdf['id'][:4000], 'label':pred_rf_y}

In [91]:
dfn = pd.DataFrame(c, columns = ['id', 'label'])

In [93]:
dfn.to_csv('./data/output/result.csv', sep = ',')

# 5.2、xgboost模型

时间比较久

In [33]:
import xgboost

In [34]:
xgb = xgboost.XGBClassifier(n_jobs=2)

In [35]:
xgb.fit(train_x,train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=2,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [36]:
pred_xgb_y = xgb.predict(test_x)

### 结果不太好，可能过拟合

后续对数据再进行一下改进

In [37]:
f1_score(test_y,pred_xgb_y)

0.44046434494195685

In [52]:
rf = RandomForestClassifier()

In [53]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [54]:
pred_rf_y = rf.predict(test_x)

In [55]:
f1_score(test_y,pred_rf_y)

0.5807150595882989

# 训练词向量

In [56]:
import gensim

  return f(*args, **kwds)


In [58]:
model = gensim.models.word2vec.Word2Vec()

In [None]:
model.train()