In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation, metrics
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV

from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, CustomJS, HoverTool
from bokeh.io import output_notebook, push_notebook
from bokeh.layouts import gridplot, widgetbox, layout
from bokeh.models.widgets import Select
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6, Spectral11
from bokeh.models.widgets import Select

from pipelines import *

%matplotlib inline



In [2]:
# Feature-extraction settings: embedding width per token and number of
# tokens kept per document.
embedding_dim = 10
top_n_token = 10

# Spreadsheets with the training / testing rows (presumably already
# tokenised, going by the file names).
training_path = '/home/alvin/!Final_Project/training_with_tokens.xlsx'
testing_path = '/home/alvin/!Final_Project/testing_with_tokens.xlsx'

# Only the training sheet is loaded in this cell.
print('Load data...')
df_train = load_data(training_path)

Load data...


In [3]:
# Preview the first rows of the loaded training frame.
df_train.head()

Unnamed: 0,class,tokens,sentence
0,2,"[合晟资产, 专注, 股票, 债券, 二级市场, 投资, 合格, 投资者, 资产, 管理, ...",合晟资产 专注 股票 债券 二级市场 投资 合格 投资者 资产 管理 企业 业务范围 资产 ...
1,2,"[中, 小微企业, 个体, 工商户, 农户, 贷款, 设立, 发生, 变化, UNKNOWN]",中 小微企业 个体 工商户 农户 贷款 设立 发生 变化 UNKNOWN
2,1,"[立足于, 商业地产, 商业地产, 开发, 销售, 运营, 全产业链, 一整套, 增值, 业...",立足于 商业地产 商业地产 开发 销售 运营 全产业链 一整套 增值 业务 覆盖 商业 定位...
3,2,"[工商管理部门, 核准, 经营范围, 投资, 咨询, 经济, 信息, 咨询, 企业管理, 咨...",工商管理部门 核准 经营范围 投资 咨询 经济 信息 咨询 企业管理 咨询 品牌 推广 策划...
4,2,"[中国, 境内, 港, 澳, 台, 保险代理, 销售, 研究, 能力, 专业化, 能力, 团...",中国 境内 港 澳 台 保险代理 销售 研究 能力 专业化 能力 团体 个人保险 受众 投保...


In [70]:
class TokensPicker(BaseEstimator, TransformerMixin):
    """Turn tokenised documents into fixed-length word-embedding features.

    For every row the transformer ranks the row's tokens by TF-IDF weight,
    keeps the ``top_n`` best ones (padding with ``'UNKNOWN'`` when there are
    fewer distinct tokens) and concatenates their Word2Vec vectors into one
    flat list of ``top_n * embed_dim`` numbers.

    The input frame must provide a ``tokens`` column (list of tokens per row)
    and a ``sentence`` column (text fed to the TF-IDF vectorizer).

    Parameters
    ----------
    embed_dim : int
        Dimensionality of each token embedding.
    top_n : int
        Number of tokens kept per document.
    window : int
        Word2Vec context window.
    min_count : int
        Word2Vec minimum token frequency.

    Notes
    -----
    * Constructor arguments are stored under their own names because
      scikit-learn's ``get_params`` / ``set_params`` / ``clone`` look them up
      by name. ``embedding_dim`` and ``top_n_token`` remain available as
      aliases for existing callers.
    * Tokens are looked up verbatim in the TF-IDF vocabulary; a token the
      vectorizer did not keep (its default tokenisation may lower-case text
      and drop very short tokens -- confirm against the sklearn version in
      use) scores 0.
    """

    # Token used to pad documents with fewer than ``top_n`` distinct tokens.
    _PADDING_TOKEN = 'UNKNOWN'

    def __init__(self, embed_dim=10, top_n=10, window=5, min_count=1):
        self.embed_dim = embed_dim
        self.top_n = top_n
        self.window = window
        self.min_count = min_count

    @property
    def embedding_dim(self):
        """Backward-compatible alias of ``embed_dim``."""
        return self.embed_dim

    @embedding_dim.setter
    def embedding_dim(self, value):
        self.embed_dim = value

    @property
    def top_n_token(self):
        """Backward-compatible alias of ``top_n``."""
        return self.top_n

    @top_n_token.setter
    def top_n_token(self, value):
        self.top_n = value

    def generate_word_embedding_from_df(self, df, col='tokens', vect_dims=100, window=5, min_count=1, workers=4, seed=11):
        """Load a cached Word2Vec model or train and cache a new one.

        The cache file name is derived from ``col``, ``vect_dims``, ``window``,
        ``min_count`` and ``seed`` only, so a model trained on different data
        with the same settings is reused as-is.

        Parameters
        ----------
        df : DataFrame holding the token lists in column ``col``.
        vect_dims : embedding size (``size`` in the Word2Vec call).
        window, min_count, workers, seed : forwarded to Word2Vec.

        Returns
        -------
        The loaded or freshly trained Word2Vec model.
        """
        saved_model_name = w2v_file_name_from_parameters(col, vect_dims, window, min_count, seed)
        if os.path.exists(saved_model_name):
            return Word2Vec.load(saved_model_name)

        # size: length of the dense vector representing each token.
        # sg=1: skip-gram training (0 would be CBOW).
        # window: maximum distance between a target word and its context words.
        # min_count: tokens occurring fewer times than this are ignored.
        w2v_model = Word2Vec(sentences=df[col], size=vect_dims, sg=1, window=window,
                             min_count=min_count, seed=seed, workers=workers)
        w2v_model.save(saved_model_name)
        return w2v_model

    def fit(self, df, y=None):
        """Learn the token embeddings and the TF-IDF vocabulary from ``df``.

        Sets ``w2v_model``, ``vectorizer`` and ``vectors`` (the TF-IDF matrix
        of the fitting data, kept for backward compatibility). Returns self.
        """
        self.w2v_model = self.generate_word_embedding_from_df(
            df,
            col='tokens',
            vect_dims=self.embed_dim,
            # Previously the constructor's window/min_count were silently ignored.
            window=self.window,
            min_count=self.min_count,
        )
        self.vectorizer = TfidfVectorizer()
        self.vectors = self.vectorizer.fit_transform(df['sentence'].tolist())
        return self

    def _select_top_tokens(self, tokens, tfidf_row, top_n):
        """Return the ``top_n`` highest TF-IDF tokens of one document, padded to length."""
        vocabulary = self.vectorizer.vocabulary_
        scores = {}
        for token in tokens:
            column = vocabulary.get(token)
            # Tokens unknown to the vectorizer get weight 0.
            scores[token] = tfidf_row[column] if column is not None else 0
        # sorted() is stable, so equally weighted tokens keep first-seen order.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        top_tokens = [token for token, _ in ranked[:top_n]]
        top_tokens.extend([self._PADDING_TOKEN] * (top_n - len(top_tokens)))
        return top_tokens

    def get_top_tokens_in_doc(self, df, vectors, row_id, top_n=25):
        """Return the ``top_n`` highest TF-IDF tokens for row ``row_id``.

        ``row_id`` is used both as the row position in ``vectors`` and as the
        index label in ``df``, so it is only meaningful when the two coincide
        (e.g. a default RangeIndex).
        """
        # ravel() (unlike squeeze()) stays 1-D even for a one-term vocabulary.
        tfidf_row = vectors[row_id].toarray().ravel()
        tokens = df.loc[row_id]['tokens']
        return self._select_top_tokens(tokens, tfidf_row, top_n)

    def convert_tokens_to_features(self, tokens):
        """Concatenate the embeddings of ``tokens`` into one flat list.

        Tokens missing from the Word2Vec vocabulary contribute ``embed_dim`` zeros.
        """
        # Prefer the KeyedVectors view when the model exposes one; direct
        # indexing on the model object is deprecated in gensim.
        keyed_vectors = getattr(self.w2v_model, 'wv', self.w2v_model)
        zero_embedding = [0] * self.embed_dim
        features = []
        for token in tokens:
            if token in keyed_vectors:
                features.extend(keyed_vectors[token].tolist())
            else:
                features.extend(zero_embedding)
        return features

    def transform(self, df, y=None):
        """Compute the feature list for every row of ``df``.

        TF-IDF weights are computed for the frame being transformed, so the
        result is correct for data other than the frame used in ``fit``.

        Side effects (unchanged from the previous implementation): adds the
        columns ``top_tokens`` and ``features`` to ``df`` in place and writes
        the frame to ``transformed_topn-<top_n>_embeddingdim-<embed_dim>.csv``.

        Returns
        -------
        DataFrame with ``features`` and, when present in ``df``, ``class``.
        """
        tfidf = self.vectorizer.transform(df['sentence'].tolist())
        # Pair rows by position so a non-default index cannot misalign
        # documents with their TF-IDF rows.
        top_tokens = [
            self._select_top_tokens(tokens, tfidf[position].toarray().ravel(), self.top_n)
            for position, tokens in enumerate(df['tokens'])
        ]
        df['top_tokens'] = pd.Series(top_tokens, index=df.index)
        df['features'] = df['top_tokens'].apply(self.convert_tokens_to_features)
        df.to_csv('transformed_topn-{0}_embeddingdim-{1}.csv'.format(self.top_n, self.embed_dim))
        # Unlabelled data (no 'class' column) can be transformed as well.
        output_columns = ['features', 'class'] if 'class' in df.columns else ['features']
        return df[output_columns]

In [71]:
# Build the feature-extraction pipeline from the notebook-level settings, so
# that editing embedding_dim / top_n_token in the configuration cell actually
# changes the generated features (they were previously ignored in favour of
# the constructor defaults).
feature_pipeline = Pipeline([
    ('Tokens_Picker_Pipeline', TokensPicker(embed_dim=embedding_dim, top_n=top_n_token)),
])

# Fit on the training frame and produce its features in one pass.
df_train_transformed = feature_pipeline.fit_transform(df_train)



In [72]:
# Preview the transformed frame: one flat feature list plus the class label per row.
df_train_transformed.head()

Unnamed: 0,features,class
0,"[-0.7738588452339172, 1.0097960233688354, -0.4...",2
1,"[-0.2603164315223694, 0.4835744798183441, -0.9...",2
2,"[-0.2622810900211334, 0.623058021068573, -0.42...",1
3,"[-0.4411155879497528, 0.5608793497085571, -0.2...",2
4,"[-0.10545478761196136, 0.49129846692085266, -0...",2


In [74]:
# Candidate values searched for the gradient-boosting classifier.
n_estimators = [100, 200, 500, 1000]
max_depth = [2, 4, 6, 8, 10]
min_samples_split = [500, 1000, 2000]

# Single-step pipeline; the step name 'Classifier' is the prefix used by the
# param_grid keys below. The estimator is seeded (same value as the baseline
# model further down) so repeated searches give the same scores.
# NOTE: TokensPicker settings (embed_dim, top_n, window) are not searched
# here; the features are taken precomputed from df_train_transformed.
classifier_pipeline = Pipeline([
    ('Classifier', GradientBoostingClassifier(random_state=10)),
])

param_grid = [{
    'Classifier__n_estimators': n_estimators,
    'Classifier__max_depth': max_depth,
    'Classifier__min_samples_split': min_samples_split,
}]

# 5-fold cross-validated search scored by negative log loss (higher is better).
grid = GridSearchCV(classifier_pipeline, cv=5, n_jobs=4, param_grid=param_grid, scoring='neg_log_loss')
grid.fit(df_train_transformed['features'].tolist(), df_train['class'].tolist())

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('Classifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=...     presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid=[{'Classifier__n_estimators': [20, 50, 80, 100], 'Classifier__max_depth': [2, 4, 6, 8, 10], 'Classifier__min_samples_split': [200, 500, 800, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='neg_log_loss',
       verbose=0)

In [80]:
# Parameter combination with the best mean cross-validated score.
grid.best_params_

{'Classifier__max_depth': 6,
 'Classifier__min_samples_split': 1000,
 'Classifier__n_estimators': 100}

In [79]:
# Mean / std of the CV score for every parameter combination.
# NOTE(review): `grid_scores_` is the attribute of the legacy
# sklearn.grid_search.GridSearchCV imported in this notebook; the
# sklearn.model_selection version reports these through `cv_results_`.
grid.grid_scores_

[mean: -1.20905, std: 0.02600, params: {'Classifier__max_depth': 2, 'Classifier__min_samples_split': 200, 'Classifier__n_estimators': 20},
 mean: -1.02622, std: 0.02806, params: {'Classifier__max_depth': 2, 'Classifier__min_samples_split': 200, 'Classifier__n_estimators': 50},
 mean: -0.96456, std: 0.02500, params: {'Classifier__max_depth': 2, 'Classifier__min_samples_split': 200, 'Classifier__n_estimators': 80},
 mean: -0.94400, std: 0.02847, params: {'Classifier__max_depth': 2, 'Classifier__min_samples_split': 200, 'Classifier__n_estimators': 100},
 mean: -1.23275, std: 0.02442, params: {'Classifier__max_depth': 2, 'Classifier__min_samples_split': 500, 'Classifier__n_estimators': 20},
 mean: -1.03369, std: 0.02537, params: {'Classifier__max_depth': 2, 'Classifier__min_samples_split': 500, 'Classifier__n_estimators': 50},
 mean: -0.96920, std: 0.02623, params: {'Classifier__max_depth': 2, 'Classifier__min_samples_split': 500, 'Classifier__n_estimators': 80},
 mean: -0.94670, std: 0.02

In [53]:
# Re-fit the feature pipeline on the training frame, passing the class labels as y.
# Pipeline.fit returns the pipeline itself, so model_0 is the same object as
# feature_pipeline.
# NOTE(review): this cell's execution count is lower than that of the cell
# defining feature_pipeline, so it may have been run against an earlier
# definition — confirm it still works on a fresh kernel.
model_0 = feature_pipeline.fit(df_train, df_train['class'].tolist())



In [57]:
# NOTE(review): Pipeline.predict requires a final step that implements predict.
# The feature_pipeline defined in this notebook contains only TokensPicker,
# which defines no predict in the code shown, so the stored output presumably
# came from an earlier pipeline definition — confirm before relying on it.
model_0.predict(df_train)



array([2, 2, 1, ..., 6, 5, 4])

In [7]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# class label and seeded so it can be repeated.
df_x_transformed = df_train_transformed.drop(['class'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    df_x_transformed,
    df_train_transformed['class'],
    test_size=0.2,
    random_state=11,
    stratify=df_train['class'],
)

In [13]:
# Baseline gradient-boosting model: library-default hyper-parameters, fixed seed.
gbm0 = GradientBoostingClassifier(random_state=10)
# Each 'features' cell holds one flat list, so .tolist() gives a list of lists
# (one inner list per sample) for the estimator.
gbm0.fit(x_train['features'].tolist(), y_train.tolist())

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=10, subsample=1.0, verbose=0,
              warm_start=False)

In [14]:
# Predict training set
train_predictions = gbm0.predict(x_test['features'].tolist())
train_predprob = gbm0.predict_proba(x_test['features'].tolist())[:1]

In [15]:
# First five predicted labels.
train_predictions[:5]

array([6, 3, 3, 4, 4])

In [18]:
# Show the stored class-probability array.
train_predprob

array([[1.66337170e-04, 4.03305413e-04, 4.55247937e-03, 1.76416634e-01,
        1.05184377e-02, 7.76706779e-01, 9.19678397e-03, 5.20889555e-03,
        1.52967988e-02, 1.17838339e-03, 3.55165810e-04]])

In [19]:
# Accuracy of the stored predictions against the held-out labels (y_test).
acc_score = metrics.accuracy_score(y_test.tolist(), train_predictions)
print(acc_score)

0.6460732984293194


In [24]:
# Log loss of gbm0 on x_train / y_train, i.e. the same rows it was fitted on,
# so this is not an out-of-sample figure.
# NOTE(review): despite its name, `auc_score` holds a log loss, not an AUC.
auc_score = metrics.log_loss(y_train.tolist(), gbm0.predict_proba(x_train['features'].tolist()))
print(auc_score)

0.4869566001388137
