In [1]:
import os
import json
import pickle

import numpy as np
import pandas as pd

import fasttext
import fasttext.util

import tensorflow as tf
import tensorflow_text

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support, classification_report, precision_recall_curve

import spacy

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
array = np.loadtxt(os.getcwd() + '/../dataframes/210705_news_328064.txt', delimiter=',').reshape(1, -1)
array.shape

(1, 512)

In [3]:
PRETRAINED_FASTTRACK_MODEL = os.getcwd() + '/../../models/lid.176.bin'
LANG_EN = "__label__en"
LANG_DE = "__label__de"

fasttext.FastText.eprint = lambda x: None
fasttext_model = fasttext.load_model(PRETRAINED_FASTTRACK_MODEL)

def detect_language(text):
    
    lang_label = fasttext_model.predict(text)[0][0].split('__label__')[1]
    return lang_label

In [4]:
text = """미소와 함께 '일상의 변화'를 만들 '미소 메이커스'를 찾습니다  미소는 국내 No.1 O2O 홈서비스 플랫폼 기업입니다. "Hotel-like service in your home"  이라는 비전 아래, 고객에게 더욱 행복한 경험을 더욱 많이 제공하고자 합니다. 대표 서비스인 '홈클리닝'을 중심으로, '이사'/'가전청소'/'인테리어'/'펫시팅' 등  70여가지의 서비스 로 사업 범위를 확장하였습니다. 고객 만족 원칙과 데이터 기반 기술로 매년 2배가량 돋보이게 성장하고 있습니다. 2021년 현재 Series A 단계이며, 총 투자규모는 약 130억 원입니다. 한국 O2O 기업으로는 최초로 실리콘밸리의 최대 벤처 투자사 ‘Y Combinator’로부터 31억 원의 투자를 유치했습니다. 누적 매출액은  1,000억 원 , 누적 주문건수  300만 건  및 누적 파트너수  40,000명 을 돌파했습니다.  미소의 일 하는 방식 엿보기 Work hard on the Right Things 미소는 올바른 일에 집중합니다. 미소 팀블로그에서 미소 메이커스의 이야기를 들어 보세요!    예비 미소 메이커스를 위한 참고 사이트   채용과 관련한 모든 문의사항은,  recruit@getmiso.com (People Team) 으로 부탁드립니다."""
fasttext_model.predict(text)[0][0].split('__label__')[1]

'ko'

In [6]:
tf.saved_model.LoadOptions(
    allow_partial_checkpoint=False,
    experimental_io_device='/job:localhost',
    experimental_skip_checkpoint=False
)

tf_model = tf.keras.models.load_model(
    os.getcwd() + '/../../models/USE_model/'
)



In [7]:
def get_svm_classifier():
    return SVC(kernel='rbf', gamma='auto', class_weight='balanced', probability=True, random_state=122)

def get_rf_classifier():
    return RandomForestClassifier(n_estimators=300, random_state=122)

def get_lr_classifier():
    return LogisticRegression(class_weight='balanced', random_state=122)

def get_three_class_models():
    
    model_1 = get_svm_classifier()
    model_2 = get_svm_classifier()
    model_3 = get_lr_classifier()
    
    return [model_1, model_2, model_3]

def get_modified_vectors(vec_data):
    
    new_data = []
    for val in vec_data:
        new_data.append(val)
    
    new_data = np.array(new_data).reshape(-1, 512)
    return new_data

In [8]:
def get_threshold_output(preds, threshold):
    
    y_preds = []
    for val in preds:
        if val >= threshold:
            y_preds.append(1)
        else:
            y_preds.append(0)
        
    y_preds = np.array(y_preds)
    
    return y_preds

def get_f1_score_binary(y, preds, threshold, print_report=False):
    
    y_preds = get_threshold_output(preds, threshold)
    
    metrics = precision_recall_fscore_support(y, y_preds)
    f1_score_tech = metrics[2][1]
    precision_tech = metrics[0][1]
    recall_tech = metrics[1][1]
    
    if print_report:
        print(classification_report(y, y_preds))
        return f1_score_tech, precision_tech, recall_tech
    
    return f1_score_tech

def get_best_threshold(y, preds):
    
    threshold_vals = np.arange(0.1, 1, 0.001)
    f1_score_list = []
    
    for val in threshold_vals:
        f1_score_list.append(get_f1_score_binary(y, preds, val))

    max_idx = np.nanargmax(f1_score_list)
    thre_max = threshold_vals[max_idx]
    fscore = f1_score_list[max_idx]
    
    print(fscore)
    print(thre_max)
    
    return thre_max    

def get_trained_model_binary(X, y):
    
#     skf_f1score = perform_cross_validation(X, y, fold_cnt=5)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=123)
    
    model = get_svm_classifier()
    clf = make_pipeline(StandardScaler(with_mean=False), model)
    clf.fit(X_train, y_train)
    
    pred_probs = clf.predict_proba(X_test)[:,1]
    threshold = get_best_threshold(y_test, pred_probs)
    
    skf_f1score = 0
    
    return clf, skf_f1score, threshold

def get_test_f1score_binary(model, X_test, y_test, threshold):
    
    preds = model.predict_proba(X_test)
    preds = preds[:,1]

    f1_score = get_f1_score_binary(y_test, preds, threshold, print_report=True)
    
    return f1_score

In [9]:
def get_multi_class_metrics(y_test, preds, pr_flag=False):
    
    metrics = precision_recall_fscore_support(y_test, preds)
    
    precision_tech = metrics[0][1]
    precision_milt = metrics[0][2]
    
    recall_tech = metrics[1][1]
    recall_milt = metrics[1][2]
    
    f1_score_tech = metrics[2][1]
    f1_score_milt = metrics[2][2]
    
#     f1_score_milt = metrics[2][1]
    
    f1_score = (f1_score_tech+f1_score_milt)/2
    
    if pr_flag:
        print(classification_report(y_test, preds))
        return f1_score, preds, f1_score_milt, precision_milt, recall_milt
    
    return f1_score

def perform_cross_validation(X, y, fold_cnt=5):
    
    skf = StratifiedKFold(n_splits=fold_cnt, shuffle=True, random_state=123)
    f1_scores_list = []
    
    for train_idx, test_idx in skf.split(X, y):
        
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model = get_svm_classifier()
        clf = make_pipeline(StandardScaler(with_mean=False), model)
        
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)

        f1_score = get_multi_class_metrics(y_test, preds, pr_flag=False)
        f1_scores_list.append(f1_score)
    
    return sum(f1_scores_list)/fold_cnt

def get_trained_model(X, y):
    
    skf_f1score = perform_cross_validation(X, y, fold_cnt=5)
    
    model = get_svm_classifier()
    clf = make_pipeline(StandardScaler(with_mean=False), model)
    clf.fit(X, y)
    
    return clf, skf_f1score

def get_test_f1score(model, X_test, y_test):
    
    preds = model.predict(X_test)
    f1_score, preds, f1_score_milt, precision_milt, recall_milt = get_multi_class_metrics(y_test, preds, pr_flag=True)
    
    return f1_score, preds, f1_score_milt, precision_milt, recall_milt

def get_performance_metrics(X_train, y_train, X_test, y_test, binary=False):
    
    if binary:
        model, cv_score, threshold = get_trained_model_binary(X_train, y_train)
        test_f1_score, precision_tech, recall_tech = get_test_f1score_binary(model, X_test, y_test,threshold)
        
        return model, threshold, test_f1_score, precision_tech, recall_tech
    else:    
        model, test_cv_score = get_trained_model(X_train, y_train)
        test_f1_score, preds, f1_score_milt, precision_milt, recall_milt = get_test_f1score(model, X_test, y_test)
        
        test_df['pred_label'] = preds

        return model, test_cv_score, test_f1_score, f1_score_milt, precision_milt, recall_milt

    print()
    print(f'Training CV f1 score: {cv_score}')
    print(f'Test F1-score: {test_f1_score}')

In [10]:
train_df = pd.read_pickle(os.getcwd() + '/../dataframes/train_df_features.pkl')  ## train_df
test_df = pd.read_pickle(os.getcwd() + '/../dataframes/test_df_features.pkl') ## test_df

train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
unlabeled_df = pd.read_pickle(os.getcwd() + '/../dataframes/unlabeled_data_feature_extracted.pkl') # unlabeled_df
unlabeled_df

In [None]:
unlabeled_df = pd.read_pickle(os.getcwd() + '/../dataframes/unlabeled_df_features.pkl') # unlabeled_df
# unlabeled_df['lang'] = unlabeled_df.apply(lambda x:detect_language(x['text'].revplace("\n"," ")), axis=1)

In [11]:
X_train_use = get_modified_vectors(train_df.doc_vec.values)
X_test_use = get_modified_vectors(test_df.doc_vec.values)

In [15]:
# train_df['nc_vec'] = train_df.apply(lambda x:get_avg_token_vector(x['text_tokens'][0]), axis=1)
# test_df['nc_vec'] = test_df.apply(lambda x:get_avg_token_vector(x['text_tokens'][0]), axis=1)

X_train_nc = get_modified_vectors(train_df.nc_vec.values)
X_test_nc = get_modified_vectors(test_df.nc_vec.values)

In [16]:
y_train = train_df.label.values
y_test = test_df.label.values

y_train_new = np.array([val if val!=3 else 0 for val in y_train]).astype('int32')
y_test_new = np.array([val if val!=3 else 0 for val in y_test]).astype('int32')

In [17]:
def get_avg_token_vector(token_list):
    
    avg_token_vec = []
    for token in token_list:
        avg_token_vec.append(tf_model(token)['outputs'].numpy()[0].reshape(1, -1))
        
    return np.mean(avg_token_vec, axis=0)

def perform_analysis(model_type):
    multi_model, test_cv_score, test_f1_score, f1_score_milt, precision_milt, recall_milt = get_performance_metrics(X_train_nc, y_train_new, X_test_nc, y_test_new)

    test_df_new = test_df[test_df['pred_label'].isin([0,1,3])]
    y_train_tech = np.array([0 if val!=1 else 1 for val in y_train]).astype('int32')

    X_test_new_nc = get_modified_vectors(test_df_new.nc_vec.values)
    y_test_new_tech = np.array([0 if val!=1 else 1 for val in test_df_new.label.values]).astype('int32')

    bin_model, threshold, f1_score_tech, precision_tech, recall_tech = get_performance_metrics(X_train_nc, y_train_tech, X_test_new_nc, y_test_new_tech, binary=True)
    
    pickle.dump(multi_model, open(os.getcwd()+'/../../models/multi_model_stage_1.pkl', 'wb'))
    pickle.dump(bin_model, open(os.getcwd()+'/../../models/bin_model_stage_2.pkl', 'wb'))
    
    results_list = [test_cv_score, test_f1_score,f1_score_milt,  precision_milt, recall_milt, f1_score_tech, precision_tech, recall_tech]
    # filter_unlabeled_data(multi_model, bin_model)

    results_list = [str(round(val, 2)) for val in results_list]
    results_list.append(model_type)

    write_data_to_file(os.getcwd()+'/../csv_data/two_stage_classification_results.txt', '|'.join(results_list))

In [20]:
perform_analysis(model_type='use')

              precision    recall  f1-score   support

           0       0.91      0.86      0.88       146
           1       0.30      0.19      0.23        16
           2       0.53      0.83      0.64        23

    accuracy                           0.80       185
   macro avg       0.58      0.63      0.59       185
weighted avg       0.81      0.80      0.80       185

0.25
0.10900000000000001
              precision    recall  f1-score   support

           0       0.97      0.80      0.88       137
           1       0.24      0.75      0.37        12

    accuracy                           0.79       149
   macro avg       0.61      0.77      0.62       149
weighted avg       0.91      0.79      0.83       149



In [19]:
def write_data_to_file(filepath, data):

    with open(filepath, "a") as f:
        f.write(data+'\n')

def write_document_data(data, filepath):

    with open(filepath, 'w') as f:
        json.dump(data, f)

def filter_unlabeled_data(multi_model, bin_model):
    
    X_unlabeled = get_modified_vectors(unlabeled_df.nc_vec.values)
    unlabeled_df['milt_label'] = multi_model.predict(X_unlabeled)

    unlabeled_df_new = unlabeled_df[unlabeled_df['milt_label'].isin([0,1,3])]
    X_unlabeled_new = get_modified_vectors(unlabeled_df_new.nc_vec.values)

    preds = bin_model.predict_proba(X_unlabeled_new)[:,1]
    unlabeled_df_new['tech_label'] = get_threshold_output(preds, threshold)

    tech_data_dict = dict()
    milt_data_dict = dict()

    for idx, row in unlabeled_df_new.iterrows():

        if row['tech_label'] == 1:
            tech_data_dict[row['id']] = {
                'page_id': row['id'],
                'text': row['text']
            }

    for idx, row in unlabeled_df.iterrows():

        if row['milt_label'] == 2:
            milt_data_dict[row['id']] = {
                'page_id': row['id'],
                'text': row['text']
            }
            
    tech_len = len(tech_data_dict.keys())    
    milt_len = len(milt_data_dict.keys())    
    
    print(f'Technology documents filtered: {tech_len}')
    print(f'Military documents filtered: {milt_len}')

    write_document_data(tech_data_dict, os.getcwd()+'/../json_data/technologie_document_data.json')
    write_document_data(milt_data_dict, os.getcwd()+'/../json_data/military_document_data.json')

In [366]:
get_performance_metrics(X_train_new_nc, y_train_new_tech, X_test_new_nc, y_test_new_tech, binary=True)

0.3
0.10500000000000001
              precision    recall  f1-score   support

           0       0.95      0.78      0.86       134
           1       0.19      0.58      0.29        12

    accuracy                           0.76       146
   macro avg       0.57      0.68      0.57       146
weighted avg       0.89      0.76      0.81       146


Training CV f1 score: 0
Test F1-score: 0.28571428571428575


In [328]:
get_performance_metrics(X_train_topic_use, y_train, X_test_topic_use, y_test_tech, binary=True)

0.3529411764705882
0.13100000000000003
              precision    recall  f1-score   support

           0       0.92      0.85      0.88       169
           1       0.13      0.25      0.17        16

    accuracy                           0.79       185
   macro avg       0.53      0.55      0.53       185
weighted avg       0.85      0.79      0.82       185


Training CV f1 score: 0
Test F1-score: 0.1739130434782609


In [26]:
get_performance_metrics(X_train_use, y_train, X_test_use, y_test)

              precision    recall  f1-score   support

           0       0.81      0.73      0.77       113
           1       0.43      0.19      0.26        16
           2       0.45      0.78      0.57        23
           3       0.38      0.42      0.40        33

    accuracy                           0.63       185
   macro avg       0.52      0.53      0.50       185
weighted avg       0.66      0.63      0.63       185


Training CV f1 score: 0.41471200104976197
Test F1-score: 0.4161490683229813


### 2. Topic features testing

In [9]:
def get_use_topic_score(doc_vec, topic):
    
    return cosine_similarity(doc_vec, topic_embeddings_dict[topic])[0][0]

In [324]:
topic_list = ['Wirtschaft und Finanzen', 'Bildung', 'Politik', 'Tierreich', 'Rechtswissenschaften und Rechtsprechung', 'Gesundheit', 'Automobilbranche', 'Unterhaltung', 'Sport', 'Werbung', 'Technologie', 'Innovation', 'Militär', 'Quantencomputer', 'Swarm', 'Architecture', 'Forschnung', 'Drone', 'Autonomous', 'Modernisierung', 'Prototype', 'efficiency', 'Notebook', 'Angriff', 'Smartphone', 'Corona', 'Hacking', 'Kunden', 'Robot', 'Künstliche Intelligenz', 'smart', 'algorithmus', 'sensor', 'energy', 'digitalen', 'attack']
# topic_list = ['Werbung', 'Technologie', 'Innovation', 'Militär', 'Quantencomputer', 'Swarm', 'Architecture', 'Forschnung', 'Drone', 'Autonomous', 'Modernisierung', 'Prototype', 'efficiency', 'Notebook', 'Angriff', 'Smartphone', 'Corona', 'Hacking', 'Kunden', 'Robot', 'Künstliche Intelligenz', 'smart', 'algorithmus', 'sensor', 'energy', 'digitalen', 'attack']

topic_embeddings_dict = dict()
for topic in topic_list:
    topic_embeddings_dict[topic] = tf_model(topic)['outputs'].numpy()[0].reshape(1, -1)

In [325]:
topic_col_list = []

for topic in topic_list:
    topic_col_name = topic.lower().replace(' ', '_') + '_sim'
    topic_col_list.append(topic_col_name)
    
    train_df[topic_col_name] = train_df.apply(lambda x:get_use_topic_score(x['doc_vec'], topic), axis=1)
    test_df[topic_col_name] = test_df.apply(lambda x:get_use_topic_score(x['doc_vec'], topic), axis=1)

In [326]:
X_train_topic_use = train_df[topic_col_list].values
X_test_topic_use = test_df[topic_col_list].values

In [249]:
get_performance_metrics(X_train_topic_use, y_train, X_test_topic_use, y_test)

              precision    recall  f1-score   support

           0       0.81      0.60      0.69       113
           1       0.10      0.12      0.11        16
           2       0.39      0.61      0.47        23
           3       0.30      0.39      0.34        33

    accuracy                           0.52       185
   macro avg       0.40      0.43      0.40       185
weighted avg       0.60      0.52      0.55       185


Training CV f1 score: 0.348627069009422
Test F1-score: 0.29134218964727443


### 3. Noun-chunks, Verb and Adjective

In [236]:
nlp_de = spacy.load("de_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

In [13]:
def get_document_data(text, lang):
    
    doc = None
    
    if lang == 'en':
        doc = nlp_en(text)
    elif lang == 'de':
        doc = nlp_de(text)
    elif lang == 'ko':
        return None
    
    noun_phrases_list = []
    verbs_list = []
    adjs_list = []
    
    for nc in doc.noun_chunks:
        noun_phrases_list.append(nc.lemma_) 
        
    for token in doc:
        if token.pos_ == "ADJ":
            adjs_list.append(token.lemma_)
        elif token.pos_ == "VERB":
            verbs_list.append(token.lemma_)
            
    return (noun_phrases_list, verbs_list, adjs_list)

def get_avg_token_vector(token_list):
    
    avg_token_vec = []
    for token in token_list:
        avg_token_vec.append(tf_model(token)['outputs'].numpy()[0].reshape(1, -1))
        
    return np.mean(avg_token_vec, axis=0)

def get_mean_vector(vec_list):
    return np.mean(vec_list, axis=0)

In [244]:
unlabeled_df.lang.value_counts()

en    1284
de     525
Name: lang, dtype: int64

In [241]:
unlabeled_df['text_tokens'] = unlabeled_df.apply(lambda x:get_document_data(x['text'], x['lang']), axis=1)

In [243]:
unlabeled_df = unlabeled_df[unlabeled_df['lang'].isin(['en', 'de'])]

In [245]:
unlabeled_df.to_pickle(os.getcwd() + '/../dataframes/unlabeled_df_features.pkl')

In [15]:
# train_df['text_tokens'] = train_df.apply(lambda x:get_document_data(x['text'], x['lang']), axis=1)
# test_df['text_tokens'] = test_df.apply(lambda x:get_document_data(x['text'], x['lang']), axis=1)

In [16]:
# train_df['nc_vec'] = train_df.apply(lambda x:get_avg_token_vector(x['text_tokens'][0]), axis=1)
# train_df['verb_vec'] = train_df.apply(lambda x:get_avg_token_vector(x['text_tokens'][1]), axis=1)
# train_df['adj_vec'] = train_df.apply(lambda x:get_avg_token_vector(x['text_tokens'][2]), axis=1)

# test_df['nc_vec'] = test_df.apply(lambda x:get_avg_token_vector(x['text_tokens'][0]), axis=1)
# test_df['verb_vec'] = test_df.apply(lambda x:get_avg_token_vector(x['text_tokens'][1]), axis=1)
# test_df['adj_vec'] = test_df.apply(lambda x:get_avg_token_vector(x['text_tokens'][2]), axis=1)

In [61]:
# train_df.to_pickle(os.getcwd() + '/../dataframes/train_df_features.pkl')
# test_df.to_pickle(os.getcwd() + '/../dataframes/test_df_features.pkl')

In [43]:
X_train_nc = get_modified_vectors(train_df.nc_vec.values)
X_test_nc = get_modified_vectors(test_df.nc_vec.values)

X_train_verb = get_modified_vectors(train_df.verb_vec.values)
X_test_verb = get_modified_vectors(test_df.verb_vec.values)

X_train_adj = get_modified_vectors(train_df.adj_vec.values)
X_test_adj = get_modified_vectors(test_df.adj_vec.values)

In [7]:
def get_features_concatenate(vecs_1, vecs_2):
    return np.concatenate((vecs_1,vecs_2), axis=1)

def get_features_mean(vecs_1, vecs_2):
    
    return np.mean( (vecs_1,vecs_2), axis=0 )

In [35]:
X_train_nc_ad = get_features_mean(X_train_nc, X_train_adj)
X_test_nc_ad = get_features_mean(X_test_nc, X_test_adj)

In [36]:
get_performance_metrics(X_train_nc_ad, y_train, X_test_nc_ad, y_test)

              precision    recall  f1-score   support

           0       0.82      0.63      0.71       113
           1       0.30      0.19      0.23        16
           2       0.45      0.83      0.58        23
           3       0.35      0.48      0.41        33

    accuracy                           0.59       185
   macro avg       0.48      0.53      0.48       185
weighted avg       0.64      0.59      0.60       185


Training CV f1 score: 0.42320817864428406
Test F1-score: 0.40769230769230763


In [11]:
get_performance_metrics(X_train_nc, y_train, X_test_nc, y_test)

              precision    recall  f1-score   support

           0       0.80      0.67      0.73       113
           1       0.38      0.19      0.25        16
           2       0.49      0.83      0.61        23
           3       0.35      0.45      0.39        33

    accuracy                           0.61       185
   macro avg       0.50      0.54      0.50       185
weighted avg       0.64      0.61      0.61       185



Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('svc',
                 SVC(class_weight='balanced', gamma='auto', probability=True,
                     random_state=122))])

In [28]:
get_performance_metrics(X_train_verb, y_train, X_test_verb, y_test)

              precision    recall  f1-score   support

           0       0.81      0.76      0.79       113
           1       0.40      0.25      0.31        16
           2       0.45      0.65      0.54        23
           3       0.25      0.27      0.26        33

    accuracy                           0.62       185
   macro avg       0.48      0.48      0.47       185
weighted avg       0.63      0.62      0.62       185


Training CV f1 score: 0.2532680007508382
Test F1-score: 0.4217032967032967


In [29]:
get_performance_metrics(X_train_adj, y_train, X_test_adj, y_test)

              precision    recall  f1-score   support

           0       0.75      0.68      0.71       113
           1       0.36      0.31      0.33        16
           2       0.48      0.61      0.54        23
           3       0.36      0.42      0.39        33

    accuracy                           0.59       185
   macro avg       0.49      0.51      0.49       185
weighted avg       0.61      0.59      0.60       185


Training CV f1 score: 0.26448179271708677
Test F1-score: 0.4358974358974359


### 4. Combined models

In [31]:
def create_stage_one_models(X_1, X_2, y, model_1, model_2):
    
    model_1 = make_pipeline(StandardScaler(with_mean=False), model_1)
    model_1.fit(X_1, y)
    
    model_2 = make_pipeline(StandardScaler(with_mean=False), model_2)
    model_2.fit(X_2, y)  
    
    return model_1, model_2

def create_stage_two_model(X, y, model_3):
    
    model_3 = make_pipeline(StandardScaler(with_mean=False), model_3)
    model_3.fit(X, y) 
    
    return model_3

def get_transformed_features(model_1, model_2, X_1, X_2, y, y_flag=False):
    
    preds_1 = model_1.predict_proba(X_1)
    preds_2 = model_2.predict_proba(X_2)
    
    combined_features = get_features_concatenate(preds_1, preds_2)
    if not y_flag:
        return combined_features
    
    return combined_features, y

def get_finalmodel_pipeline(models, X_train_1, X_train_2, y_train, y_flag=False):
    
    model_1, model_2 = create_stage_one_models(X_train_1, X_train_2, y_train, models[0], models[1])
    X, y = get_transformed_features(model_1, model_2, X_train_1, X_train_2, y_train, y_flag=True)
    
    model_3 = create_stage_two_model(X, y, models[2])
    
    return [model_1, model_2, model_3]

def get_test_f1score_combined(models, X_test_1, X_test_2, y_test):
    
    X_test = get_transformed_features(models[0], models[1], X_test_1, X_test_2, y_test, y_flag=False)
    preds = models[2].predict(X_test)
    
    f1_score = get_multi_class_metrics(y_test, preds, pr_flag=True)
    
    return f1_score

In [186]:
def perform_cross_validation_combined(X_1, X_2, y, fold_cnt=5):
    
    skf = StratifiedKFold(n_splits=fold_cnt, shuffle=True, random_state=123)
    f1_scores_list = []
    
    for train_idx, test_idx in skf.split(X_1, y):
        
        X_train_1, X_train_2, X_test_1, X_test_2 = X_1[train_idx], X_2[train_idx], X_1[test_idx], X_2[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        models = get_three_class_models()
        models = get_finalmodel_pipeline(models, X_train_1, X_train_2, y_train)
        
        X_test = get_transformed_features(models[0], models[1], X_test_1, X_test_2, y_test, y_flag=False)
        preds = models[2].predict(X_test)

        f1_score = get_multi_class_metrics(y_test, preds, pr_flag=False)
        f1_scores_list.append(f1_score)
    
    return sum(f1_scores_list)/fold_cnt

def get_trained_model_combined(X_1, X_2, y):
    
    skf_f1score = perform_cross_validation_combined(X_1, X_2, y, fold_cnt=5)
    
    models = get_three_class_models()
    models = get_finalmodel_pipeline(models, X_1, X_2, y)

    return models, skf_f1score
    

def get_performance_metrics_combined(X_train_1, X_train_2, y_train, X_test_1, X_test_2, y_test):
    
    models, cv_score = get_trained_model_combined(X_train_1, X_train_2, y_train)
    test_f1_score = get_test_f1score_combined(models, X_test_1, X_test_2, y_test)

    print()
    print(f'Training CV f1 score: {cv_score}')
    print(f'Test F1-score: {test_f1_score}')

In [207]:
get_performance_metrics_combined(X_train_nc, X_train_verb, y_train, X_test_nc, X_test_verb, y_test)

              precision    recall  f1-score   support

           0       0.69      0.87      0.77       113
           1       1.00      0.12      0.22        16
           2       0.44      0.17      0.25        23
           3       0.38      0.36      0.37        33

    accuracy                           0.63       185
   macro avg       0.63      0.38      0.40       185
weighted avg       0.63      0.63      0.59       185


Training CV f1 score: 0.24217057796005168
Test F1-score: 0.2361111111111111


In [210]:
get_performance_metrics_combined(X_train_nc, X_train_adj, y_train, X_test_nc, X_test_adj, y_test)

              precision    recall  f1-score   support

           0       0.67      0.88      0.76       113
           1       1.00      0.06      0.12        16
           2       0.60      0.26      0.36        23
           3       0.33      0.24      0.28        33

    accuracy                           0.62       185
   macro avg       0.65      0.36      0.38       185
weighted avg       0.63      0.62      0.57       185


Training CV f1 score: 0.31169002050580996
Test F1-score: 0.24064171122994654


In [209]:
get_performance_metrics_combined(X_train_nc, X_train_use, y_train, X_test_nc, X_test_use, y_test)

              precision    recall  f1-score   support

           0       0.69      0.89      0.78       113
           1       1.00      0.12      0.22        16
           2       0.50      0.30      0.38        23
           3       0.39      0.27      0.32        33

    accuracy                           0.64       185
   macro avg       0.65      0.40      0.43       185
weighted avg       0.64      0.64      0.60       185


Training CV f1 score: 0.3365809354044648
Test F1-score: 0.3003003003003003
