In [1]:
import numpy as np, pandas as pd
import ast 
from sklearn import linear_model
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import warnings
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
# en_nlp = spacy.load('en')
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV



In [2]:
data = pd.read_csv("train_detect_sent.csv").reset_index(drop=True)

In [3]:
data.shape

(1000, 12)

In [4]:
data.head(3)

Unnamed: 0,answer_start,context,question,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,31,Acrokeratosis paraneoplastica (Bazex syndrome)...,Name synonym of Acrokeratosis paraneoplastica.,Bazex syndrome,['Acrokeratosis paraneoplastica (Bazex syndrom...,4,"[array([-0.00387298, 0.14592458, 0.10885772,...",[[-0.0094591 0.11474403 0.00712491 ... -0.0...,"[1.0273456014692783, 1.0879644602537155, 1.045...","[20.125471, 23.322601, 21.326996, 24.86485, 22...",0,0
1,167,Acrokeratosis paraneoplastica (Bazex syndrome)...,Name synonym of Acrokeratosis paraneoplastica.,Bazex syndrome,['Acrokeratosis paraneoplastica (Bazex syndrom...,4,"[array([-0.00387298, 0.14592458, 0.10885772,...",[[-0.0094591 0.11474403 0.00712491 ... -0.0...,"[1.0273456014692783, 1.0879644602537155, 1.045...","[20.125471, 23.322601, 21.326996, 24.86485, 22...",0,0
2,0,Bazex syndrome (acrokeratosis paraneoplastica)...,Name synonym of Acrokeratosis paraneoplastica.,Bazex syndrome,['Bazex syndrome (acrokeratosis paraneoplastic...,0,"[array([-0.00387298, 0.14095132, 0.0979467 ,...",[[-0.0094591 0.11474403 0.00712491 ... -0.0...,"[0.8510962277650833, 0.8911607936024666, 1.061...","[11.899525, 17.746927, 18.648937, 18.363045]",0,0


In [5]:
ast.literal_eval(data["sentences"][0])

['Acrokeratosis paraneoplastica (Bazex syndrome): report of a case associated with small cell lung carcinoma and review of the literature.',
 'Acrokeratosis paraneoplastic (Bazex syndrome) is a rare, but distinctive paraneoplastic dermatosis characterized by erythematosquamous lesions located at the acral sites and is most commonly associated with carcinomas of the upper aerodigestive tract.',
 'We report a 58-year-old female with a history of a pigmented rash on her extremities, thick keratotic plaques on her hands, and brittle nails.',
 'Chest imaging revealed a right upper lobe mass that was proven to be small cell lung carcinoma.',
 'While Bazex syndrome has been described in the dermatology literature, it is also important for the radiologist to be aware of this entity and its common presentations.']

In [6]:
data = data[data["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11].reset_index(drop=True)

In [7]:
def create_features(data):
    train = pd.DataFrame()
     
    for k in range(len(data["euclidean_dis"])):
        dis = ast.literal_eval(data["euclidean_dis"][k])
        for i in range(len(dis)):
            train.loc[k, "column_euc_"+"%s"%i] = dis[i]
    
    print("Finished")
    
    for k in range(len(data["cosine_sim"])):
        dis = ast.literal_eval(data["cosine_sim"][k].replace("nan","1"))
        for i in range(len(dis)):
            train.loc[k, "column_cos_"+"%s"%i] = dis[i]
            
    train["target"] = data["target"]
    return train

In [8]:
train = create_features(data)

Finished


In [9]:
del data

In [10]:
train.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9,...,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,20.125471,23.322601,21.326996,24.86485,22.200169,,,,,,...,1.087964,1.045396,1.159387,1.041414,,,,,,4
1,20.125471,23.322601,21.326996,24.86485,22.200169,,,,,,...,1.087964,1.045396,1.159387,1.041414,,,,,,4
2,11.899525,17.746927,18.648937,18.363045,,,,,,,...,0.891161,1.061656,0.972775,,,,,,,0


In [11]:
# train.fillna(10000, inplace=True)

In [12]:
train.head(3).transpose()

Unnamed: 0,0,1,2
column_euc_0,20.125471,20.125471,11.899525
column_euc_1,23.322601,23.322601,17.746927
column_euc_2,21.326996,21.326996,18.648937
column_euc_3,24.86485,24.86485,18.363045
column_euc_4,22.200169,22.200169,
column_euc_5,,,
column_euc_6,,,
column_euc_7,,,
column_euc_8,,,
column_euc_9,,,


### Fitting Multinomial Logistic Regression

### Standardize

In [13]:
train.apply(max, axis = 0)

column_euc_0    54.761063
column_euc_1    43.865790
column_euc_2    55.048767
column_euc_3    54.341896
column_euc_4    61.737247
column_euc_5          NaN
column_euc_6          NaN
column_euc_7          NaN
column_euc_8          NaN
column_euc_9          NaN
column_cos_0     1.027346
column_cos_1     1.123572
column_cos_2     1.094378
column_cos_3     1.159387
column_cos_4     1.124267
column_cos_5          NaN
column_cos_6          NaN
column_cos_7          NaN
column_cos_8          NaN
column_cos_9          NaN
target           9.000000
dtype: float64

In [14]:
subset1 = train.iloc[:,:10].fillna(60)
subset2 = train.iloc[:,10:].fillna(1)

In [15]:
 subset1.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9
0,20.125471,23.322601,21.326996,24.86485,22.200169,60.0,60.0,60.0,60.0,60.0
1,20.125471,23.322601,21.326996,24.86485,22.200169,60.0,60.0,60.0,60.0,60.0
2,11.899525,17.746927,18.648937,18.363045,60.0,60.0,60.0,60.0,60.0,60.0


In [16]:
 subset2.head(3)

Unnamed: 0,column_cos_0,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,1.027346,1.087964,1.045396,1.159387,1.041414,1.0,1.0,1.0,1.0,1.0,4
1,1.027346,1.087964,1.045396,1.159387,1.041414,1.0,1.0,1.0,1.0,1.0,4
2,0.851096,0.891161,1.061656,0.972775,1.0,1.0,1.0,1.0,1.0,1.0,0


In [17]:
train2 = pd.concat([subset1, subset2],axis=1, join_axes=[subset1.index])

In [18]:
train2.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9,...,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,20.125471,23.322601,21.326996,24.86485,22.200169,60.0,60.0,60.0,60.0,60.0,...,1.087964,1.045396,1.159387,1.041414,1.0,1.0,1.0,1.0,1.0,4
1,20.125471,23.322601,21.326996,24.86485,22.200169,60.0,60.0,60.0,60.0,60.0,...,1.087964,1.045396,1.159387,1.041414,1.0,1.0,1.0,1.0,1.0,4
2,11.899525,17.746927,18.648937,18.363045,60.0,60.0,60.0,60.0,60.0,60.0,...,0.891161,1.061656,0.972775,1.0,1.0,1.0,1.0,1.0,1.0,0


In [19]:
train2.apply(max, axis = 0)

column_euc_0    54.761063
column_euc_1    43.865790
column_euc_2    60.000000
column_euc_3    60.000000
column_euc_4    61.737247
column_euc_5    60.000000
column_euc_6    60.000000
column_euc_7    60.000000
column_euc_8    60.000000
column_euc_9    60.000000
column_cos_0     1.027346
column_cos_1     1.123572
column_cos_2     1.094378
column_cos_3     1.159387
column_cos_4     1.124267
column_cos_5     1.154278
column_cos_6     1.088804
column_cos_7     1.085611
column_cos_8     1.031210
column_cos_9     1.032421
target           9.000000
dtype: float64

In [20]:
scaler = MinMaxScaler()
X = scaler.fit_transform(train2.iloc[:,:-1])

In [21]:
X

array([[0.33880116, 0.50316327, 0.31492473, ..., 0.89940395, 0.96050316,
        0.95393715],
       [0.33880116, 0.50316327, 0.31492473, ..., 0.89940395, 0.96050316,
        0.95393715],
       [0.18176657, 0.36831568, 0.26748409, ..., 0.89940395, 0.96050316,
        0.95393715],
       ...,
       [0.13146173, 0.82738934, 0.26753011, ..., 0.89940395, 0.96050316,
        0.95393715],
       [0.13146173, 0.82738934, 0.26753011, ..., 0.89940395, 0.96050316,
        0.95393715],
       [0.13146173, 0.82738934, 0.26753011, ..., 0.89940395, 0.96050316,
        0.95393715]])

In [22]:
train_x, test_x, train_y, test_y = train_test_split(X,
train.iloc[:,-1], train_size=0.8, random_state = 5)

In [23]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x)))


Multinomial Logistic regression Train Accuracy :  0.43722943722943725
Multinomial Logistic regression Test Accuracy :  0.4051724137931034


### Logistic-Regression with Root Match feature

In [24]:
predicted = pd.read_csv("train_detect_sent.csv").reset_index(drop=True)

In [25]:
predicted = predicted[predicted["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11].reset_index(drop=True)

In [26]:
predicted.shape

(578, 12)

In [27]:
def get_columns_from_root(train):
    
    for i in range(train.shape[0]):
        if len(ast.literal_eval(train["root_match_idx"][i])) == 0: pass
        
        else:
            for item in ast.literal_eval(train["root_match_idx"][i]):
                train.loc[i, "column_root_"+"%s"%item] = 1
    return train

In [28]:
predicted = get_columns_from_root(predicted)

KeyError: 'root_match_idx'

In [29]:
predicted.head(3).transpose()

Unnamed: 0,0,1,2
answer_start,31,167,0
context,Acrokeratosis paraneoplastica (Bazex syndrome)...,Acrokeratosis paraneoplastica (Bazex syndrome)...,Bazex syndrome (acrokeratosis paraneoplastica)...
question,Name synonym of Acrokeratosis paraneoplastica.,Name synonym of Acrokeratosis paraneoplastica.,Name synonym of Acrokeratosis paraneoplastica.
text,Bazex syndrome,Bazex syndrome,Bazex syndrome
sentences,['Acrokeratosis paraneoplastica (Bazex syndrom...,['Acrokeratosis paraneoplastica (Bazex syndrom...,['Bazex syndrome (acrokeratosis paraneoplastic...
target,4,4,0
sent_emb,"[array([-0.00387298, 0.14592458, 0.10885772,...","[array([-0.00387298, 0.14592458, 0.10885772,...","[array([-0.00387298, 0.14095132, 0.0979467 ,..."
quest_emb,[[-0.0094591 0.11474403 0.00712491 ... -0.0...,[[-0.0094591 0.11474403 0.00712491 ... -0.0...,[[-0.0094591 0.11474403 0.00712491 ... -0.0...
cosine_sim,"[1.0273456014692783, 1.0879644602537155, 1.045...","[1.0273456014692783, 1.0879644602537155, 1.045...","[0.8510962277650833, 0.8911607936024666, 1.061..."
euclidean_dis,"[20.125471, 23.322601, 21.326996, 24.86485, 22...","[20.125471, 23.322601, 21.326996, 24.86485, 22...","[11.899525, 17.746927, 18.648937, 18.363045]"


In [30]:
subset3 = predicted[["column_root_0","column_root_1","column_root_2","column_root_3","column_root_4","column_root_5",\
             "column_root_6","column_root_7","column_root_8","column_root_9"]]

KeyError: "['column_root_0' 'column_root_1' 'column_root_2' 'column_root_3'\n 'column_root_4' 'column_root_5' 'column_root_6' 'column_root_7'\n 'column_root_8' 'column_root_9'] not in index"

In [31]:
subset3.fillna(0, inplace=True)

In [32]:
train3 = pd.concat([subset3, train2],axis=1, join_axes=[subset3.index])

In [33]:
train3.head(3).transpose()

Unnamed: 0,0,1,2
column_root_0,0.0,0.0,0.0
column_root_1,0.0,0.0,1.0
column_root_2,0.0,0.0,1.0
column_root_3,0.0,0.0,1.0
column_root_4,0.0,0.0,1.0
column_root_5,1.0,0.0,1.0
column_root_6,0.0,0.0,1.0
column_root_7,0.0,0.0,0.0
column_root_8,0.0,0.0,0.0
column_root_9,0.0,0.0,0.0


In [34]:
train3 = train3[["column_root_0","column_root_1","column_root_2","column_root_3","column_root_4","column_root_5",\
             "column_root_6","column_root_7","column_root_8","column_root_9", "column_cos_0","column_cos_1",\
           "column_cos_2","column_cos_3","column_cos_4","column_cos_5",\
             "column_cos_6","column_cos_7","column_cos_8","column_cos_9", "target"]]

In [35]:
train_x, test_x, train_y, test_y = train_test_split(train3.iloc[:,:-1],
train3.iloc[:,-1], train_size=0.8, random_state = 5)

In [36]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x)))


Multinomial Logistic regression Train Accuracy :  0.6421910566120861
Multinomial Logistic regression Test Accuracy :  0.6449718045112782


### Random Forest

In [33]:
rf = RandomForestClassifier(min_samples_leaf=8, n_estimators=60)
rf.fit(train_x, train_y)

print("Train Accuracy : ", metrics.accuracy_score(train_y, rf.predict(train_x)))
print("Test Accuracy : ", metrics.accuracy_score(test_y, rf.predict(test_x)))

Train Accuracy :  0.7380952380952381
Test Accuracy :  0.5517241379310345


### XgBoost

In [32]:
model = xgb.XGBClassifier()
param_dist = {"max_depth": [3,5,10],
              "min_child_weight" : [1,5,10],
              "learning_rate": [0.07, 0.1,0.2],
               }

# run randomized search
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
                                   verbose=5, n_jobs=-1)
grid_search.fit(train_x, train_y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] learning_rate=0.07, min_child_weight=1, max_depth=3 .............
[CV] learning_rate=0.07, min_child_weight=1, max_depth=3 .............
[CV] learning_rate=0.07, min_child_weight=1, max_depth=3 .............
[CV] learning_rate=0.07, min_child_weight=5, max_depth=3 .............
[CV] learning_rate=0.07, min_child_weight=5, max_depth=3 .............
[CV] learning_rate=0.07, min_child_weight=5, max_depth=3 .............
[CV] learning_rate=0.07, min_child_weight=10, max_depth=3 ............
[CV] learning_rate=0.07, min_child_weight=10, max_depth=3 ............
[CV] learning_rate=0.07, min_child_weight=10, max_depth=3 ............
[CV] learning_rate=0.07, min_child_weight=1, max_depth=5 .............
[CV] learning_rate=0.07, min_child_weight=1, max_depth=5 .............
[CV] learning_rate=0.07, min_child_weight=1, max_depth=5 .............
[CV]  learning_rate=0.07, min_child_weight=1, max_depth=3, score=0.5961538461538461, to

[CV] learning_rate=0.2, min_child_weight=1, max_depth=3 ..............
[CV]  learning_rate=0.1, min_child_weight=5, max_depth=10, score=0.5286624203821656, total=   0.7s
[CV] learning_rate=0.2, min_child_weight=1, max_depth=3 ..............
[CV]  learning_rate=0.1, min_child_weight=10, max_depth=10, score=0.5095541401273885, total=   0.5s
[CV] learning_rate=0.2, min_child_weight=5, max_depth=3 ..............
[CV]  learning_rate=0.1, min_child_weight=1, max_depth=10, score=0.5605095541401274, total=   0.9s
[CV] learning_rate=0.2, min_child_weight=5, max_depth=3 ..............
[CV]  learning_rate=0.1, min_child_weight=10, max_depth=10, score=0.5641025641025641, total=   0.5s
[CV] learning_rate=0.2, min_child_weight=5, max_depth=3 ..............


[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   53.2s


[CV]  learning_rate=0.2, min_child_weight=5, max_depth=3, score=0.5897435897435898, total=   0.3s
[CV] learning_rate=0.2, min_child_weight=10, max_depth=3 .............
[CV] learning_rate=0.2, min_child_weight=1, max_depth=3 ..............
[CV]  learning_rate=0.2, min_child_weight=1, max_depth=3, score=0.5961538461538461, total=   0.4s
[CV] learning_rate=0.2, min_child_weight=10, max_depth=3 .............
[CV]  learning_rate=0.2, min_child_weight=5, max_depth=3, score=0.5159235668789809, total=   0.4s
[CV] learning_rate=0.2, min_child_weight=10, max_depth=3 .............
[CV]  learning_rate=0.2, min_child_weight=1, max_depth=3, score=0.5906040268456376, total=   0.4s
[CV] learning_rate=0.2, min_child_weight=1, max_depth=5 ..............
[CV]  learning_rate=0.2, min_child_weight=5, max_depth=3, score=0.5503355704697986, total=   0.4s
[CV] learning_rate=0.2, min_child_weight=1, max_depth=5 ..............
[CV]  learning_rate=0.1, min_child_weight=1, max_depth=10, score=0.5705128205128205,

[Parallel(n_jobs=-1)]: Done  75 out of  81 | elapsed:   55.3s remaining:    4.4s


KeyboardInterrupt: 

In [None]:
grid_search.best_estimator_

In [74]:
xg = xgb.XGBClassifier(max_depth=5)
xg.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, xg.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, xg.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.7062926793450327
Multinomial Logistic regression Test Accuracy :  0.685561560150376


In [69]:
xgb.XGBClassifier??