In [1]:
import itertools
import numpy as np
import pandas as pd
import nltk

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

## Modelling

In [2]:
# Load the train and test DataFrames from pickle files in the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# from a trusted source — confirm the provenance of these two files.
train = pd.read_pickle('train_df.pkl')
test = pd.read_pickle('test_df.pkl')

In [3]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, pos, tag) tuples.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per token. The columns ``Sent_ID``, ``Word``, ``pos_tags`` and
        ``tag`` are read.

    Attributes
    ----------
    n_sent : int
        Counter used by ``get_next``; starts at 1.
    data : pandas.DataFrame
        The frame passed in (not copied).
    empty : bool
        Always initialised to ``False``; not otherwise used by this class.
    grouped
        Result of ``data.groupby('Sent_ID').apply(...)``: one list of
        ``(word, pos, tag)`` tuples per ``Sent_ID``. pandas orders the groups
        by sorted ``Sent_ID`` (default ``sort=True``), not by row order.
    sentences : list
        The grouped sentences as a plain Python list, in ``grouped`` order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Zip the three parallel columns of one sentence into token tuples.
            return list(zip(s['Word'].values.tolist(),
                            s['pos_tags'].values.tolist(),
                            s['tag'].values.tolist()))

        self.grouped = self.data.groupby('Sent_ID').apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence keyed ``'Sentence: <n_sent>'`` and advance the counter.

        Returns ``None`` (without advancing) when no group has that key. Only
        lookup failures are treated as "no such sentence"; any other exception
        (for example ``KeyboardInterrupt``) now propagates instead of being
        silently swallowed.
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except (KeyError, IndexError):
            return None
        self.n_sent += 1
        return s

In [4]:
# Group the training tokens by Sent_ID; each entry of `sentences` is one
# sentence as a list of (word, pos_tag, tag) tuples.
getter = SentenceGetter(train)
sentences = getter.sentences

In [5]:
len(sentences)

191282

In [6]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first item is the word and whose
    second item is its POS tag. The returned dict holds features of the token
    itself, a smaller set for the previous and the next token, and the flags
    ``BOS`` / ``EOS`` when there is no previous / next token.
    """
    token = sent[i][0]
    pos = sent[i][1]

    # Features of the current token.
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        prev_pos = sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        next_pos = sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats

In [7]:
def sent2features(sent):
    """Return one ``word2features`` dict per token position of ``sent``."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the tag (third item) of every (token, postag, label) triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the word (first item) of every (token, postag, label) triple."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

### Build feature and label sequences (no train/validation split)

In [8]:
# Per-sentence feature dicts (X) and the matching tag sequences (y).
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))
# The hold-out split below is disabled, so the models are fit on all of X / y.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

### Train a CRF model

In [9]:
# Fit a CRF on every training sentence.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.003,
    c2=0.11,
    max_iterations=100,
    all_possible_transitions=True,
)
final_crf = sklearn_crfsuite.CRF(**crf_params)
final_crf.fit(X, y)

# SentenceGetter reads a 'tag' column, so the unlabeled test frame gets an
# empty-string placeholder before it is grouped into sentences.
test['tag'] = ''
getter = SentenceGetter(test)
test_sentences = getter.sentences
test_X = list(map(sent2features, test_sentences))

# Predict per sentence, flatten to one tag per token and write the submission.
preds = final_crf.predict(test_X)
preds_list = [tag for sentence_tags in preds for tag in sentence_tags]
submission_df = test[['id', 'Sent_ID']].assign(tag=preds_list)
submission_df.to_csv('submission_2.csv', index=False)

NameError: name 'crf' is not defined

In [10]:
# Predict with the current final_crf, flatten the per-sentence tags to one tag
# per token, and write them next to the test ids.
preds = final_crf.predict(test_X)
preds_list = [tag for sentence_tags in preds for tag in sentence_tags]
submission_df = test[['id', 'Sent_ID']].assign(tag=preds_list)
submission_df.to_csv('submission_2.csv', index=False)

In [11]:
# Refit with c1 = c2 = 0.1 and write a new submission.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
final_crf = sklearn_crfsuite.CRF(**crf_params)
final_crf.fit(X, y)

# Flatten the per-sentence predictions to one tag per token.
preds = final_crf.predict(test_X)
preds_list = [tag for sentence_tags in preds for tag in sentence_tags]
submission_df = test[['id', 'Sent_ID']].assign(tag=preds_list)
submission_df.to_csv('submission_3.csv', index=False)

In [13]:
# Refit with c1=0.003, c2=0.11 and a doubled iteration budget (200).
crf_params = dict(
    algorithm='lbfgs',
    c1=0.003,
    c2=0.11,
    max_iterations=200,
    all_possible_transitions=True,
)
final_crf = sklearn_crfsuite.CRF(**crf_params)
final_crf.fit(X, y)

# Flatten the per-sentence predictions to one tag per token.
preds = final_crf.predict(test_X)
preds_list = [tag for sentence_tags in preds for tag in sentence_tags]
submission_df = test[['id', 'Sent_ID']].assign(tag=preds_list)
submission_df.to_csv('submission_4.csv', index=False)

In [14]:
# NOTE: eli5 is imported in this cell rather than with the imports at the top.
import eli5
# Display the weights of the most recently fitted final_crf, limited via top=10.
eli5.show_weights(final_crf, top=10)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


From \ To,O,B-indications,I-indications
O,2.29,-0.006,-12.21
B-indications,-2.806,-1.641,1.472
I-indications,-1.472,-2.458,1.046

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+7.440,word.lower():healthy,
+6.505,word.lower():stroma,
+6.436,word.lower():analysis,
+6.337,word.lower():pathogenesis,
+5.939,word.lower():apoptosis,
… 118403 more positive …,… 118403 more positive …,
… 12858 more negative …,… 12858 more negative …,
-6.282,word.lower():ulcers,
-6.287,word.lower():bladder,
-6.367,word.lower():ascites,

Weight?,Feature
+7.440,word.lower():healthy
+6.505,word.lower():stroma
+6.436,word.lower():analysis
+6.337,word.lower():pathogenesis
+5.939,word.lower():apoptosis
… 118403 more positive …,… 118403 more positive …
… 12858 more negative …,… 12858 more negative …
-6.282,word.lower():ulcers
-6.287,word.lower():bladder
-6.367,word.lower():ascites

Weight?,Feature
+10.771,word.lower():evaluable
+10.288,word.lower():degenerate
+10.107,word.lower():lysed
+10.065,word.lower():preconditioning
+10.016,word.lower():poland
+9.881,word.lower():otitis-prone
+9.589,word.lower():self-efficacy
+9.413,word.lower():lytic
+9.217,word.lower():excitable
+9.199,word.lower():premalignant

Weight?,Feature
+6.455,word.lower():haemorrhage
+5.919,word.lower():erythematosus
+5.822,-1:word.lower():learning
+5.586,word.lower():burnout
+5.212,word.lower():repair-deficient
+4.940,word.lower():abscess
+4.834,word.lower():ischemia/reperfusion
+4.731,word.lower():effusions
+4.703,word.lower():neovascularization
+4.659,word.lower():hypertrophic


In [17]:
# Same model as above, asking for up to 50 features and only the 'targets'
# section (the transition table is not part of the recorded output below).
eli5.show_weights(final_crf, top=50, 
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
+7.440,word.lower():healthy
+6.505,word.lower():stroma
+6.436,word.lower():analysis
+6.337,word.lower():pathogenesis
+5.939,word.lower():apoptosis
+5.762,word.lower():metabolism
+5.715,word.lower():diagnosis
+5.601,word.lower():prognosis
+5.373,BOS
+5.360,word.lower():synthesis

Weight?,Feature
+10.771,word.lower():evaluable
+10.288,word.lower():degenerate
+10.107,word.lower():lysed
+10.065,word.lower():preconditioning
+10.016,word.lower():poland
+9.881,word.lower():otitis-prone
+9.589,word.lower():self-efficacy
+9.413,word.lower():lytic
+9.217,word.lower():excitable
+9.199,word.lower():premalignant

Weight?,Feature
+6.455,word.lower():haemorrhage
+5.919,word.lower():erythematosus
+5.822,-1:word.lower():learning
+5.586,word.lower():burnout
+5.212,word.lower():repair-deficient
+4.940,word.lower():abscess
+4.834,word.lower():ischemia/reperfusion
+4.731,word.lower():effusions
+4.703,word.lower():neovascularization
+4.659,word.lower():hypertrophic


In [18]:
# Refit with c1=0.002, c2=0.15 and write a new submission.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.002,
    c2=0.15,
    max_iterations=100,
    all_possible_transitions=True,
)
final_crf = sklearn_crfsuite.CRF(**crf_params)
final_crf.fit(X, y)

# Flatten the per-sentence predictions to one tag per token.
preds = final_crf.predict(test_X)
preds_list = [tag for sentence_tags in preds for tag in sentence_tags]
submission_df = test[['id', 'Sent_ID']].assign(tag=preds_list)
submission_df.to_csv('submission_5.csv', index=False)

In [20]:
# Display the weights of the refitted final_crf (c1=0.002, c2=0.15), with top=100.
eli5.show_weights(final_crf, top=100)

From \ To,O,B-indications,I-indications
O,2.301,0.017,-10.788
B-indications,-2.597,-1.833,1.353
I-indications,-1.48,-2.693,1.079

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+6.711,word.lower():healthy,
+6.510,word.lower():diagnosis,
+6.074,word.lower():analysis,
+5.615,word.lower():pathogenesis,
+5.458,word.lower():stroma,
+5.347,word.lower():apoptosis,
+5.090,word.lower():metabolism,
+5.006,word.lower():prognosis,
+4.882,word.lower():synthesis,
+4.759,BOS,

Weight?,Feature
+6.711,word.lower():healthy
+6.510,word.lower():diagnosis
+6.074,word.lower():analysis
+5.615,word.lower():pathogenesis
+5.458,word.lower():stroma
+5.347,word.lower():apoptosis
+5.090,word.lower():metabolism
+5.006,word.lower():prognosis
+4.882,word.lower():synthesis
+4.759,BOS

Weight?,Feature
+10.257,word.lower():evaluable
+9.777,word.lower():preconditioning
+9.459,word.lower():poland
+9.442,word.lower():lysed
+9.140,word.lower():degenerate
+9.101,word.lower():otitis-prone
+9.081,word.lower():lytic
+8.841,word.lower():self-efficacy
+8.731,word.lower():aspirated
+8.514,word.lower():premalignant

Weight?,Feature
+5.492,word.lower():haemorrhage
+5.277,-1:word.lower():learning
+5.112,word.lower():diagnosis
+5.053,word.lower():repair-deficient
+4.971,word.lower():burnout
+4.968,word.lower():erythematosus
+4.278,word.lower():abscess
+4.254,word.lower():effusions
+4.110,word.lower():hypertrophic
+4.039,-1:word.lower():methicillin-resistant


## Gensim Word2Vec

In [22]:
# Pool training and test sentences into one list. Test tuples carry the
# empty-string placeholder tag that was assigned before grouping.
all_sentences = sentences + test_sentences

In [23]:
len(all_sentences)

317122

In [27]:
from gensim.models.word2vec import Word2Vec

In [28]:
# NOTE: this call fails with the TypeError recorded below. Each sentence here is
# a list of (word, pos, tag) tuples, not a list of string tokens. The model is
# trained further down on `all_sentences_str`, which holds plain token lists.
model = Word2Vec(all_sentences)

TypeError: can only concatenate tuple (not "str") to tuple

In [29]:
all_sentences[0]

[('Obesity', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Low-', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('Middle-Income', 'JJ', 'O'),
 ('Countries', 'NNS', 'O'),
 (':', ':', 'O'),
 ('Burden', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Drivers', 'NNP', 'O'),
 (',', ',', 'O'),
 ('and', 'CC', 'O'),
 ('Emerging', 'NNP', 'O'),
 ('Challenges', 'NNP', 'O'),
 ('.', '.', 'O')]

In [31]:
from nltk.corpus import stopwords

In [32]:
# English stop words from NLTK, held in a set for fast membership tests.
stop = set(stopwords.words('english'))

In [37]:
import string

In [38]:
# Build the Word2Vec corpus: one list of lower-cased tokens per sentence, with
# English stop words and punctuation-only tokens removed.
#
# The punctuation test is done per character. The previous `w not in
# string.punctuation` was a substring test against one long string, so a token
# such as '()' was dropped while '...' or '--' was kept. Every token the old
# test removed is still removed here (including the empty string).
punctuation_chars = set(string.punctuation)

all_sentences_str = []

for sent in all_sentences:
    sent_list = []
    for tup in sent:
        w = tup[0].lower()
        if w in stop:
            continue
        if all(ch in punctuation_chars for ch in w):
            continue
        sent_list.append(w)

    all_sentences_str.append(sent_list)

In [34]:
len(all_sentences_str)

317122

In [35]:
len(all_sentences)

317122

In [39]:
all_sentences_str[0]

['obesity',
 'low-',
 'middle-income',
 'countries',
 'burden',
 'drivers',
 'emerging',
 'challenges']

In [40]:
# Train Word2Vec on the cleaned token lists; no hyperparameters are passed, so
# the library defaults apply.
# NOTE(review): no seed is set here, so the vectors (and the clusters derived
# from them below) may differ between runs — confirm whether that matters.
model = Word2Vec(all_sentences_str)

In [43]:
model.wv.vocab['obesity']

<gensim.models.keyedvectors.Vocab at 0x23b8d8198>

In [47]:
# Cluster the Word2Vec vocabulary into 4 groups with k-means.
#
# NOTE: `X` is rebound here to the word-vector matrix (one row per vocabulary
# word, in `model.wv.vocab` iteration order). It no longer holds the CRF
# feature lists; those are rebuilt further down before the next CRF fit.
# Vectors are read through `model.wv`; indexing the model object directly is
# deprecated in gensim.
X = model.wv[model.wv.vocab]
# NOTE: these imports rebind `metrics`, which the first cell imported from
# sklearn_crfsuite; from here on `metrics` refers to sklearn.metrics.
from sklearn import cluster
from sklearn import metrics
# A fixed random_state keeps the cluster ids stable across re-runs for a given X.
kmeans = cluster.KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)

labels = kmeans.labels_
centroids = kmeans.cluster_centers_

print("Cluster id labels for inputted data")
print(labels)
print("Centroids data")
print(centroids)

print("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print(kmeans.score(X))

# The name `silhouette_score` holds the resulting float, as before.
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')

print("Silhouette_score: ")
print(silhouette_score)

  """Entry point for launching an IPython kernel.


Cluster id labels for inputted data
[1 0 0 ... 0 0 0]
Centroids data
[[-6.09064326e-02  6.93934225e-03  8.18303227e-02  9.59293246e-02
  -4.17350903e-02  3.37703433e-03 -1.11806765e-01 -8.21769703e-03
  -6.58003464e-02  1.34621575e-01 -6.99720457e-02 -5.32562360e-02
   9.91727263e-02  6.21661097e-02  3.88370678e-02  7.88711980e-02
   2.39928663e-02  2.36820392e-02 -1.31008010e-02  5.02174161e-02
  -2.84048636e-02  1.78205937e-01  5.11017703e-02  7.26875737e-02
  -1.58423968e-02 -8.88578780e-03  3.50871533e-02 -7.03259325e-03
   1.18742481e-01 -1.08677400e-02  7.39399791e-02 -2.05799565e-02
  -5.52190468e-05 -5.81533276e-02 -9.96536948e-03 -4.37295362e-02
  -2.34889369e-02  3.42863724e-02  8.18537101e-02  6.52742162e-02
   2.73405574e-02 -8.83256719e-02 -2.34338250e-02  5.30511364e-02
  -3.05720270e-02  5.62617891e-02 -7.55729526e-02  2.54525319e-02
   1.21359222e-01 -2.31589973e-02 -3.31633687e-02 -3.26177701e-02
   6.90273643e-02 -1.10205337e-01 -1.23245403e-01  1.59133703e-01
   6.55

In [48]:
len(labels)

50227

In [49]:
len(X)

50227

In [56]:
# Map every vocabulary word to the k-means cluster id of its vector.
# Row k of the matrix that kmeans was fit on corresponds to the k-th word of
# `model.wv.vocab`, so the two are paired positionally. The assertion catches a
# stale `kmeans` that was fit on a different vocabulary.
assert len(kmeans.labels_) == len(model.wv.vocab), \
    "kmeans.labels_ and model.wv.vocab have different lengths"
word_clusters = dict(zip(model.wv.vocab, kmeans.labels_))

In [59]:
def word2features2(sent, i):
    """Return the ``word2features`` dict for token ``i`` plus a ``'cluster'`` feature.

    The cluster feature is the string form of the token's entry in the
    module-level ``word_clusters`` mapping, looked up by the lower-cased word.
    Words missing from the mapping get ``'4'``, which serves as a separate
    out-of-vocabulary bucket (k-means is run with 4 clusters, ids 0-3).

    All other features come from ``word2features``, so the two feature sets
    cannot drift apart.
    """
    features = word2features(sent, i)
    features['cluster'] = str(word_clusters.get(sent[i][0].lower(), '4'))
    return features

In [60]:
def sent2features2(sent):
    """Return one ``word2features2`` dict per token position of ``sent``."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features2(sent, position))
    return per_token

In [61]:
# Rebuild the training features with the cluster-augmented extractor. `X` held
# the word-vector matrix since the k-means cell, so it is rebound here.
X = list(map(sent2features2, sentences))
y = list(map(sent2labels, sentences))

In [62]:
# Fit the CRF on the cluster-augmented features.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.003,
    c2=0.11,
    max_iterations=100,
    all_possible_transitions=True,
)
final_crf = sklearn_crfsuite.CRF(**crf_params)
final_crf.fit(X, y)

# Regroup the test tokens (with the empty placeholder tag SentenceGetter reads)
# and extract features with the same cluster-aware function.
test['tag'] = ''
getter = SentenceGetter(test)
test_sentences = getter.sentences
test_X = list(map(sent2features2, test_sentences))

# Predict per sentence, flatten to one tag per token and write the submission.
preds = final_crf.predict(test_X)
preds_list = [tag for sentence_tags in preds for tag in sentence_tags]
submission_df = test[['id', 'Sent_ID']].assign(tag=preds_list)
submission_df.to_csv('submission_6.csv', index=False)

In [63]:
# Invert word_clusters: cluster id -> list of the words assigned to it.
# The loop variable is deliberately not called `cluster`, so the
# `sklearn.cluster` module imported under that name is not overwritten.
# setdefault replaces the former bare try/except, which hid every error.
c = {}

for word, cluster_id in word_clusters.items():
    c.setdefault(cluster_id, []).append(word)

In [66]:
c[2]

['excess',
 'region',
 'differential',
 'occurs',
 'changes',
 'activity',
 'hypothesized',
 'leading',
 'stress',
 'dysregulation',
 'mechanisms',
 'reverse',
 'also',
 'contrast',
 'reactivity',
 'reactive',
 'thermal',
 'expressed',
 'conductance',
 'function',
 'simultaneous',
 'showed',
 'moreover',
 'signal',
 'action',
 'frog',
 'putative',
 'chloride',
 'channel',
 'single',
 'c-terminal',
 'transmembrane',
 'proteins',
 'plasma',
 'membrane',
 'cl',
 'channels',
 'could',
 'activated',
 'calcium',
 'human',
 'protein',
 'precursor',
 'n-terminal',
 'sequence',
 'cleavage',
 'near',
 'amino',
 'acid',
 'geometry',
 'suggested',
 'derived',
 'architecture',
 'hydrophobic',
 'antibody',
 'investigated',
 'synthesis',
 'localization',
 'maturation',
 'cell',
 'surface',
 'h',
 'revealed',
 'endoplasmic',
 'reticulum',
 'products',
 'detected',
 'product',
 'medium',
 'whereas',
 'retained',
 'demonstrated',
 'preferential',
 'release',
 'transfer',
 'secreted',
 'form',
 'green',
