In [1]:
import pandas as pd
import numpy as np
import gensim
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

from tqdm import tqdm

nltk.download('stopwords')
nltk.download('punkt')
stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
train = pd.read_csv('train.csv')
print(len(train))
train[0:10]

19579


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
5,id22965,"A youth passed in solitude, my best years spen...",MWS
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP
7,id13515,The surcingle hung in ribands from my body.,EAP
8,id19322,I knew that you could not say to yourself 'ste...,EAP
9,id00912,I confess that neither the structure of langua...,MWS


In [3]:
test = pd.read_csv('test.csv')
print(len(test))
test[0:10]

8392


Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...
5,id27337,"""The thick and peculiar mist, or smoke, which ..."
6,id24265,"That which is not matter, is not at all unless..."
7,id25917,I sought for repose although I did not hope fo...
8,id04951,"Upon the fourth day of the assassination, a pa..."
9,id14549,"""The tone metaphysical is also a good one."


In [5]:
label_enconder = preprocessing.LabelEncoder()
label_enconder.fit(train['author'])
train['label_encoded'] = label_enconder.transform(train['author'])
train[0:10]

Unnamed: 0,id,text,author,label_encoded
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1
5,id22965,"A youth passed in solitude, my best years spen...",MWS,2
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP,0
7,id13515,The surcingle hung in ribands from my body.,EAP,0
8,id19322,I knew that you could not say to yourself 'ste...,EAP,0
9,id00912,I confess that neither the structure of langua...,MWS,2


## Pre-processing text

In [6]:
def transformText(text):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
   # filtered_words = [word for word in text.split()]

    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Stemming
    text = gensim.parsing.preprocessing.stem_text(text)
    return text

In [7]:
train['text_processed']=train['text'].apply(lambda x: transformText(x))
train

Unnamed: 0,id,text,author,label_encoded,text_processed
0,id26305,"This process, however, afforded me no means of...",EAP,0,process howev afford mean ascertain dimens dun...
1,id17569,It never once occurred to me that the fumbling...,HPL,1,never occur fumbl might mere mistak
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0,left hand gold snuff box which caper hill cut ...
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2,love spring look windsor terrac sixteen fertil...
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1,find noth els even gold superintend abandon at...
5,id22965,"A youth passed in solitude, my best years spen...",MWS,2,youth pass solitud best year spent gentl femin...
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP,0,astronom perhap point took refug suggest non l...
7,id13515,The surcingle hung in ribands from my body.,EAP,0,surcingl hung riband bodi
8,id19322,I knew that you could not say to yourself 'ste...,EAP,0,knew could sai stereotomi without brought thin...
9,id00912,I confess that neither the structure of langua...,MWS,2,confess neither structur languag code govern p...


## Train_test split

In [9]:
x_train, x_test, y_train, y_test = train_test_split(train['text_processed'], train['label_encoded'], test_size = 0.2, random_state = 4)
true_label = np.array(y_test)
print("#" * 20 + " Some stats " + "#"*20)
print("Dataset training: {} uterances".format(x_train.shape[0]))
print("Dataset testing: {} uterances".format(x_test.shape[0]))
print("Different classes: {}".format(len(y_train.unique())))

#################### Some stats ####################
Dataset training: 15663 uterances
Dataset testing: 3916 uterances
Different classes: 3


In [10]:
features_2 = TfidfVectorizer(max_df = 0.9, ngram_range = (1,2), norm = 'l2')

In [11]:
x_train_features_2 = features_2.fit_transform(x_train)
x_test_features_2 = features_2.transform(x_test)

In [12]:
model_sgd_2 = SGDClassifier(loss = 'modified_huber', penalty = 'l2',)
model_sgd_2.fit(x_train_features_2,y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='modified_huber', max_iter=5,
       n_iter=None, n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [13]:
preds_sgd_2 = model_sgd_2.predict(x_test_features_2)
preds_sgd_2_proba = model_sgd_2.predict_proba(x_test_features_2)

print("Current Accuracy: {0:.3f}".format(accuracy_score(preds_sgd_2,true_label)))
print("Log loss for this classifier {}".format(log_loss(true_label,preds_sgd_2_proba)))

Current Accuracy: 0.836
Log loss for this classifier 0.6697427187866234


In [14]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [15]:
log_loss(true_label,preds_sgd_2_proba)

0.66974271878662339

In [16]:
x_train.head()

9493    twice upon ventur express total incredul respe...
1919                      fire among crowd women children
1692    thought heard rat partit even paid littl atten...
2714                              it mere typograph error
9863    uttermost step led dread chamber larg fragment...
Name: text_processed, dtype: object

In [17]:
## model 3
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

In [18]:
tfv.fit(list(x_train) + list(x_test))
x_train_tfv =  tfv.transform(x_train) 
x_test_tfv = tfv.transform(x_test)
# -----
clf = LogisticRegression(C=1.0)
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_test_tfv)
print("logloss: %0.3f " % multiclass_logloss(true_label, predictions))

logloss: 0.598 


In [19]:
## Model 4 - testing with XGBoost
import xgboost as xgb



In [20]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)

In [21]:
clf.fit(x_train_tfv.tocsc(), y_train)
predictions = clf.predict_proba(x_test_tfv.tocsc())

print ("logloss: %0.3f " % log_loss(true_label, predictions))

logloss: 0.748 


In [22]:
## Loading Glove vectors
embeddings_index = {}
f = open('../../vectors/glove.42B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

1917495it [03:08, 10189.26it/s]

Found 1917495 word vectors.





In [26]:
embeddings_index['woman'][0:50]

array([ -1.90770000e-01,   6.50600016e-01,  -3.05849999e-01,
         3.18800002e-01,  -7.07940012e-02,   2.48410001e-01,
        -2.73169994e+00,  -4.37580012e-02,  -5.26859999e-01,
        -7.58560002e-01,  -1.15719996e-02,  -8.15320015e-01,
         6.35939986e-02,  -6.97100013e-02,  -1.55300006e-01,
        -3.90240014e-01,   6.99109972e-01,  -1.29740000e-01,
        -1.41599998e-01,   1.02559999e-01,   1.41190004e-03,
        -5.15209995e-02,   3.72379988e-01,  -6.20259997e-03,
         1.69630006e-01,   3.92159998e-01,  -3.15310001e-01,
        -4.25500005e-01,  -4.13399994e-01,  -8.79890025e-02,
        -4.49970007e-01,   3.94419990e-02,   5.83199978e-01,
         4.32000011e-01,   1.85460001e-01,   6.25000000e-01,
         4.14629988e-02,   5.63820004e-01,  -1.52940005e-01,
        -5.30759990e-02,   2.59259999e-01,   2.45179996e-01,
         1.74899995e-01,  -5.71019985e-02,  -1.26440004e-02,
         2.52829999e-01,   1.42220005e-01,  -1.61740005e-01,
        -1.93450004e-01,

In [29]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stops]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [30]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(x_train)]
xvalid_glove = [sent2vec(x) for x in tqdm(x_test)]


  0%|          | 0/15663 [00:00<?, ?it/s][A
  2%|▏         | 359/15663 [00:00<00:04, 3582.33it/s][A
  5%|▍         | 732/15663 [00:00<00:04, 3624.10it/s][A
  7%|▋         | 1094/15663 [00:00<00:04, 3622.08it/s][A
  9%|▉         | 1466/15663 [00:00<00:03, 3650.32it/s][A
 12%|█▏        | 1833/15663 [00:00<00:03, 3654.75it/s][A
 14%|█▍        | 2200/15663 [00:00<00:03, 3658.31it/s][A
 16%|█▋        | 2552/15663 [00:00<00:03, 3610.72it/s][A
 19%|█▊        | 2914/15663 [00:00<00:03, 3611.09it/s][A
 21%|██        | 3283/15663 [00:00<00:03, 3633.00it/s][A
 23%|██▎       | 3649/15663 [00:01<00:03, 3639.15it/s][A
 26%|██▌       | 4019/15663 [00:01<00:03, 3655.67it/s][A
 28%|██▊       | 4379/15663 [00:01<00:03, 3638.58it/s][A
 30%|███       | 4742/15663 [00:01<00:03, 3633.67it/s][A
 33%|███▎      | 5114/15663 [00:01<00:02, 3656.59it/s][A
 35%|███▍      | 5478/15663 [00:01<00:02, 3640.57it/s][A
 37%|███▋      | 5845/15663 [00:01<00:02, 3648.62it/s][A
 40%|███▉      | 6209/15663 

In [34]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [43]:
## Testing XGBoost with Glove Features
classi = xgb.XGBClassifier(nthread=10, silent=False)
classi.fit(xtrain_glove,y_train)
preds_proba = classi.predict_proba(xvalid_glove)
preds = classi.predict(xvalid_glove)

In [44]:
print("logloss: {}".format(multiclass_logloss(true_label, preds_proba)))
print("Current Accuracy: {0:.3f}".format(accuracy_score(preds,true_label)))

logloss: 0.8641022256384314
Current Accuracy: 0.620


In [47]:
## Testing XGBoost with Glove Features
classi = xgb.XGBClassifier(max_depth=10, n_estimators=400, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
classi.fit(xtrain_glove,y_train)
preds_proba = classi.predict_proba(xvalid_glove)
preds = classi.predict(xvalid_glove)

In [48]:
print("Log-loss: {0:.3f}".format(multiclass_logloss(true_label, preds_proba)))
print("Current Accuracy: {0:.3f}".format(accuracy_score(preds,true_label)))

Log-loss: 0.831
Current Accuracy: 0.671


In [58]:
## LSTM classifier
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization

In [59]:
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [60]:
# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(y_train)
yvalid_enc = np_utils.to_categorical(y_test)

In [89]:
# create 3 layer neural net
model = Sequential()
model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.6))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.6))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.6))
model.add(BatchNormalization())

model.add(Dense(3))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [90]:
model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64, 
          epochs=20, verbose=1, 
          validation_data=(xvalid_glove_scl, yvalid_enc))

Train on 15663 samples, validate on 3916 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa136a22748>

In [88]:
preds_proba = model.predict_proba(xvalid_glove)
print("Log-loss: {0:.3f}".format(multiclass_logloss(true_label, preds_proba)))

Log-loss: 1.530


(3916, 3)

## Generate submission

In [None]:
my_sub = pd.DataFrame(columns={'id', 'EAP','HPL', 'MWS'})
my_sub=my_sub[['id', 'EAP','HPL', 'MWS']]
my_sub

In [None]:
def get_txt_proba_response(msg, vectors, model):
    msg_vec = vectors.transform([msg])
#    print(msg_vec)
    pred_prob=model.predict_proba(msg_vec)
    pd_unsorted = pd.DataFrame(
        {'label_encode': model.classes_,
         'label_decode': label_enconder.inverse_transform(model.classes_),
         'pred_proba':  pred_prob[0]})
    
    probas = {
        pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[0][0]:
        pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[0][2],
        pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[1][0]:
        pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[1][2],
         pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[2][0]:
         pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[2][2]}
    
    return probas

In [None]:
for i in range(len(test)):
    reply=get_txt_proba_response(transformText(test['text'][i]),features_2, model_sgd_2)
    my_sub.loc[i] = [test['id'][i], reply['EAP'], reply['HPL'], reply['MWS']]

In [None]:
my_sub

In [None]:
my_sub.to_csv('roberto.csv',index=False)

## Testing text classification with PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class  CNN_Text(nn.Module):
    
    def __init__(self, args):
        super(CNN_Text,self).__init__()
        self.args = args
        
        V = args.embed_num
        D = args.embed_dim
        C = args.class_num
        Ci = 1
        Co = args.kernel_num
        Ks = args.kernel_sizes

        self.embed = nn.Embedding(V, D)
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        self.dropout = nn.Dropout(args.dropout)
        self.fc1 = nn.Linear(len(Ks)*Co, C)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3) #(N,Co,W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x) # (N,W,D)
        if self.args.static:
            x = Variable(x)
        x = x.unsqueeze(1) 
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = self.dropout(x) # (N,len(Ks)*Co)
        logit = self.fc1(x) # (N,C)
        return logit