In [46]:
import os
import re
import numpy as np
import pandas as pd
import math
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import operator
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [47]:
def glove_dict_generation():
    glove_dict = {}
    with open('glove.6B.100d.txt', 'rb') as word_corpus:
      for item in word_corpus:
        item = item.decode().split()
        word = item[0]
        vect = np.array(item[1:]).astype(np.float)
        if word not in glove_dict:
          glove_dict[word] = vect
        
    return glove_dict

In [48]:
glove_dict=glove_dict_generation()

In [49]:
def mean_embedding(x,dim,glove_dict):
    return np.array(np.sum([glove_dict[w] for w in x if w in glove_dict] or [np.zeros(dim)], axis=0))

In [50]:
def similarity(document,ending,glove_dict):
    document_vector=mean_embedding(document,100,glove_dict)
    ending_vector=mean_embedding(ending,100,glove_dict)
    result = 1 - spatial.distance.cosine(document_vector, ending_vector)
    return result

In [51]:
import nltk
nltk.download('vader_lexicon')
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\vigy\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Loading Data

In [52]:
path_train = 'train.csv'
path_val = 'dev.csv'
path_test = 'test.csv'
data_train = pd.read_csv(path_train)
data_val = pd.read_csv(path_val)
data_test =pd.read_csv(path_test)
data_test.head(2)

Unnamed: 0,InputStoryid,InputSentence1,InputSentence2,InputSentence3,InputSentence4,RandomFifthSentenceQuiz1,RandomFifthSentenceQuiz2
0,b929f263-1dcd-4a0b-b267-5d5ff2fe65bb,My friends all love to go to the club to dance.,They think it's a lot of fun and always invite.,I finally decided to tag along last Saturday.,I danced terribly and broke a friend's toe.,My friends decided to keep inviting me out as ...,"The next weekend, I was asked to please stay h..."
1,7cbbc0af-bcce-4f56-871d-963f9bb6a99d,I tried going to the park the other day.,The weather seemed nice enough for a walk.,Within minutes of getting there I started snee...,My eyes were watery and it was hard to breathe.,My allergies were too bad and I had to go back...,It reminded me of how much I loved spring flow...



### [Unk] word handling

In [53]:
# Used to add UNK to the training corpus
def unknown_words(corpus):
    for i in range(len(corpus)):
        toss = np.random.binomial(size=1, n=1, p= 0.01)
        if toss == 1:
            corpus[i] = 'UNK'
    
    return corpus

In [54]:
# Used to add UNK in validation and Testing corpus
def add_unknown_words(corpus, unigram_count):
    for i in range(len(corpus)):
        if corpus[i] not in unigram_count:
            corpus[i] = 'UNK'
    return corpus







### Fetching Data

In [55]:
def fetch_data(data, special_tokens, vocab_dict, training, validation, testing, lowercase):   # training, validation and testing are BOOLEAN values
    story = ''
    data_list = []
    label = []
    label_for_acc = []
    sentiment1 = []
    sentiment2= []
    similarity_vector= []
    
    for item in data.iterrows():

        if training == True:
            
            if special_tokens == None:
                story += item[1][1] + ' ' + item[1][2] + ' ' + item[1][3] + ' ' + item[1][4] + ' ' + item[1][5] + ' ' + item[1][5]
                story_body = item[1][1] + ' ' + item[1][2] + ' ' + item[1][3] + ' ' + item[1][4]
                storyend1  = story_body + ' ' + item[1][5] 
                storyend2 = story_body + ' ' + item[1][6]
                
                if lowercase == True:
                    story = story.lower()
                    storyend1 = storyend1.lower()
                    storyend2 = storyend2.lower()
                
                data_list.append(storyend1)
                data_list.append(storyend2)
                
            else:
                story_body = item[1][1] + ' ' + item[1][2] + ' ' + item[1][3] + ' ' + item[1][4]
                storyend1  = story_body + ' ' + item[1][5] 
                storyend2 = story_body + ' ' + item[1][6]
                sentiment1.append(sid.polarity_scores(story_body)['compound']-sid.polarity_scores(storyend1)['compound'])
                sentiment1.append(sid.polarity_scores(story_body)['compound']-sid.polarity_scores(storyend2)['compound'])
                similarity_vector.append(similarity(story_body,storyend1,glove_dict))
                similarity_vector.append(similarity(story_body,storyend2,glove_dict))

                if lowercase == True:
                    storyend1 = storyend1.lower()
                    storyend2 = storyend2.lower()

                storyend1 = storyend1.split()
                storyend2 = storyend2.split()

                se1 = ' '
                se2 = ' '

                for i, word in enumerate(storyend1):
                    if word in special_tokens:
                        storyend1[i] = 'UNK'

                for i, word in enumerate(storyend2):
                    if word in special_tokens:
                        storyend2[i] = 'UNK'

                se1 = se1.join(storyend1)
                se2 = se2.join(storyend2)

                data_list.append(se1)
                data_list.append(se2)

            
            if item[1][7] == 1:
                label.append(1)
                label.append(0)
            else:
                label.append(0)
                label.append(1)
        
        if validation == True:
            
            story_body = item[1][1] + ' ' + item[1][2] + ' ' + item[1][3] + ' ' + item[1][4]
            storyend1  = story_body + ' ' + item[1][5] 
            storyend2 = story_body + ' ' + item[1][6]
            sentiment1.append(sid.polarity_scores(story_body)['compound']-sid.polarity_scores(storyend1)['compound'])
            sentiment1.append(sid.polarity_scores(story_body)['compound']-sid.polarity_scores(storyend2)['compound'])
            similarity_vector.append(similarity(story_body,storyend1,glove_dict))
            similarity_vector.append(similarity(story_body,storyend2,glove_dict))

            if lowercase == True:
                storyend1 = storyend1.lower()
                storyend2 = storyend2.lower()

            
            storyend1 = storyend1.split()
            storyend2 = storyend2.split()

            se1 = ' '
            se2 = ' '
            
            for i in range(len(storyend1)):
                if storyend1[i] not in vocab_dict:
                    storyend1[i] = 'UNK'
            
            for i in range(len(storyend2)):
                if storyend2[i] not in vocab_dict:
                    storyend2[i] = 'UNK'

            se1 = se1.join(storyend1)
            se2 = se2.join(storyend2)

            data_list.append(se1)
            data_list.append(se2)
            
            
            label_for_acc.append(item[1][7])
            
        
            if item[1][7] == 1:
                label.append(1)
                label.append(0)
            else:
                label.append(0)
                label.append(1)
            
        if testing == True:
            
            story_body = item[1][1] + ' ' + item[1][2] + ' ' + item[1][3] + ' ' + item[1][4]
            storyend1  = story_body + ' ' + item[1][5] 
            storyend2 = story_body + ' ' + item[1][6]
            sentiment1.append(sid.polarity_scores(story_body)['compound']-sid.polarity_scores(storyend1)['compound'])
            sentiment1.append(sid.polarity_scores(story_body)['compound']-sid.polarity_scores(storyend2)['compound'])
            similarity_vector.append(similarity(story_body,storyend1,glove_dict))
            similarity_vector.append(similarity(story_body,storyend2,glove_dict))

            if lowercase == True:
                storyend1 = storyend1.lower()
                storyend2 = storyend2.lower()

            
            storyend1 = storyend1.split()
            storyend2 = storyend2.split()

            se1 = ' '
            se2 = ' '
            
            for i in range(len(storyend1)):
                if storyend1[i] not in vocab_dict:
                    storyend1[i] = 'UNK'
            
            for i in range(len(storyend2)):
                if storyend2[i] not in vocab_dict:
                    storyend2[i] = 'UNK'

            se1 = se1.join(storyend1)
            se2 = se2.join(storyend2)
            
            data_list.append(se1)
            data_list.append(se2)
            
        

    return story, data_list, label, label_for_acc,sentiment1,similarity_vector

### Unigram (Vocab count)

In [56]:
# O(n) complexity
def unigram(corpus):
    unigram_count = {}
    total_word_count = len(corpus)

    for item in corpus: 
        if (item in unigram_count):   
            unigram_count[item] += 1
        else: 
            unigram_count[item] = 1
    
    return unigram_count

### Find words that can be replaced with [UNK]

In [57]:
def replace_words_unk(vocab):
    limit = 2
    replacable_words = []
    for word in vocab:
        if vocab[word]<limit:
            replacable_words.append(word)
    
    return replacable_words

### Label Assign in acceptable format

In [58]:
def label_assign(ypred_val, prob_val):
    label = []
    for i in range(0, len(ypred_val), 2):
        if ypred_val[i] < ypred_val[i+1]:
            label.append(2)
        
        elif ypred_val[i] > ypred_val[i+1]:
            label.append(1)
        
        elif ypred_val[i] == 0 and ypred_val[i+1] == 0:
            if prob_val[i][ypred_val[i]]<prob_val[i+1][ypred_val[i+1]]:
                label.append(1)
            else:
                label.append(2)
        
        elif ypred_val[i] == 1 and ypred_val[i+1] == 1:
            if prob_val[i][ypred_val[i]]<prob_val[i+1][ypred_val[i+1]]:
                label.append(2)
            else:
                label.append(1)
                
       
    return label        
        

In [None]:
def main():
    
    classifier = {'Logistic Regression':True, 'Multinomial NB': False, 'Random Forest':False, 'SVC':False, 'Gradient Boosting':False, 'XG':False} 
        
    print('Loading Training data...')
    train_corpus, _, _, _, _, _= fetch_data(data_train, special_tokens=None, vocab_dict=None, training=True, validation=False, testing=False, lowercase=True)
    train_data_tokens = train_corpus.split()
    print('# of training tokens:', len(train_data_tokens))
    vocab_dict = unigram(train_data_tokens)
    print('Training vocab length:',len(vocab_dict))
    replace_words = replace_words_unk(vocab_dict)
    print('Replace # of words with UNK:', len(replace_words))
    _, train_data, train_label, _, train_sentiment1, train_similarity = fetch_data(data_train, special_tokens=replace_words, vocab_dict=None, training=True, validation=False, testing=False, lowercase=True)
    print('# of training documents:', int(len(train_data)/2))
    
    
    print('\n')
    
    print('Loading Validation Data...')
    _, val_data, val_label, val_label_check,valid_sentiment1, valid_similarity = fetch_data(data_val, special_tokens=None, vocab_dict=vocab_dict, training=False, validation=True, testing=False, lowercase=True)
    print('# of validation documents:',int(len(val_data)/2))
    
    # Training...
    vectorizer = CountVectorizer(ngram_range=(1,4), stop_words = None)
    features_X_train = vectorizer.fit_transform(train_data)
    print (type(features_X_train))
    train_sentiment1 = np.asarray(train_sentiment1)
    train_similarity = np.asarray(train_similarity)
    s=train_sentiment1.shape[0]
    train_sentiment1 = train_sentiment1.reshape(s, 1)
    train_similarity = train_similarity.reshape(s, 1)
    features_X_train=sparse.hstack((features_X_train,train_sentiment1,train_similarity))
    #features_X_train=np.hstack((train_sentiment1,train_similarity))
    word_types = vectorizer.get_feature_names()
    
    if classifier['Logistic Regression'] == True:
        clf = LogisticRegression(random_state=0)
    elif classifier['Multinomial NB'] == True:
        clf = MultinomialNB(alpha = 1, class_prior=None, fit_prior=True)
    elif classifier['Random Forest'] == True:
        clf = RandomForestClassifier()
    elif classifier['SVC'] == True:
        clf = SVC(probability=True)
    elif classifier['Gradient Boosting'] == True:
        clf = GradientBoostingClassifier()
    elif classifier['XG'] == True:
        clf = XGBClassifier()
        
    
    clf.fit(features_X_train, train_label)
    
    # Validation...
    features_X_valid = vectorizer.transform(val_data)
    valid_sentiment1 = np.asarray(valid_sentiment1)
    valid_similarity = np.asarray(valid_similarity)
    s=valid_sentiment1.shape[0]
    valid_sentiment1 = valid_sentiment1.reshape((s, 1))
    valid_similarity = valid_similarity.reshape((s, 1))
    features_X_valid=sparse.hstack((features_X_valid,valid_sentiment1,valid_similarity))
    #features_X_valid=np.hstack((valid_sentiment1,valid_similarity))

    
    ypred_val = clf.predict(features_X_valid)
    prob_val = clf.predict_proba(features_X_valid)
    prob_val = prob_val.tolist()
    
    y_result_val = label_assign(ypred_val, prob_val)
    acc_score = accuracy_score(val_label_check, y_result_val)
    print('Accuracy Score: %s'%(acc_score))
    _, test_data, test_label, test_label_check,test_sentiment1, test_similarity = fetch_data(data_test, special_tokens=None, vocab_dict=vocab_dict, training=False, validation=False, testing=True, lowercase=True)
    print('# of test documents:',int(len(test_data)/2))
    
    # Testing...
    features_X_test = vectorizer.transform(test_data)
    test_sentiment1 = np.asarray(test_sentiment1)
    test_similarity = np.asarray(test_similarity)
    s=test_sentiment1.shape[0]
    test_sentiment1 = test_sentiment1.reshape((s, 1))
    test_similarity = test_similarity.reshape((s, 1))
    features_X_test=sparse.hstack((features_X_test,test_sentiment1,test_similarity))
    #features_X_test=np.hstack((test_sentiment1,test_similarity))
    
    ypred_test = clf.predict(features_X_test)
    prob_test = clf.predict_proba(features_X_test)
    prob_test = prob_test.tolist()
    
    y_result_test = label_assign(ypred_test, prob_test)
    
    ids = data_test['InputStoryid']    
    output = pd.DataFrame({'Id': ids,
                            'Prediction': y_result_test})
    output.set_index('Id')
    path = 'result_partA.csv'
    output.to_csv(path, index = False)
    output.head(5)
    
    print('Test labels created in .csv format')
    
    
    
main()    

Loading Training data...
# of training tokens: 73572
Training vocab length: 9992
Replace # of words with UNK: 5620
