## P2 Response Type Classification in Discussions
**CSC791 - Fall 2020 term**       
**Shreyas Muralidhara (schikkb)**

In [36]:
#Identify the necessary libraries
import numpy as np
import pandas as pd
import xlrd
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
import re
import gensim
import spacy

from gensim.utils import simple_tokenize
from textblob import TextBlob
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC,LinearSVC

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Read training data and print the label distribution

In [2]:
#Import training data into the dataframe
input_df = pd.read_csv("p2_train.csv", index_col=None,header=0)
print(input_df.shape)

#printing label distribution of the input data
print('Label distribution for the training dataset:\n',input_df['type'].value_counts())

(1640, 10)
Label distribution for the training dataset:
 answered      994
attacked      299
irrelevant    286
agreed         61
Name: type, dtype: int64


### Read the testing data and print the label distribution

In [3]:
#Import training data into the dataframe
test_df = pd.read_csv("p2_test.csv", index_col=None,header=0)
print(test_df.shape)

#printing label distribution of the input data
print('Label distribution for the test dataset:\n',test_df['type'].value_counts())

(410, 10)
Label distribution for the test dataset:
 answered      320
attacked       39
irrelevant     38
agreed         13
Name: type, dtype: int64


### Baseline feature  set 1 
#### feature 1 - POS tagging
  1. Generate POS tags for the train dataset question and generate sentence with tags in "word/POStag" format.
  2. Generate POS tags for the train dataset response and generate sentence with tags in "word/POStag" format.

In [4]:
# Method to compute the POS tags for an input column

def generatePOS(sentence_list):
  sentlist_posTagged = []

  for sentence in sentence_list:
    word_tokens = nltk.word_tokenize(sentence)
    # Get the POS tags for the word tokens
    POS_word_tokens = nltk.pos_tag(word_tokens)
    
    # Concatenate word/pos_tag format
    sent_posTagged = ' '.join([entity[0]+"/"+entity[1] for entity in POS_word_tokens])    
    sentlist_posTagged.append(sent_posTagged)
  
  return pd.DataFrame(sentlist_posTagged)
  

# Store the POS tagges question and response into the dataframe

#Step 1 - Generating the POS tags for "question" and concatenate to word to form new sentences
input_df['questionPOS'] = generatePOS(input_df['question'])

#Step 2 - Generating the POS tags for "response" and concatenate to word to form new sentences
input_df['responsePOS'] = generatePOS(input_df['response'])

#### feature 2 - Doc2Vec Sentence embedding
  1. Generate sentence vectors for POS tagged question using Doc2Vec
  2. Generate sentence vectors for POS tagged response using Doc2vec
  3. Concatenate the vectors

In [5]:
# Method to compute the Sentence embeddings for Question and Response fields and combine them by concateation  
def Doc2Vec_QuestionResponse(question_list, response_list):
    # Step 1 - Generating tagged documents with list of words and their associated tags
    df_question_tagged = [TaggedDocument(d, [i]) for i, d in enumerate(question_list)]
    df_response_tagged = [TaggedDocument(d, [i]) for i, d in enumerate(response_list)]

    # Step 2a - Define the model and build the vocab for the question tags
    d2vmodel = Doc2Vec(min_count =1,vector_size=100, epochs=50)
    d2vmodel.build_vocab(df_question_tagged)

    df_question_vectors = []
    for d in df_question_tagged:  
        df_question_vectors.append(d2vmodel.infer_vector(d.words))
    df_question_vectors = pd.DataFrame(df_question_vectors)

    # Step 2b - Define the model and build the vocab for the response tags
    d2vmodel = Doc2Vec(min_count =1,vector_size=100, epochs=50)
    d2vmodel.build_vocab(df_response_tagged)
    df_response_vectors = []
    for d in df_response_tagged:  
        df_response_vectors.append(d2vmodel.infer_vector(d.words))
    df_response_vectors = pd.DataFrame(df_response_vectors)

    # Step 3 - Concatenate the 2 feature vectors into single feature vector
    df_questionResponse_vectors = np.concatenate((df_question_vectors, df_response_vectors), axis=1)
    
    return pd.DataFrame(df_questionResponse_vectors)


# Generate Concatenated vectors for Train dataset  
df_train_questionResponse_vectors = Doc2Vec_QuestionResponse(input_df['questionPOS'], input_df['responsePOS'])

#Split the input data into Training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(df_train_questionResponse_vectors, input_df['type'], test_size=0.2, shuffle = True, stratify = input_df['type'], random_state=0)
print(X_train.shape, X_val.shape,Y_train.shape, Y_val.shape)

(1312, 200) (328, 200) (1312,) (328,)


#### Train and validate the Baseline set 1 using SVM

In [7]:
# Step 4 - # Fit the model for the classifier
clf_doc2vec = SVC(max_iter =10000, C=10000).fit(X_train, Y_train)

# Predicting the class labels for validation data
Y_val_pred = clf_doc2vec.predict(X_val)

print('Baseline feature set 1 - doc2Vec validation metrics:\nAccuracy -',round(accuracy_score(Y_val,Y_val_pred),4))
print('f1 score -', round(f1_score(Y_val,Y_val_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(Y_val, Y_val_pred))

Baseline feature set 1 - doc2Vec validation metrics:
Accuracy - 0.561
f1 score - 0.4494
Classification Report:
               precision    recall  f1-score   support

      agreed       0.00      0.00      0.00        12
    answered       0.59      0.91      0.72       199
    attacked       0.00      0.00      0.00        60
  irrelevant       0.20      0.05      0.08        57

    accuracy                           0.56       328
   macro avg       0.20      0.24      0.20       328
weighted avg       0.39      0.56      0.45       328



  _warn_prf(average, modifier, msg_start, len(result))


### Testing  the Baseline Feature set 1 - 
  * feature 1 - POS tagging 
    1. Generate POS tags for the test dataset "question" and generate sentence with tags in "word/POStag" format.
    2. Generate POS tags for the test dataset "response" and generate sentence with tags in "word/POStag" format.

  * feature 2 - Doc2Vec Sentence embedding
    1. Generate the tagged document for the "question" and "response"
    2. Define model and build the vocabulary for "question" and "response"
    3. Generate the document vectors for test dataset 
    4. Concatenate the vector embeddings generated by Doc2Vec model.

In [9]:
### Feature 1 - Store the POS tagges question and response into the dataframe

#Step 1 - Generating the POS tags for "question" and concatenate to word to form new sentences
test_df['questionPOS'] = generatePOS(test_df['question'])

#Step 2 - Generating the POS tags for "response" and concatenate to word to form new sentences
test_df['responsePOS'] = generatePOS(test_df['response'])


### Feature 2 -  Generate Concatenated vectors for Test dataset  
df_test_questionResponse_vectors = Doc2Vec_QuestionResponse(test_df['questionPOS'], test_df['responsePOS'])

# Step 4 - Predicting the class labels for validation data
Y_test_pred = clf_doc2vec.predict(df_test_questionResponse_vectors)

print('Baseline feature set 1 - doc2Vec test metrics:\nAccuracy -',round(accuracy_score(test_df['type'],Y_test_pred),4))
print('f1 score -', round(f1_score(test_df['type'],Y_test_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(test_df['type'], Y_test_pred))

Baseline feature set 1 - doc2Vec test metrics:
Accuracy - 0.7317
f1 score - 0.6736
Classification Report:
               precision    recall  f1-score   support

      agreed       0.00      0.00      0.00        13
    answered       0.78      0.93      0.85       320
    attacked       0.07      0.03      0.04        39
  irrelevant       0.12      0.05      0.07        38

    accuracy                           0.73       410
   macro avg       0.24      0.25      0.24       410
weighted avg       0.63      0.73      0.67       410



  _warn_prf(average, modifier, msg_start, len(result))


### Baseline feature  set 2 
#### feature 1 - POS tagging
  1. Generate POS tags for the train dataset question and generate sentence with tags in "word/POStag" format.
  2. Generate POS tags for the train dataset response and generate sentence with tags in "word/POStag" format.

#### feature 2 - NER tagging
  1. Generate NER tags for the train dataset question and generate senetence with tages in "word/POStag/NERtag" format.
  2. Generate NER tags for the train dataset response and generate senetence with tages in "word/POStag/NERtag" format.

In [21]:
# Method to compute the POS tags for an input column

def generatePOS_NER(sentence_list):
  sentlist_pos_nerTagged = []

  for sentence in sentence_list:
    word_tokens = nltk.word_tokenize(sentence)
    # Get the POS tags for the word tokens
    POS_word_tokens = nltk.pos_tag(word_tokens)
    
    # Concatenate word/pos_tag format
    sent_posTagged = ' '.join([entity[0]+"/"+entity[1] for entity in POS_word_tokens])   


    # Compute the Named Entity recognition using Spacy
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sentence)
    NER_list, word_list = [], []

    for ent in doc.ents:
        # named Entity relation list generated for each word
        NER_list = NER_list + [ent.label_  for x in ent.text.split()]
        word_list = word_list + [x for x in ent.text.split()]

    cnt = 0
    POS_NERTagged = []
    for wordPOS in sent_posTagged.split():
        word = wordPOS.split('/')
        if word[0] in word_list and cnt < len(NER_list):
            POS_NERTagged.append(wordPOS + "/" + NER_list[cnt])
            cnt = cnt + 1
        else:
            POS_NERTagged.append(wordPOS + "/O")

    sentlist_pos_nerTagged.append(' '.join(POS_NERTagged))
    
  
  print(sentlist_pos_nerTagged)
  return pd.DataFrame(sentlist_pos_nerTagged)

# Store the POS tagges question and response into the dataframe

#Step 1 - Generating the POS tags for "question" and concatenate to word to form new sentences
input_df['questionPOS_NER'] = generatePOS_NER(input_df['question'])

#Step 2 - Generating the POS tags for "response" and concatenate to word to form new sentences
input_df['responsePOS_NER'] = generatePOS_NER(input_df['response'])



#### feature 3 - Sentiment analysis for a sentence
  1. For better normailzation convert sentence to lower case.
  2. Remove punctuationas and special characters
  3. Remove the stop words for better sentiment scores
  4. Stemming the words to root word using Porter stemmer
  5. Generate the polarity and subjectivity sentiment scores

#### feature 4 - Doc2Vec Sentence embedding
  1. Generate sentence vectors for POS_NER tagged question using Doc2Vec
  2. Generate sentence vectors for POS_NER tagged response using Doc2vec
  3. Concatenate the vectors and the 2 additional columns for sentiment analysis.

Concatenate the sentiment scores to vectors generated to generate the vectors with sentiment scores.


In [77]:
### feature 3 Sentimental analysis for a sentence

def generateSentiment(sentence_list):
    # Convert all the sentences to lower case as it is required for normalization
    sent_sentiment = sentence_list.apply(lambda x: " ".join(x.lower() for x in x.split()))
    # Remove the punctuations and special characters
    sent_sentiment = sent_sentiment.str.replace('[^\w\s]','')
    # Remove the stopwords for better sentiment score
    sent_sentiment = sent_sentiment.apply(lambda x: " ".join(x for x in x.split() if x not in stopwords.words('english')))
    # Reduce the words to same root using porter stemmer
    st = PorterStemmer()
    sent_sentiment = sent_sentiment.apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
    # Generate the sentiment polarity and sentiment subjectivity for each sentence
    sent_sentiment = sent_sentiment.apply(lambda x: TextBlob(x).sentiment)

    return sent_sentiment


# Store the Sentiment score for question and response into the dataframe
#Step 1 - Generating the Polarity and subjectivity sentiment for "question"
question_sentiment = generateSentiment(input_df['question'])

#Step 2 - Generating the Polarity and subjectivity sentiment for "response"
response_sentiment = generateSentiment(input_df['response'])


### Feature 4 - Generate Concatenated vectors for Train dataset  
df_train_questionResponse_vectors = Doc2Vec_QuestionResponse(input_df['questionPOS_NER'], input_df['responsePOS_NER'])


# Concatenate the sentiment scores to the sentence embedding vectors to obtain the final vectors
df_train_questionResponse_sentvectors = np.column_stack((df_train_questionResponse_vectors, question_sentiment,response_sentiment))

#Split the input data into Training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(df_train_questionResponse_vectors, input_df['type'], test_size=0.2, shuffle = True, stratify = input_df['type'], random_state=0)
print(X_train.shape, X_val.shape,Y_train.shape, Y_val.shape)

(1312, 200) (328, 200) (1312,) (328,)


#### Train and validate the Baseline set 2 features using SVM

In [78]:
# Fit the model for the classifier
clf_doc2vec = SVC(max_iter =10000, C=10000).fit(X_train, Y_train)

# Predicting the class labels for validation data
Y_val_pred = clf_doc2vec.predict(X_val)

print('Baseline feature set 1 - doc2Vec validation metrics:\nAccuracy -',round(accuracy_score(Y_val,Y_val_pred),4))
print('f1 score -', round(f1_score(Y_val,Y_val_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(Y_val, Y_val_pred))

Baseline feature set 1 - doc2Vec validation metrics:
Accuracy - 0.5915
f1 score - 0.4778
Classification Report:
               precision    recall  f1-score   support

      agreed       0.00      0.00      0.00        12
    answered       0.61      0.95      0.75       199
    attacked       0.31      0.07      0.11        60
  irrelevant       0.14      0.02      0.03        57

    accuracy                           0.59       328
   macro avg       0.27      0.26      0.22       328
weighted avg       0.45      0.59      0.48       328



  _warn_prf(average, modifier, msg_start, len(result))


### Testing  the Baseline Feature set 2 - 
  * feature 1 - POS tagging
    1. Generate POS tags for the test dataset question and generate sentence with tags in "word/POStag" format.
    2. Generate POS tags for the test dataset response and generate sentence with tags in "word/POStag" format.

  * feature 2 - NER tagging(Named Entity Recognition)
    1. Generate NER tags for the test dataset question and generate senetence with tages in "word/POStag/NERtag" format.
    2. Generate NER tags for the test dataset response and generate senetence with tages in "word/POStag/NERtag" format.

  * feature 3 - Sentiment analysis for a sentence
    1. For better normailzation convert sentence to lower case.
    2. Remove punctuationas and special characters
    3. Remove the stop words for better sentiment scores
    4. Stemming the words to root word using Porter stemmer
    5. Generate the polarity and subjectivity sentiment scores

  * feature 4 - Doc2Vec Sentence embedding
    1. Generate sentence vectors for POS_NER tagged question using Doc2Vec
    2. Generate sentence vectors for POS_NER tagged response using Doc2vec
    3. Concatenate the vectors and the 2 additional columns for sentiment analysis.

Concatenate the sentiment scores to vectors generated to generate the vectors with sentiment scores.

In [81]:
### Feature 1 & 2 - Store the POS tagged & NER Tagged question and response into the dataframe

#Step 1 - Generating the POS tags & NER tags for "question" and concatenate to word to form new sentences
test_df['questionPOS_NER'] = generatePOS_NER(test_df['question'])

#Step 2 - Generating the POS tags & NER tags for "response" and concatenate to word to form new sentences
test_df['responsePOS_NER'] = generatePOS_NER(test_df['response'])

# Feature 3 - Store the Sentiment score for question and response into the dataframe
#Step 1 - Generating the Polarity and subjectivity sentiment for "question"
question_sentiment = generateSentiment(test_df['question'])

#Step 2 - Generating the Polarity and subjectivity sentiment for "response"
response_sentiment = generateSentiment(test_df['response'])

### Feature 4 -  Generate Concatenated vectors for Test dataset  
df_test_questionResponse_vectors = Doc2Vec_QuestionResponse(test_df['questionPOS_NER'], test_df['responsePOS_NER'])

# Concatenate the sentiment scores to the sentence embedding vectors to obtain the final vectors
df_test_questionResponse_sentvectors = np.column_stack((df_test_questionResponse_vectors, question_sentiment,response_sentiment))

# Predicting the class labels for validation data
Y_test_pred = clf_doc2vec.predict(df_test_questionResponse_vectors)

print('Baseline feature set 1 - doc2Vec test metrics:\nAccuracy -',round(accuracy_score(test_df['type'],Y_test_pred),4))
print('f1 score -', round(f1_score(test_df['type'],Y_test_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(test_df['type'], Y_test_pred))

["Now/RB/O if/IN/O Julie/NNP/PERSON was/VBD/O underage/JJ/O (/(/O let/VB/O 's/POS/O say/VB/O you/PRP/O and/CC/O Julie/NNP/PERSON are/VBP/O both/DT/O in/IN/O high/JJ/O school/NN/O )/)/O but/CC/O Bob/NNP/PERSON was/VBD/O in/IN/O his/PRP$/O 40/CD/CARDINAL 's/POS/O ,/,/O you/PRP/O would/MD/O obviously/RB/O try/VB/O to/TO/O break/VB/O this/DT/O couple/NN/O up/RB/O ,/,/O right/RB/O ?/./O", 'Did/NNP/O the/DT/O protests/NNS/O motivate/VBP/O the/DT/O military/NN/O to/TO/O act/VB/O ?/./O', 'Can/MD/O you/PRP/O illustrate/VB/O how/WRB/O a/DT/O single/JJ/O class/NN/O in/IN/O CS/NNP/ORG would/MD/O be/VB/O incredibly/RB/O useful/JJ/O in/IN/O a/DT/O whole/JJ/O range/NN/O of/IN/O things/NNS/O ?/./O', "Did/NNP/O n't/RB/O I/PRP/O even/RB/O say/VBP/O ,/,/O that/IN/O I/PRP/O am/VBP/O completely/RB/O and/CC/O 100/CD/O %/NN/O okay/NN/O with/IN/O househusbands/NNS/O and/CC/O female/JJ/O soldiers/NNS/O ?/./O", 'However/RB/O if/IN/O we/PRP/O forget/VBP/O that/DT/O debate/NN/O for/IN/O a/DT/TIME minute/NN/TIME a

  _warn_prf(average, modifier, msg_start, len(result))
