## **Implementation of stance detection problem using word2vec vectorization and baseline model**

---





In [46]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [47]:
import nltk
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
import string
import re
import numpy as np
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate as cross_validation, ShuffleSplit, cross_val_score
import nltk
import warnings
warnings.filterwarnings("ignore")

In [49]:
def simplify(word):
	dump = ''
	temp = []
	listOfWords = list(filter(None,re.split("([A-Z][^A-Z]*)",word)))
	if len(listOfWords) == len(word):
		return word.lower()
	for i in range(len(listOfWords)):
		listOfWords[i] = listOfWords[i].lower()
		if len(listOfWords[i]) == 1:
			dump = dump + listOfWords[i]
			if dump in words.words() and len(dump) > 2:
				temp.append(dump)
				dump = ''
		else:
			temp.append(listOfWords[i])
	return temp

def createTokens(data):
  listOfTweets = []
  listOfStances = []
  tweetVector = []
  
  for ind, row in data.iterrows():
    
    # Create a sentence using target and the tweet. Word vector will be formed from this.
    example_sentence = str(row["Target"]) + " " + str(row["Tweet"])
    
    # Remove punctuation
    final_sentence = example_sentence.translate(string.punctuation)
    wordList = word_tokenize(final_sentence)
    wordList = [w.lower() for w in wordList]
    
    stop_words = set(stopwords.words('english'))
    wordList = [w for w in wordList if not w in stop_words] 
    
    finalList = []
    s = ' '.join([i for i in wordList if i.isalpha()])
    
    # create tokens from the string and stem them
    wordList = word_tokenize(s)
    
    for word in wordList:
      #to break any combined word into its components for eg, hashtags
      finalList += simplify(word)
    
    final_sentence = ' '.join(finalList)
    listOfTweets.append(word_tokenize(final_sentence))
  return listOfTweets
 

def labelStance(labelDict, data):
	for key, val in labelDict.items():
		data.loc[data["Stance"] == val, "Stance"] = int(key)
	return data

def createVector(data, model):
  data = createTokens(data)
  tweetVector = []
  vector = np.zeros_like(model["abortion"])
  for tweet in data:
    for word in tweet:
      vector = vector + model[word]
    tweetVector.append(vector)
  tweetVector = np.asarray(tweetVector)
  return tweetVector

def getStances(data):
  stanceList = []
  for ind, row in data.iterrows():
    stanceList.append(row["Stance"])
  return np.asarray(stanceList)
    
  
# 'Support Vector Machine', 'Random Forest Classifier', 'Gradient Boosting Classifier', 'Logistic Regression', 'Neural Network'
classifier = 'Neural Network'

trainTweets = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/stance-detection/training.txt', sep='\t',header=0,encoding='utf-8')
testTweets = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/stance-detection/test-gold.txt', sep='\t',header=0,encoding='utf-8')

labelDict = {0:"AGAINST", 1:"FAVOR", 2:"NONE"}
trainTweets = labelStance(labelDict, trainTweets)
testTweets = labelStance(labelDict, testTweets)   

print("Processing the dataset...........")
listOfTweets = pd.concat([trainTweets, testTweets], axis=0)
listOfTweets = createTokens(listOfTweets)
uniqTrainTargets = trainTweets.Target.unique()

print("Processing the word2vec model...........")
model = Word2Vec(listOfTweets, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

print("\nThe Machine Learning Model Used For Classification : " + classifier)
totalAcc = 0
for target in uniqTrainTargets:
  trainTweetVectors = createVector(trainTweets[trainTweets["Target"]==target], model)
  testTweetVectors = createVector(testTweets[testTweets["Target"]==target], model)
  trainStances = getStances(trainTweets[trainTweets["Target"]==target])
  testStances = getStances(testTweets[testTweets["Target"]==target])
  

  print("\nProcessing the input for targrt " + str(target) + ".....")
  print("Training Sample size : " + str(trainTweetVectors.shape[0]))
  print("Testing Sample size : " + str(testTweetVectors.shape[0]))
  
  if classifier == 'Support Vector Machine':
    clf = SVC(kernel="rbf").fit(trainTweetVectors, trainStances)
    acc = clf.score(testTweetVectors, testStances)
    print("Test Accuracy is "+ str(round(acc*100,2)) +"%")
    totalAcc = totalAcc + acc
    
  elif classifier == 'Random Forest Classifier':
    clf = RandomForestClassifier(n_estimators=90).fit(trainTweetVectors, trainStances)
    acc = clf.score(testTweetVectors, testStances)
    print("Test Accuracy is "+ str(round(acc*100,2)) +"%")
    totalAcc = totalAcc + acc
    
  elif classifier == 'Gradient Boosting Classifier':
    clf = GradientBoostingClassifier().fit(trainTweetVectors, trainStances)
    acc = clf.score(testTweetVectors, testStances)
    print("Test Accuracy is "+ str(round(acc*100,2)) +"%")
    totalAcc = totalAcc + acc
    
  elif classifier == 'Logistic Regression':
    clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(trainTweetVectors, trainStances)
    acc = clf.score(testTweetVectors, testStances)
    print("Test Accuracy is "+ str(round(acc*100,2)) +"%")
    totalAcc = totalAcc + acc
    
  elif classifier == 'Neural Network':
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1).fit(trainTweetVectors, trainStances)
    acc = clf.score(testTweetVectors, testStances)
    print("Test Accuracy is "+ str(round(acc*100,2)) +"%")
    totalAcc = totalAcc + acc
    
totalAcc = totalAcc/len(uniqTrainTargets)
print("\nTotal Test Accuracy is "+ str(round(totalAcc*100,2)) +"%")

Processing the dataset...........
Processing the word2vec model...........

The Machine Learning Model Used For Classification : Neural Network

Processing the input for targrt Atheism.....
Training Sample size : 513
Testing Sample size : 220
Test Accuracy is 72.73%

Processing the input for targrt Climate Change is a Real Concern.....
Training Sample size : 395
Testing Sample size : 169
Test Accuracy is 72.78%

Processing the input for targrt Feminist Movement.....
Training Sample size : 664
Testing Sample size : 285
Test Accuracy is 64.21%

Processing the input for targrt Hillary Clinton.....
Training Sample size : 639
Testing Sample size : 295
Test Accuracy is 58.31%

Processing the input for targrt Legalization of Abortion.....
Training Sample size : 603
Testing Sample size : 280
Test Accuracy is 67.5%

Total Test Accuracy is 67.1%
