## **Implementation of stance detection problem using word2vec vectorization and SVM model**

---





In [119]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [120]:
import nltk
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
import string
import re
import numpy as np
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate as cross_validation, ShuffleSplit, cross_val_score
import nltk
import warnings
warnings.filterwarnings("ignore")

In [122]:
def simplify(word):
	dump = ''
	temp = []
	listOfWords = list(filter(None,re.split("([A-Z][^A-Z]*)",word)))
	if len(listOfWords) == len(word):
		return word.lower()
	for i in range(len(listOfWords)):
		listOfWords[i] = listOfWords[i].lower()
		if len(listOfWords[i]) == 1:
			dump = dump + listOfWords[i]
			if dump in words.words() and len(dump) > 2:
				temp.append(dump)
				dump = ''
		else:
			temp.append(listOfWords[i])
	return temp

def createTokens(data):
  listOfTweets = []
  listOfStances = []
  tweetVector = []
  
  for ind, row in data.iterrows():
    
    # Create a sentence using target and the tweet. Word vector will be formed from this.
    example_sentence = str(row["Target"]) + " " + str(row["Tweet"])
    
    # Remove punctuation
    final_sentence = example_sentence.translate(string.punctuation)
    wordList = word_tokenize(final_sentence)
    wordList = [w.lower() for w in wordList]
    
    stop_words = set(stopwords.words('english'))
    wordList = [w for w in wordList if not w in stop_words] 
    
    finalList = []
    s = ' '.join([i for i in wordList if i.isalpha()])
    
    # create tokens from the string and stem them
    wordList = word_tokenize(s)
    
    for word in wordList:
      #to break any combined word into its components for eg, hashtags
      finalList += simplify(word)
    
    final_sentence = ' '.join(finalList)
    listOfTweets.append(word_tokenize(final_sentence))
  return listOfTweets
 

def labelStance(labelDict, data):
	for key, val in labelDict.items():
		data.loc[data["Stance"] == val, "Stance"] = int(key)
	return data

def createVector(data, model):
  data = createTokens(data)
  tweetVector = []
  vector = np.zeros_like(model["abortion"])
  for tweet in data:
    for word in tweet:
      vector = vector + model[word]
    tweetVector.append(vector)
  tweetVector = np.asarray(tweetVector)
  return tweetVector

def getStances(data):
  stanceList = []
  for ind, row in data.iterrows():
    stanceList.append(row["Stance"])
  return np.asarray(stanceList)
    

trainTweets = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/stance-detection/train.csv').dropna(axis=1)
testTweets = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/stance-detection/test.csv').dropna(axis=1)

labelDict = {0:"AGAINST", 1:"FAVOR", 2:"NONE"}
trainTweets = labelStance(labelDict, trainTweets)
testTweets = labelStance(labelDict, testTweets)   

listOfTweets = pd.concat([trainTweets, testTweets], axis=0)
listOfTweets = createTokens(listOfTweets)

model = Word2Vec(listOfTweets, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

uniqTrainTargets = ['Atheism', 'Climate Change is a Real Concern', 'Feminist Movement','Hillary Clinton','Legalization of Abortion']

totalAcc = 0
for target in uniqTrainTargets:
  trainTweetVectors = createVector(trainTweets[trainTweets["Target"]==target], model)
  testTweetVectors = createVector(testTweets[testTweets["Target"]==target], model)
  trainStances = getStances(trainTweets[trainTweets["Target"]==target])
  testStances = getStances(testTweets[testTweets["Target"]==target])
  

  print("Processing the input for targrt " + str(target) + ".....")

  clf = SVC(kernel="rbf").fit(trainTweetVectors, trainStances)
  acc = clf.score(testTweetVectors, testStances)

  print("Test Accuracy is "+ str(round(acc*100,2)) +"%")
  
  totalAcc = totalAcc + acc
  
totalAcc = totalAcc/len(uniqTrainTargets)
print("Total Test Accuracy is "+ str(round(totalAcc*100,2)) +"%")

Processing the input for targrt Atheism.....
Test Accuracy is 67.14%
Processing the input for targrt Climate Change is a Real Concern.....
Test Accuracy is 72.02%
Processing the input for targrt Feminist Movement.....
Test Accuracy is 53.52%
Processing the input for targrt Hillary Clinton.....
Test Accuracy is 57.48%
Processing the input for targrt Legalization of Abortion.....
Test Accuracy is 62.59%
Total Test Accuracy is 62.55%
