In [25]:
import csv
import re
import codecs

import numpy as np

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score

In [26]:
TRAIN_DATA = "./GOLD/Subtask_A/twitter-2013train-A.txt"
TEST_DATA = "./GOLD/Subtask_A/twitter-2013test-A.txt"
DEV_DATA = "./GOLD/Subtask_A/twitter-2013dev-A.txt"

In [27]:
trainData = []
testData = []
devData = []

def readData(path):
    data = []
    with open(path) as file:
        data = file.read()
        data = codecs.decode(data, 'unicode_escape')
        data = data.split('\n')[:-1]
    return data

trainData = readData(TRAIN_DATA)
testData = readData(TEST_DATA)
devData = readData(DEV_DATA)

  if __name__ == '__main__':
  if __name__ == '__main__':


In [28]:
trainData += devData

In [29]:
def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for i in r:
        tweet = re.sub(i, '', tweet)
    return tweet 

def preprocess(data):
    cleanData = []
    for line in data:
        tId, tSent, tweet = line.split("\t")[:3] # Splitting by tabspace
        tweet = removePattern(tweet, "@[\w]*") # Removing @user tags
        tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
#         tweet = tokenize(tweet)
        tweet = tweet.replace("#", "")
#         tweet = re.sub('\W+',' ', tweet)
        tweet = re.sub(" +", " ", tweet)
        cleanData.append([tId, tSent, tweet])
    return cleanData

def tokenize(tweet):
    return TweetTokenizer().tokenize(tweet)

In [30]:
en_stopwords = set(stopwords.words("english")) 

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range=(1, 1),
    stop_words = en_stopwords)

In [31]:
trainData = preprocess(trainData)
testData = preprocess(testData)
devData = preprocess(devData)

In [32]:
trainTweets = [x[2] for x in trainData]
X_train = np.array(trainTweets)
trainSents = [x[1] for x in trainData]
y_train = []

for x in trainSents:
    if x == "negative":
        y_train.append(-1)
    elif x == "neutral":
        y_train.append(0)
    elif x == "positive":
        y_train.append(1)
        
testTweets = [x[2] for x in testData]
X_test = np.array(testTweets)
testSents = [x[1] for x in testData]
y_test = []
for x in testSents:
    if x == "negative":
        y_test.append(-1)
    elif x == "neutral":
        y_test.append(0)
    elif x == "positive":
        y_test.append(1)

In [33]:
X = np.append(X_train, X_test)
X = vectorizer.fit_transform(X)
n = X_train.shape[0]
X_train = X[:n]
X_test = X[n:]

In [34]:
nsv = SVC(probability=True, kernel='linear', class_weight="balanced", C = 0.03)
nsv.fit(X_train, y_train)
acc = nsv.score(X_test, y_test)

In [35]:
print("Accuracy on 2016 dataset:",acc*100,"%")

Accuracy on 2016 dataset: 65.54835071891739 %


In [36]:
def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
    return f1, acc, rec

In [37]:
evaluate(y_test, nsv.predict(X_test))

(0.6528112627175862, 0.6554835071891739, 0.6202334295446024)