In [13]:
import csv
import re
import codecs

import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score

[nltk_data] Downloading package stopwords to /home/trip3r/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
TRAIN_DATA2013 = "../data/Subtask_A/twitter-2013train-A.txt"
TEST_DATA2013 = "../data/Subtask_A/twitter-2013test-A.txt"
DEV_DATA2013 = "../data/Subtask_A/twitter-2013dev-A.txt"
TRAIN_DATA2016 = "../data/Subtask_A/twitter-2016train-A.txt"
TEST_DATA2016 = "../data/Subtask_A/twitter-2016test-A.txt"
DEV_DATA2016 = "../data/Subtask_A/twitter-2016dev-A.txt"

In [15]:
trainData2013 = []
testData2013 = []
devData2013 = []
trainData2016 = []
testData2016 = []
devData2016 = []

def readData(path):
    data = []
    with open(path) as file:
        data = file.read()
        data = codecs.decode(data, 'unicode_escape')
        data = data.split('\n')[:-1]
    return data

trainData2013 = readData(TRAIN_DATA2013)
testData2013 = readData(TEST_DATA2013)
devData2013 = readData(DEV_DATA2013)
trainData2016 = readData(TRAIN_DATA2016)
testData2016 = readData(TEST_DATA2016)
devData2016 = readData(DEV_DATA2016)

In [16]:
trainData2013 += devData2013
trainData2016 += devData2016

In [17]:
def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for i in r:
        tweet = re.sub(i, '', tweet)
    return tweet 

def preprocess(data):
    cleanData = []
    for line in data:
        tId, tSent, tweet = line.split("\t")[:3] # Splitting by tabspace
        tweet = removePattern(tweet, "@[\w]*") # Removing @user tags
        tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
#         tweet = tokenize(tweet)
        tweet = tweet.replace("#", "")
#         tweet = re.sub('\W+',' ', tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        cleanData.append([tId, tSent, tweet])
    return cleanData

def tokenize(tweet):
    return TweetTokenizer().tokenize(tweet)

In [18]:
en_stopwords = set(stopwords.words("english")) 

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range=(1, 1),
    stop_words = en_stopwords)

In [19]:
trainData = preprocess(trainData2013)
testData = preprocess(testData2013)
devData = preprocess(devData2013)

In [20]:
trainTweets = [x[2] for x in trainData]
X_train = np.array(trainTweets)
trainSents = [x[1] for x in trainData]
y_train = []

for x in trainSents:
    if x == "negative":
        y_train.append(-1)
    elif x == "neutral":
        y_train.append(0)
    elif x == "positive":
        y_train.append(1)
        
testTweets = [x[2] for x in testData]
X_test = np.array(testTweets)
testSents = [x[1] for x in testData]
y_test = []
for x in testSents:
    if x == "negative":
        y_test.append(-1)
    elif x == "neutral":
        y_test.append(0)
    elif x == "positive":
        y_test.append(1)

In [21]:
X = np.append(X_train, X_test)
X = vectorizer.fit_transform(X)
n = X_train.shape[0]
X_train = X[:n]
X_test = X[n:]

In [22]:
nsv = SVC(probability=True, kernel='linear', class_weight="balanced", C = 0.03)
nsv.fit(X_train, y_train)
acc = nsv.score(X_test, y_test)

In [24]:
def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
#     return f1, acc, rec
    print("f1 score: ", f1)
    print("avg recall", rec)    
    print("accuracy", acc)    

In [25]:
evaluate(y_test, nsv.predict(X_test))

f1 score:  0.6528112627175862
avg recall 0.6202334295446024
accuracy 0.6554835071891739
