In [25]:
import csv
import re
import codecs

import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
import pandas as pd

In [2]:
# The four dataset files sit in the same directory and differ only in the
# split name embedded in the file name.
_CE_FILE_TEMPLATE = "../data/Subtasks_CE/twitter-2016{}-CE.txt"

TRAIN_DATA = _CE_FILE_TEMPLATE.format("train")
TEST_DATA = _CE_FILE_TEMPLATE.format("test")
DEV_DATA = _CE_FILE_TEMPLATE.format("dev")
DEVTEST_DATA = _CE_FILE_TEMPLATE.format("devtest")

In [3]:
def readData(path, encoding="utf-8"):
    """Read a text file and return its lines without line terminators.

    Parameters
    ----------
    path : str or path-like
        File to read.
    encoding : str, optional
        Encoding used to decode the file. Defaults to UTF-8 so the result
        does not depend on the platform's locale settings.

    Returns
    -------
    list of str
        One entry per line of the file, in file order. Interior empty lines
        are kept; an empty file yields an empty list.
    """
    with open(path, encoding=encoding) as handle:
        lines = handle.read().split('\n')
    # Text that ends with a newline splits into one extra empty element.
    # Drop only that element, so a file WITHOUT a trailing newline keeps its
    # last record instead of silently losing it.
    if lines and lines[-1] == '':
        lines.pop()
    return lines

def removePattern(tweet, pattern):
    """Return ``tweet`` with every match of the regex ``pattern`` deleted.

    Parameters
    ----------
    tweet : str
        Text to clean.
    pattern : str or compiled pattern
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``tweet`` with all non-overlapping matches replaced by the empty
        string.

    Notes
    -----
    The substitution is done with the pattern itself. Feeding each matched
    substring back in as a new regex is wrong on two counts: a short match
    such as ``@a`` would also eat the start of ``@ab``, and matched text
    containing regex metacharacters would be misinterpreted.
    """
    return re.sub(pattern, '', tweet)

# Patterns used by preprocess(), compiled once rather than once per tweet.
_MENTION_RE = re.compile(r"@\w*")
# Same match set as the earlier URL pattern; the stray HTML entity "&amp;"
# inside the character class is written as the plain "&" it stood for.
_URL_RE = re.compile(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+')
_SPACES_RE = re.compile(" +")


def preprocess(data):
    """Split raw dataset lines into fields and normalise the tweet text.

    Each line is expected to hold four tab-separated fields; the fourth is
    the tweet text. The text is cleaned in this order: @mentions removed,
    '#' characters removed (the hashtag word is kept), URLs replaced by a
    placeholder, runs of spaces collapsed, then lower-cased (so the
    placeholder ends up as ``<url>``).

    Parameters
    ----------
    data : iterable of str
        Raw lines, without line terminators.

    Returns
    -------
    list of list of str
        One ``[id, second_field, third_field, cleaned_tweet]`` row per
        non-blank input line. The first three fields are passed through
        unchanged.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than four tab-separated fields.

    Notes
    -----
    Punctuation is deliberately left in place. An earlier
    ``tweet.replace("[^a-zA-Z#]", " ")`` step was a literal (non-regex)
    replacement that could never match, so dropping it does not change the
    output; punctuation handling is left to the tokenizer.
    """
    cleanData = []
    for line in data:
        if not line.strip():
            # Blank lines carry no record; skip them instead of crashing.
            continue
        # maxsplit=3 keeps any tab inside the tweet text in the last field.
        fields = line.split("\t", 3)
        if len(fields) != 4:
            raise ValueError(
                "expected 4 tab-separated fields, got %d: %r" % (len(fields), line)
            )
        tId, tAt, tSent, tweet = fields
        # Substitute with the pattern itself so one mention being a prefix of
        # another (e.g. "@a" and "@ab") cannot leave fragments behind.
        tweet = _MENTION_RE.sub("", tweet)
        tweet = tweet.replace("#", "")
        tweet = _URL_RE.sub("<URL>", tweet)
        tweet = _SPACES_RE.sub(" ", tweet)
        cleanData.append([tId, tAt, tSent, tweet.lower()])
    return cleanData

# One default-configured tokenizer shared by every call, instead of building
# a fresh TweetTokenizer for each document the vectorizer processes.
_TWEET_TOKENIZER = TweetTokenizer()


def tokenize(tweet):
    """Split ``tweet`` into tokens with NLTK's ``TweetTokenizer``.

    Parameters
    ----------
    tweet : str
        Text to tokenize.

    Returns
    -------
    list of str
        The tokens produced by ``TweetTokenizer.tokenize`` with default
        settings.
    """
    return _TWEET_TOKENIZER.tokenize(tweet)

try:
    en_stopwords = set(stopwords.words("english"))
except LookupError:
    # The stop-word corpus is a separate NLTK download; fetch it on first use
    # so the notebook also runs on a fresh environment.
    nltk.download("stopwords", quiet=True)
    en_stopwords = set(stopwords.words("english"))

# Bag-of-words features: unigram counts over TweetTokenizer tokens.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    ngram_range=(1, 1),
    # scikit-learn accepts 'english', a list, or None for stop_words; current
    # releases reject a set during parameter validation, so pass a list.
    # Sorting keeps the parameter value deterministic.
    stop_words=sorted(en_stopwords))

In [4]:
# Load the raw lines of every split, in train / test / dev / devtest order.
trainData, testData, devData, devTestData = [
    readData(path)
    for path in (TRAIN_DATA, TEST_DATA, DEV_DATA, DEVTEST_DATA)
]

In [5]:
# Replace each split's raw lines with its cleaned [id, field, label, tweet] rows.
# NOTE: this overwrites the raw lines, so the loading cell must be re-run
# before this cell can be executed a second time.
trainData, testData, devData, devTestData = map(
    preprocess, (trainData, testData, devData, devTestData)
)

In [6]:
# In every cleaned row, index 3 is the tweet text and index 2 is the label.
trainTweets = [row[3] for row in trainData]
trainSents = [row[2] for row in trainData]
testTweets = [row[3] for row in testData]

X_train, y_train = np.array(trainTweets), np.array(trainSents)
X_test = np.array(testTweets)
# Test labels stay a plain list.
y_test = [row[2] for row in testData]

In [7]:
# Learn the vocabulary from the training tweets only, then project the test
# tweets onto it. Fitting on train + test together would let the test set
# shape the feature space (leakage).
# Reading from trainTweets / testTweets rather than X_train / X_test keeps the
# cell re-runnable, since X_train / X_test are overwritten below.
# The stacked train + test matrix X is no longer built.
X_train = vectorizer.fit_transform(trainTweets)
X_test = vectorizer.transform(testTweets)
n = X_train.shape[0]  # number of training tweets

In [None]:
# gamma is ignored by the linear kernel, so crossing it with kernel='linear'
# only repeats identical fits. One grid per kernel searches the same distinct
# models with 40 candidates instead of 64.
c_values = [0.001, 0.01, 0.03, 0.05, 0.07, 0.1, 1, 10]
gamma_values = [0.000000001, 0.00001, 0.001, 0.1]
params = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
]
clf = GridSearchCV(SVC(), params, cv=2, n_jobs=-1)
clf.fit(X_train, y_train)
# best_params_ has no 'gamma' entry when the linear kernel wins.
print(clf.best_params_)
print(clf.score(X_test, y_test))

In [None]:
def evaluate(target, predicted):
    """Print weighted F1, macro-averaged recall and accuracy.

    Parameters
    ----------
    target : array-like
        True labels.
    predicted : array-like
        Predicted labels, aligned with ``target``.

    Returns
    -------
    None
        The three metrics are printed, not returned.
    """
    weighted_f1 = f1_score(target, predicted, average='weighted')
    accuracy = accuracy_score(target, predicted)
    macro_recall = recall_score(target, predicted, average='macro')

    # Printed in this order: F1, recall, accuracy.
    report = (
        ("F1 score : ", weighted_f1),
        ("Avg Recall:", macro_recall),
        ("Accuracy:  ", accuracy),
    )
    for label, value in report:
        print(label, value)

In [None]:
# Report the metrics of the grid-searched model on the test tweets.
y_pred = clf.predict(X_test)
evaluate(y_test, y_pred)