In [189]:
from sklearn.model_selection import train_test_split
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [190]:
# read json files
headlines = []
sarcasm = []

for line in open('./Sarcasm_Headlines_Dataset.json'):
    x = json.loads(line)
    headlines.append(x['headline'])
    sarcasm.append(x['is_sarcastic'])

In [191]:
# remove all punctuations
puncs = '!"#$%&()\'*+,-./:;<=>?@[\\]^_`{|}~'

for p in puncs:
    headlines = [hd.replace(p, " ") for hd in headlines]
    
headlines = [hd.lower() for hd in headlines]

In [192]:
# remove all stopwords
stopwords = stopwords.words('english')

for i in range(26709):
    headlines[i] = " ".join([d for d in headlines[i].split() if d not in stopwords])

In [193]:
# stemming
stemmer = PorterStemmer()
headlines = [" ".join(stemmer.stem(w) for w in hd.split()) for hd in headlines]

In [194]:
# lemmatization
lemmatizer = WordNetLemmatizer()
headlines = [" ".join(lemmatizer.lemmatize(w) for w in hd.split()) for hd in headlines]

In [195]:
cv = CountVectorizer(binary=True)
cv.fit(headlines)
X = cv.transform(headlines)

x_train, x_test, y_train, y_test = train_test_split(X, sarcasm, train_size = 0.8)

In [196]:
cs = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2]

# predict accuracies using Logistic Regression
acclr = []

for c in cs:
    lr = LogisticRegression(C = c)
    lr.fit(x_train, y_train)
    acclr.append(accuracy_score(y_test, lr.predict(x_test)))
    
# predict accuracies using SVM    
accsvm = []
    
for c in cs:
    lr = LinearSVC(C = c)
    lr.fit(x_train, y_train)
    accsvm.append(accuracy_score(y_test, lr.predict(x_test)))



In [197]:
print("Accuracies using Logistic Regression: ")
print(acclr)
print("Accuracies using Support Vector Machines: ")
print(accsvm)

Accuracies using Logistic Regression: 
[0.7841632347435418, 0.7862223886184949, 0.7897791089479596, 0.7905278921752152, 0.791463871209285, 0.791089479595657, 0.7912766754024709, 0.791089479595657]
Accuracies using Support Vector Machines: 
[0.7865967802321228, 0.7772369898914264, 0.7744290527892175, 0.7716211156870086, 0.7691875701984275, 0.7690003743916136, 0.7669412205166605, 0.7656308498689629]


In [198]:
# get max accuracy and use it for training 
lrc = max(acclr)
lsvm = max(accsvm)

c = acclr.index(max(lrc, lsvm))
c /= 4

lr = LogisticRegression(C = c)
lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

In [199]:
print("Training accuracy: " + str(accuracy_score(y_train, lr.predict(x_train))))
print("Testing accuracy: " + str(accuracy_score(y_test, y_pred)))

Training accuracy: 0.9189404221463003
Testing accuracy: 0.7905278921752152
