# Import libraries

In [24]:
import ast
import string

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Load data

In [2]:
# Read the preprocessed dataset and discard the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv -- TODO confirm).
df = pd.read_csv('data/preprocessed.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Keep only rows without missing values (dropna's default drops a row as soon
# as any of its columns is NaN).
df = df.dropna(axis='index')

## Data preprocessing

In [4]:
# Model input is the `lemma_tokens` column; the target is the `is_sarcastic` column.
X, y = df['lemma_tokens'], df['is_sarcastic']

In [5]:
# Turn each `lemma_tokens` entry from its string form back into a Python object.
# ast.literal_eval only accepts literals (lists, strings, numbers, ...), so text
# read from the CSV cannot execute arbitrary code the way eval() would.
# Entries that are not strings are passed through unchanged, which keeps this
# cell safe to re-run once X has already been parsed.
X = X.apply(lambda entry: ast.literal_eval(entry) if isinstance(entry, str) else entry)

In [6]:
# TfidfVectorizer works on raw text documents, so every token sequence is first
# joined back into a single space-separated string.
temp_X = X.map(' '.join).tolist()

# Learn the vocabulary / IDF weights and build the TF-IDF matrix in one pass.
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split performed later in the notebook, so the IDF weights also
# see documents that end up in the test partition.
vectorizer = TfidfVectorizer()
vect_X = vectorizer.fit_transform(temp_X)

# Split data

In [8]:
# Stratified split (30 % held out for testing) with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    vect_X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)
print('Train shape: {}, test shape: {}'.format(y_train.shape[0], y_test.shape[0]))
# Per-class counts of the training labels, then of the test labels.
print(y_train.value_counts(), y_test.value_counts(), sep='\n')

Train shape: 16865, test shape: 7228
0    9470
1    7395
Name: is_sarcastic, dtype: int64
0    4058
1    3170
Name: is_sarcastic, dtype: int64


# Training with a train/test split and 10-fold cross-validation

In [15]:
# Linear support vector classifier.
# random_state is pinned (same value as the train/test split) because LinearSVC
# uses a pseudo-random generator when shuffling data for its dual solver;
# without a seed the reported scores may differ between runs.
lsvc = LinearSVC(random_state=10)
# Fit on the training partition only.
lsvc.fit(X_train, y_train)

lsvc_pred = lsvc.predict(X_test)
# Mean accuracy on the training data and on the held-out test data.
print("Linear SVC train:",lsvc.score(X_train, y_train))
print("Linear SVC test:",lsvc.score(X_test, y_test))
# Per-class precision / recall / F1 on the test partition.
print(classification_report(y_test, lsvc_pred))
# 10-fold cross-validation of a fresh, identically seeded estimator on the
# full TF-IDF matrix; prints one test score per fold.
cv_results = cross_validate(LinearSVC(random_state=10), vect_X, y, cv=10)
print(cv_results['test_score'])

Linear SVC train: 1.0
Linear SVC test: 0.9827061427780852
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4058
           1       0.98      0.98      0.98      3170

    accuracy                           0.98      7228
   macro avg       0.98      0.98      0.98      7228
weighted avg       0.98      0.98      0.98      7228

[0.98008299 0.98298755 0.98381743 0.98755187 0.98713693 0.98713159
 0.98547115 0.98796181 0.98131229 0.98089701]


In [21]:
# Average test score over the Linear SVC cross-validation folds.
cv_results['test_score'].mean()

0.9844350615416706

In [16]:
# Logistic regression: fit on the training partition, then report mean
# accuracy on both partitions and per-class metrics on the test partition.
lr = LogisticRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print("Logistic Regression train: ", lr.score(X_train, y_train))
print("Logistic Regression test: ", lr.score(X_test, y_test))
print(classification_report(y_true=y_test, y_pred=lr_pred))

# 10-fold cross-validation of a fresh estimator on the full TF-IDF matrix;
# prints one test score per fold.
lr_cv_results = cross_validate(estimator=LogisticRegression(), X=vect_X, y=y, cv=10)
print(lr_cv_results['test_score'])



Logistic Regression train:  0.9903350133412393
Logistic Regression test:  0.9720531267293857
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      4058
           1       0.97      0.97      0.97      3170

    accuracy                           0.97      7228
   macro avg       0.97      0.97      0.97      7228
weighted avg       0.97      0.97      0.97      7228





[0.97344398 0.9746888  0.97759336 0.97717842 0.9780083  0.98132005
 0.97799917 0.98048983 0.97674419 0.97217608]


In [22]:
# Average test score over the logistic-regression cross-validation folds.
lr_cv_results['test_score'].mean()

0.9769642178249359

In [17]:
# Random forest classifier.
# random_state is pinned (same value as the train/test split) because the
# forest draws bootstrap samples and candidate features at random; without a
# seed every run of this cell reports different scores.
rfc = RandomForestClassifier(random_state=10)
# Fit on the training partition only.
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
# Mean accuracy on the training data and on the held-out test data.
print("Random Forest train: ",rfc.score(X_train, y_train))
print("Random Forest test: ",rfc.score(X_test, y_test))
# Per-class precision / recall / F1 on the test partition.
print(classification_report(y_test, rfc_pred))
# 10-fold cross-validation of a fresh, identically seeded estimator on the
# full TF-IDF matrix; prints one test score per fold.
rfc_cv_results = cross_validate(RandomForestClassifier(random_state=10), vect_X, y, cv=10)
print(rfc_cv_results['test_score'])



Random Forest train:  0.9981025793062556
Random Forest test:  0.9302711676812396
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      4058
           1       0.93      0.91      0.92      3170

    accuracy                           0.93      7228
   macro avg       0.93      0.93      0.93      7228
weighted avg       0.93      0.93      0.93      7228





[0.9253112  0.92987552 0.93112033 0.93278008 0.92904564 0.93939394
 0.93607306 0.93026152 0.93438538 0.93480066]


In [23]:
# Average test score over the random-forest cross-validation folds.
rfc_cv_results['test_score'].mean()

0.9323047344651902