# Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
! pip install numpy matplotlib seaborn nltk



In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer

# Load Data

In [4]:
# Load the headline dataset and discard the 'Unnamed: 0' column
# (presumably a row index that was saved into the CSV - confirm).
data = pd.read_csv('./data/sarcasm.csv').drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [5]:
# Work on a copy so the loaded frame `data` is left untouched.
df = data.copy()

# Natural Language Processing

## Tokenization

In [6]:
# (rows, columns) of the working frame
df.shape

(28619, 3)

In [7]:
# Lower-case every headline with the vectorized string accessor; unlike calling
# .lower() on each element, it passes missing values through instead of raising.
headlines = df['headline'].str.lower()
# Label column, used later as the classification target.
is_sarcastic = df['is_sarcastic']

In [8]:
# word_tokenize relies on the Punkt tokenizer models, which are not bundled with
# NLTK. Older releases look the data up as 'punkt', newer ones as 'punkt_tab',
# so fetch both; a fresh environment would otherwise fail with a LookupError.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Split each lower-cased headline into a list of word/punctuation tokens.
tokenized_headlines = headlines.apply(word_tokenize)

In [9]:
# Peek at the tokens of the first headline.
tokenized_headlines.iloc[0]

['thirtysomething',
 'scientists',
 'unveil',
 'doomsday',
 'clock',
 'of',
 'hair',
 'loss']

## Stopword Filtering

In [10]:
# Fetch NLTK's stop-word corpus, which stopwords.words() reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\happy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# English stop words from the NLTK corpus; looked up by remove_stopwords.
stop_words = stopwords.words('english')

In [12]:
def remove_stopwords(sentence: list, stopword_set=None) -> list:
    """Drop stop words and non-alphabetic tokens from a tokenized sentence.

    Parameters
    ----------
    sentence : list
        Tokens of one sentence.
    stopword_set : collection of str, optional
        Words to discard; each token is case-folded before the comparison.
        When omitted, the module-level ``stop_words`` is used.

    Returns
    -------
    list
        The surviving tokens, in their original order and original casing.
    """
    # Build a hashed lookup once per call so each token costs O(1) instead of
    # a linear scan over the whole stop-word list.
    blocked = frozenset(stop_words if stopword_set is None else stopword_set)
    return [
        word for word in sentence
        if word.isalpha() and word.casefold() not in blocked
    ]

In [13]:
# Use Series.apply (as the tokenizing and lemmatizing steps do) so the result
# keeps the index of `tokenized_headlines` and stays row-aligned with the
# labels; pd.Series(map(...)) would rebuild a fresh 0..n-1 index instead.
filtered_tokens = tokenized_headlines.apply(remove_stopwords)

In [14]:
# Show the filtered token lists (pandas truncates the display).
filtered_tokens

0        [thirtysomething, scientists, unveil, doomsday...
1        [dem, totally, nails, congress, falling, short...
2          [eat, veggies, deliciously, different, recipes]
3        [inclement, weather, prevents, liar, getting, ...
4        [mother, comes, pretty, close, using, word, co...
                               ...                        
28614         [jews, celebrate, rosh, hashasha, something]
28615    [internal, affairs, investigator, disappointed...
28616    [beautiful, acceptance, speech, week, came, qu...
28617    [mars, probe, destroyed, orbiting, space, palace]
28618                         [dad, clarifies, food, stop]
Length: 28619, dtype: object

## Lemmatization

In [15]:
# Fetch the WordNet resources before the WordNetLemmatizer is used.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\happy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\happy\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
# One shared lemmatizer instance, reused by lemmatize() for every headline.
lemmatizer = WordNetLemmatizer()

def lemmatize(sentence: list) -> str:
    """Lemmatize every token and join the results into one string.

    Parameters
    ----------
    sentence : list
        Tokens of one sentence.

    Returns
    -------
    str
        The lemmatized tokens separated by single spaces (an empty string for
        an empty token list). Note that this is a string, not a list.
    """
    # The lemmatizer is called without a part-of-speech argument, so its
    # default part of speech applies to every token.
    return ' '.join(map(lemmatizer.lemmatize, sentence))

In [17]:
# Despite the name, each element is one space-joined string per headline
# (that is what lemmatize returns).
lemmatized_tokens = filtered_tokens.map(lemmatize)

In [18]:
# Show the lemmatized headlines (pandas truncates the display).
lemmatized_tokens

0        thirtysomething scientist unveil doomsday cloc...
1        dem totally nail congress falling short gender...
2                  eat veggie deliciously different recipe
3             inclement weather prevents liar getting work
4            mother come pretty close using word correctly
                               ...                        
28614                jew celebrate rosh hashasha something
28615    internal affair investigator disappointed cons...
28616    beautiful acceptance speech week came queer ko...
28617            mar probe destroyed orbiting space palace
28618                              dad clarifies food stop
Length: 28619, dtype: object

## Feature Extraction

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# Fit TF-IDF on the cleaned headlines, expand the sparse result into a dense
# table (one row per headline, one column per vocabulary term), then down-cast
# it to float16.
# NOTE(review): the full dense array is materialised before the down-cast;
# confirm that its memory footprint is acceptable.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(lemmatized_tokens)
features = pd.DataFrame(tfidf_matrix.toarray()).astype(np.float16)
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21773,21774,21775,21776,21777,21778,21779,21780,21781,21782
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28614,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# PCA - Dimensionality Reduction

In [37]:
from sklearn.decomposition import PCA

# For an input this large with only 100 components, PCA's default
# svd_solver='auto' selects the randomized solver, so the projection is only
# repeatable when random_state is fixed (33 matches the train/test split seed).
dim_reductor = PCA(
    n_components=100,
    random_state=33,
    )
# NOTE: this overwrites `features`. Re-run the TF-IDF cell before re-running
# this one, otherwise PCA is applied to already-reduced data.
# NOTE(review): the projection is fitted on all rows, before the train/test
# split, so held-out rows influence the features - consider fitting on the
# training portion only.
features = dim_reductor.fit_transform(features)

## Split Dataset

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    features, is_sarcastic, test_size=0.3, random_state=33
) 

# Training Classifier

In [42]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [43]:
# Fit a Gaussian naive Bayes classifier on the training split.
# fit() is the last expression, so the cell echoes the fitted estimator.
clf = GaussianNB()
clf.fit(xtrain, ytrain)

## Evaluate

In [44]:
# Score the held-out split: predict its labels, then build the per-class
# precision / recall / F1 report (true labels first, predictions second).
ypred = clf.predict(xtest)
report = classification_report(ytest, ypred)

In [45]:
# The report is preformatted text; print it so the column alignment survives.
print(report)

              precision    recall  f1-score   support

           0       0.64      0.46      0.54      4494
           1       0.55      0.71      0.62      4092

    accuracy                           0.58      8586
   macro avg       0.59      0.59      0.58      8586
weighted avg       0.60      0.58      0.58      8586

