# Install Dependencies

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peshw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peshw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load and View Data

In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix, classification_report
#

In [3]:
help(classification_report)

Help on function classification_report in module sklearn.metrics._classification:

classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division='warn')
    Build a text report showing the main classification metrics.
    
    Read more in the :ref:`User Guide <classification_report>`.
    
    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    
    labels : array-like of shape (n_labels,), default=None
        Optional list of label indices to include in the report.
    
    target_names : list of str of shape (n_labels,), default=None
        Optional display names matching the labels (same order).
    
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

In [4]:
X_train = pd.read_csv('./data/Train_Data.csv')
X_train.info()
X_test = pd.read_csv('./data/Test_Data.csv')
X_test['is_sarcastic'] = np.float16(0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44262 entries, 0 to 44261
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   headline      44262 non-null  object
 1   is_sarcastic  44262 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 691.7+ KB


In [5]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11066 entries, 0 to 11065
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   headline      11066 non-null  object 
 1   is_sarcastic  11066 non-null  float16
dtypes: float16(1), object(1)
memory usage: 108.2+ KB


In [6]:
X_train.shape, X_test.shape

((44262, 2), (11066, 2))

In [7]:
X_train.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0


In [8]:
import string

X_train['char_count'] = X_train['headline'].apply(len).astype(np.float16)
X_train['word_count'] = X_train['headline'].apply(lambda x: len(x.split())).astype(np.float16)
X_train['word_density'] = (X_train['char_count'] / (X_train['word_count']+1)).astype(np.float16)
X_train['punctuation_count'] = X_train['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) .astype(np.float16)
X_train['title_word_count'] = X_train['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()])).astype(np.float16)
X_train['upper_case_word_count'] = X_train['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()])).astype(np.float16)
Y_train = X_train['is_sarcastic']

X_test['char_count'] = X_test['headline'].apply(len).astype(np.float16)
X_test['word_count'] = X_test['headline'].apply(lambda x: len(x.split())).astype(np.float16)
X_test['word_density'] = (X_test['char_count'] / (X_test['word_count']+1)).astype(np.float16)
X_test['punctuation_count'] = X_test['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))).astype(np.float16)
X_test['title_word_count'] = X_test['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()])).astype(np.float16)
X_test['upper_case_word_count'] = X_test['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()])).astype(np.float16)
Y_test = X_test['is_sarcastic']


In [9]:
X_train

Unnamed: 0,headline,is_sarcastic,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
0,supreme court votes 7-2 to legalize all worldl...,1,53.0,9.0,5.300781,1.0,0.0,0.0
1,hungover man horrified to learn he made dozens...,1,66.0,12.0,5.078125,0.0,0.0,0.0
2,emily's list founder: women are the 'problem s...,0,65.0,10.0,5.910156,4.0,0.0,0.0
3,send your kids back to school with confidence,0,45.0,8.0,5.000000,0.0,0.0,0.0
4,watch: experts talk pesticides and health,0,41.0,6.0,5.855469,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
44257,greece seeks to reassure europe as tensions rise,0,48.0,8.0,5.332031,0.0,0.0,0.0
44258,vatican says transgender man cannot become a g...,0,54.0,8.0,6.000000,0.0,0.0,0.0
44259,protesters ejected from donald trump rally aft...,0,80.0,11.0,6.667969,0.0,0.0,0.0
44260,italian recipes that are oldies but goodies,0,43.0,7.0,5.375000,0.0,0.0,0.0


In [10]:
Y_train

0        1
1        1
2        0
3        0
4        0
        ..
44257    0
44258    0
44259    0
44260    0
44261    1
Name: is_sarcastic, Length: 44262, dtype: int64

In [11]:
len(X_test)

11066

# Training a Logistic Regression Model

Here we will train a logistic regression model to the feature vectors in training data.


Model Evaluation Metrics - Quick Refresher

Just accuracy is never enough in datasets with a rare class problem.

    Precision: The positive predictive power of a model. Out of all the predictions made by a model for a class, how many are actually correct
    Recall: The coverage or hit-rate of a model. Out of all the test data samples belonging to a class, how many was the model able to predict (hit or cover) correctly.
    F1-score: The harmonic mean of the precision and recall

Do check out ROC Curve, AUC Score and PR Curve also


In [12]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1, random_state=42, solver='liblinear')

In [13]:
lr.fit(X_train.drop(['headline'], axis=1), Y_train)
predictions = lr.predict(X_test.drop(['headline'], axis=1)).astype(np.float16)

print(classification_report(Y_test, predictions))
pd.DataFrame(confusion_matrix(Y_test, predictions))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     11066

    accuracy                           1.00     11066
   macro avg       1.00      1.00      1.00     11066
weighted avg       1.00      1.00      1.00     11066



Unnamed: 0,0
0,11066


.



<h2>Features from Sentiment Analysis</h2>

Remember this is unsupervised, lexicon-based sentiment analysis where we don't have any pre-labeled data saying which review migth have a positive or negative sentiment. We use the lexicon to determine this.

In [14]:
import textblob

In [15]:
x_train_snt_obj = X_train['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
X_train['Polarity'] = [obj.polarity for obj in x_train_snt_obj.values]
X_train['Subjectivity'] = [obj.subjectivity for obj in x_train_snt_obj.values]

x_test_snt_obj = X_test['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
X_test['Polarity'] = [obj.polarity for obj in x_test_snt_obj.values]
X_test['Subjectivity'] = [obj.subjectivity for obj in x_test_snt_obj.values]

In [16]:
X_train.head(20)

Unnamed: 0,headline,is_sarcastic,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
0,supreme court votes 7-2 to legalize all worldl...,1,53.0,9.0,5.300781,1.0,0.0,0.0,0.0,0.0
1,hungover man horrified to learn he made dozens...,1,66.0,12.0,5.078125,0.0,0.0,0.0,0.0,0.066667
2,emily's list founder: women are the 'problem s...,0,65.0,10.0,5.910156,4.0,0.0,0.0,0.0,0.0
3,send your kids back to school with confidence,0,45.0,8.0,5.0,0.0,0.0,0.0,0.0,0.0
4,watch: experts talk pesticides and health,0,41.0,6.0,5.855469,1.0,0.0,0.0,0.0,0.0
5,james corden and the red hot chili peppers str...,0,75.0,13.0,5.355469,2.0,0.0,0.0,0.031481,0.37963
6,u.s. dignity reserves nearly depleted,1,37.0,5.0,6.167969,2.0,0.0,0.0,0.1,0.4
7,"how to re-ignite the spark in your body, mind ...",0,54.0,11.0,4.5,2.0,0.0,0.0,0.0,0.0
8,report: there still time to convert to christi...,1,75.0,11.0,6.25,1.0,0.0,0.0,0.0,0.0
9,education reform and evidence,0,29.0,4.0,5.800781,0.0,0.0,0.0,0.0,0.0


In [17]:
lr.fit(X_train.drop(['headline'], axis=1), Y_train, )
predictions = lr.predict(X_test.drop(['headline'], axis=1)).astype(np.float16)

print(classification_report(Y_test, predictions))
pd.DataFrame(confusion_matrix(Y_test, predictions))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     11066

    accuracy                           1.00     11066
   macro avg       1.00      1.00      1.00     11066
weighted avg       1.00      1.00      1.00     11066



Unnamed: 0,0
0,11066


<h2> Text Pre-processing and Wrangling</h2>

We want to extract some specific features based on standard NLP feature engineering models like the classic Bag of Words model. For this we need to clean and pre-process our text data. We will build a simple text pre-processor here since the main intent is to look at feature engineering strategies.

We will focus on:

    Text Lowercasing
    Removal of contractions
    Removing unnecessary characters, numbers and symbols
    Stemming
    Stopword removal



In [18]:
import contractions

In [19]:
import nltk
import contractions
import re

# remove some stopwords to capture negation in n-grams if possible
stop_words = nltk.corpus.stopwords.words('english')
#stop_words.remove('no')
#stop_words.remove('not')
#stop_words.remove('but')

# load up a simple porter stemmer - nothing fancy
ps = nltk.porter.PorterStemmer()

def simple_text_preprocessor(document): 
    # lower case
    document = str(document).lower()
    
    # expand contractions
    document = contractions.fix(document)
    
    # remove unnecessary characters
    document = re.sub(r'[^a-zA-Z]',r' ', document)
    document = re.sub(r'nbsp', r'', document)
    document = re.sub(' +', ' ', document)
    
    # simple porter stemming
    document = ' '.join([ps.stem(word) for word in document.split()])
    
    # stopwords removal
    document = ' '.join([word for word in document.split() if word not in stop_words])
    
    return document

stp = np.vectorize(simple_text_preprocessor)

In [20]:
X_train['Clean headline'] = stp(X_train['headline'].values)
X_test['Clean headline'] = stp(X_test['headline'].values)

X_train.head()

Unnamed: 0,headline,is_sarcastic,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity,Clean headline
0,supreme court votes 7-2 to legalize all worldl...,1,53.0,9.0,5.300781,1.0,0.0,0.0,0.0,0.0,suprem court vote legal worldli vice
1,hungover man horrified to learn he made dozens...,1,66.0,12.0,5.078125,0.0,0.0,0.0,0.0,0.066667,hungov man horrifi learn made dozen plan last ...
2,emily's list founder: women are the 'problem s...,0,65.0,10.0,5.910156,4.0,0.0,0.0,0.0,0.0,emili list founder women problem solver congress
3,send your kids back to school with confidence,0,45.0,8.0,5.0,0.0,0.0,0.0,0.0,0.0,send kid back school confid
4,watch: experts talk pesticides and health,0,41.0,6.0,5.855469,1.0,0.0,0.0,0.0,0.0,watch expert talk pesticid health


.




<h2>Extracting out the structured features from previous experiments</h2>



In [21]:
X_train_metadata = X_train.drop(['headline', 'Clean headline'], axis=1).reset_index(drop=True).astype(np.float16)
X_test_metadata = X_test.drop(['headline', 'Clean headline'], axis=1).reset_index(drop=True).astype(np.float16)

X_train_metadata.head()

Unnamed: 0,is_sarcastic,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
0,1.0,53.0,9.0,5.300781,1.0,0.0,0.0,0.0,0.0
1,1.0,66.0,12.0,5.078125,0.0,0.0,0.0,0.0,0.06665
2,0.0,65.0,10.0,5.910156,4.0,0.0,0.0,0.0,0.0
3,0.0,45.0,8.0,5.0,0.0,0.0,0.0,0.0,0.0
4,0.0,41.0,6.0,5.855469,1.0,0.0,0.0,0.0,0.0


.



<h2>Experiment 3: Adding Bag of Words based Features - 1- and 2-grams</h2>

This is perhaps the most simple vector space representational model for unstructured text. A vector space model is simply a mathematical model to represent unstructured text (or any other data) as numeric vectors, such that each dimension of the vector is a specific feature\attribute.

The bag of words model represents each text document as a numeric vector where each dimension is a specific word from the corpus and the value could be its frequency in the document, occurrence (denoted by 1 or 0) or even weighted values.

The model’s name is such because each document is represented literally as a ‘bag’ of its own words, disregarding word orders, sequences and grammar.



In [32]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1, 1))
X_traincv = cv.fit_transform(X_train['Clean headline']).toarray().astype(np.float16)
X_traincv = pd.DataFrame(X_traincv, columns=cv.get_feature_names_out()).astype(np.float16)

X_testcv = cv.transform(X_test['Clean headline']).toarray()
X_testcv = pd.DataFrame(X_testcv, columns=cv.get_feature_names_out()).astype(np.float16)
X_traincv.head()

Unnamed: 0,aaa,aaron,aarp,aatish,ab,abandon,abaya,abba,abbey,abbi,...,zoo,zookeep,zooland,zoologist,zoom,zoroastrian,zsa,zucker,zuckerberg,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
X_train_comb = pd.concat([X_train_metadata, X_traincv], axis=1).astype(np.float16)
X_test_comb = pd.concat([X_test_metadata, X_testcv], axis=1).astype(np.float16)

X_train_comb.head()

Unnamed: 0,is_sarcastic,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity,aaa,...,zoo,zookeep,zooland,zoologist,zoom,zoroastrian,zsa,zucker,zuckerberg,zz
0,1.0,53.0,9.0,5.300781,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,66.0,12.0,5.078125,0.0,0.0,0.0,0.0,0.06665,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,65.0,10.0,5.910156,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,45.0,8.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,41.0,6.0,5.855469,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
lr.fit(X_train_comb, Y_train)
predictions = lr.predict(X_test_comb)

print(classification_report(Y_test, predictions))
pd.DataFrame(confusion_matrix(Y_test, predictions))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     11066

    accuracy                           1.00     11066
   macro avg       1.00      1.00      1.00     11066
weighted avg       1.00      1.00      1.00     11066



Unnamed: 0,0
0,11066


In [35]:
print(Y_train[:25])

0     1
1     1
2     0
3     0
4     0
5     0
6     1
7     0
8     1
9     0
10    1
11    0
12    1
13    1
14    1
15    0
16    0
17    0
18    0
19    1
20    1
21    0
22    1
23    0
24    0
Name: is_sarcastic, dtype: int64


In [36]:
print(predictions[:25])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [37]:
sum(predictions)

0