In [1]:
import nltk

nltk.download('stopwords')
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saikoukuntla/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/saikoukuntla/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from gensim import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
import re

In [3]:
data = pd.read_csv("train.csv")
test = pd.read_csv("public_test.csv")

In [4]:
data

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.20
1,2,A man inserted an advertisement in the classif...,1,2.50,1.0,1.10
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.40
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.00
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.10
...,...,...,...,...,...,...
7995,7996,Lack of awareness of the pervasiveness of raci...,0,,,0.25
7996,7997,Why are aspirins white? Because they work sorry,1,1.33,0.0,3.85
7997,7998,"Today, we Americans celebrate our independence...",1,2.55,0.0,0.00
7998,7999,How to keep the flies off the bride at an Ital...,1,1.00,0.0,3.00


In [5]:
# Pre-Processing
#Lower-case all post
# data.text = data.text.str.lower()

#Remove handlers
data.text = data.text.apply(lambda x:re.sub('@[^\s]+','',x))
test.text = test.text.apply(lambda x:re.sub('@[^\s]+','',x))

# Remove URLS
data.text = data.text.apply(lambda x:re.sub(r"http\S+", "", x))
test.text = data.text.apply(lambda x:re.sub(r"http\S+", "", x))

# Remove all the special characters
# data.text = df.text.apply(lambda x:' '.join(re.findall(r'\w+', x)))

#remove all single characters
# df.text = df.text.apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', ' ', x))

# Substituting multiple spaces with single space
data.text = data.text.apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))
test.text = data.text.apply(lambda x:re.sub(r"http\S+", "", x))

In [6]:
# Sentiment Analysis

sid = SIA()

data['sentiments']           = data['text'].apply(lambda x: sid.polarity_scores(' '.join(re.findall(r'\w+',x.lower()))))
data['Compound Sentiment']            = data['sentiments'].apply(lambda x: x['compound']+1*(10**-6)) 
data['Positive Sentiment']   = data['sentiments'].apply(lambda x: x['pos']+1*(10**-6)) 
data['Neutral Sentiment']    = data['sentiments'].apply(lambda x: x['neu']+1*(10**-6))
data['Negative Sentiment']   = data['sentiments'].apply(lambda x: x['neg']+1*(10**-6))

In [7]:
# Sentiment Analysis

sid = SIA()

test['sentiments']           = test['text'].apply(lambda x: sid.polarity_scores(' '.join(re.findall(r'\w+',x.lower()))))
test['Compound Sentiment']            = test['sentiments'].apply(lambda x: x['compound']+1*(10**-6)) 
test['Positive Sentiment']   = test['sentiments'].apply(lambda x: x['pos']+1*(10**-6)) 
test['Neutral Sentiment']    = test['sentiments'].apply(lambda x: x['neu']+1*(10**-6))
test['Negative Sentiment']   = test['sentiments'].apply(lambda x: x['neg']+1*(10**-6))

In [8]:
data = data.drop(columns='sentiments')
test.drop(columns='sentiments')
data.head()

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,Compound Sentiment,Positive Sentiment,Neutral Sentiment,Negative Sentiment
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.2,0.153101,0.176001,0.672001,0.151001
1,2,A man inserted an advertisement in the classif...,1,2.5,1.0,1.1,0.510601,0.099001,0.901001,1e-06
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.4,1e-06,1e-06,1.000001,1e-06
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.0,1e-06,1e-06,1.000001,1e-06
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.1,-0.700299,0.189001,0.360001,0.450001


In [31]:
sentiments = data[['Positive Sentiment','Negative Sentiment']]
t_sentiments = test[['Positive Sentiment','Negative Sentiment']]

scaler = MinMaxScaler()
sentiments = scaler.fit_transform(sentiments)
t_sentiments = scaler.transform(t_sentiments)
sentiments = pd.DataFrame(data=sentiments,columns=['pos','neg'])
t_sentiments = pd.DataFrame(data=t_sentiments,columns=['pos','neg'])
t_sentiments.describe()

Unnamed: 0,pos,neg
count,1000.0,1000.0
mean,0.113144,0.118259
std,0.128214,0.156543
min,0.0,0.0
25%,0.0,0.0
50%,0.084,0.0
75%,0.19525,0.195187
max,0.652,1.0


In [20]:
# count vectorizer
cv = CountVectorizer(ngram_range=(1,2), binary=True)
text_counts = cv.fit_transform(data['text'])

tfidf = TfidfVectorizer()
text_tfidf = tfidf.fit_transform(data['text'])

count_vect_df = pd.DataFrame(text_counts.todense())
tfidf_df = pd.DataFrame(text_tfidf.todense())


df = pd.concat([count_vect_df, tfidf_df, sentiments], axis=1)

X_train, X_test, y_train, y_test = train_test_split(df, data['is_humor'], test_size=0.25, random_state=42)

# Modeling
CNB = MultinomialNB()
CNB.fit(X_train, y_train)
predicted = CNB.predict(X_test)
accuracy_score = metrics.accuracy_score(y_test, predicted)
f1_score = metrics.f1_score(y_test, predicted)
print(f'1-gram Accuracy: {accuracy_score:.4f}')
print(f'1-gram F1-Score: {f1_score:.4f}')
print()

1-gram Accuracy: 0.8470
1-gram F1-Score: 0.8807



In [33]:
cv = CountVectorizer(ngram_range=(1,2), binary=True)
text_counts = cv.fit_transform(data['text'])
test_text_counts = cv.transform(test['text'])

tfidf = TfidfVectorizer()
text_tfidf = tfidf.fit_transform(data['text'])
test_text_tfidf = tfidf.transform(test['text'])

count_vect_df = pd.DataFrame(text_counts.todense())
tfidf_df = pd.DataFrame(text_tfidf.todense())

test_count_vect_df = pd.DataFrame(test_text_counts.todense())
test_tfidf_df = pd.DataFrame(test_text_tfidf.todense())

In [34]:
train_df = pd.concat([count_vect_df, tfidf_df, sentiments], axis=1)
test_df = pd.concat([test_count_vect_df, test_tfidf_df, t_sentiments],axis=1)

In [21]:
hc_data = data[data['is_humor']==1]
sentiments = sentiments.loc[data['is_humor']==1,['positive','negative']]
sentiments.shape

(4932, 2)

In [30]:
# count vectorizer
cv = CountVectorizer(ngram_range=(1,2), binary=True)
text_counts = cv.fit_transform(hc_data['text'])

tfidf = TfidfVectorizer()
text_tfidf = tfidf.fit_transform(hc_data['text'])

count_vect_df = pd.DataFrame(text_counts.todense())
tfidf_df = pd.DataFrame(text_tfidf.todense())

count_vect_df = count_vect_df.reset_index(drop=True)
tfidf_df = tfidf_df.reset_index(drop=True)
sentiments = sentiments.reset_index(drop=True)

df = pd.concat([count_vect_df, tfidf_df, sentiments], axis=1)

X_train, X_test, y_train, y_test = train_test_split(df, hc_data['humor_controversy'], test_size=0.25, random_state=42)

# Modeling
CNB = MultinomialNB()
CNB.fit(X_train, y_train)
predicted = CNB.predict(X_test)
accuracy_score = metrics.accuracy_score(y_test, predicted)
f1_score = metrics.f1_score(y_test, predicted)
print(f'1-gram Accuracy: {accuracy_score:.4f}')
print(f'1-gram F1-Score: {f1_score:.4f}')
print()

1-gram Accuracy: 0.4834
1-gram F1-Score: 0.4783



### SentiwordNet Standard Deviation

In [31]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('sentiwordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saikoukuntla/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/saikoukuntla/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/saikoukuntla/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/saikoukuntla/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [32]:
import numpy as np
import pandas as pd
import gc
import sys
import scipy
from nltk import word_tokenize, pos_tag
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.metrics import roc_auc_score, f1_score

In [33]:
data = pd.read_csv('train.csv')

In [35]:
# Tokenize sentences
text = data['text']
tokenized_text = [word_tokenize(i) for i in text]

In [37]:
# POS tag and lemmatize words, then count pos/neg words in each entry
# See https://nlpforhackers.io/sentiment-analysis-intro/ for details

lemmatizer = WordNetLemmatizer()

def penn_to_wn(tag):
    """
    Convert between the PennTreebank tags to simple Wordnet tags
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

def swn_polarity(text):
    """
    Return a pos and neg score
    """
 
    pos_score = np.zeros(len(text))
    neg_score = np.zeros(len(text))
    pos_std = np.zeros(len(text))
    neg_std = np.zeros(len(text))
    
    tokens_count = 0

    for i in range(0,len(text)):
        pos=[]
        neg=[]
        tagged_entry = pos_tag(text[i])
 
        for word, tag in tagged_entry:
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
 
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
 
            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
 
            # Take the first sense, the most common
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
 
            pos.append(swn_synset.pos_score()) 
            neg.append(swn_synset.neg_score())
        
        pos = np.array(pos)
        neg = np.array(neg)
        pos_score[i] = pos.sum()
        neg_score[i] = neg.sum()
        pos_std[i] = pos.std()
        neg_std[i] = neg.std()

    return pos_score, neg_score, pos_std, neg_std

In [38]:
pos_score, neg_score, pos_std, neg_std = swn_polarity(tokenized_text)
print(pos_score, neg_score, pos_std, neg_std)

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(
  ret = ret.dtype.type(ret / rcount)


[0.875 0.75  0.    ... 0.125 0.25  1.125] [0.25  0.    0.375 ... 0.    0.25  0.125] [0.23405972 0.13122266 0.         ... 0.04658475 0.0931695  0.22041775] [0.07856742 0.         0.13122266 ... 0.         0.0931695  0.04133986]


In [39]:
data['pos_score'] = pos_score
data['neg_score'] = neg_score
data['pos_std'] = pos_std
data['neg_std'] = neg_std

In [51]:
data['pos_std'] = data['pos_std'].fillna(0)
data['neg_std'] = data['neg_std'].fillna(0)

In [52]:
data.head()

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,pos_score,neg_score,pos_std,neg_std
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.2,0.875,0.25,0.23406,0.078567
1,2,A man inserted an advertisement in the classif...,1,2.5,1.0,1.1,0.75,0.0,0.131223,0.0
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.4,0.0,0.375,0.0,0.131223
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.0,1.75,0.5,0.337483,0.14374
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.1,1.125,1.5,0.245566,0.32476


In [53]:
X = data[['text', 'pos_score', 'neg_score', 'pos_std', 'neg_std']]
y = data['is_humor']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [54]:
_treebank_word_tokenizer = TreebankWordTokenizer()

def word_tokenize(text, language='english'):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    """
    sentences = sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ] 

In [55]:
tokenizer = TreebankWordTokenizer()
vectorizer = CountVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize)
count_vec_train = vectorizer.fit_transform(X_train['text'])
count_vec_test = vectorizer.transform(X_test['text'])

In [56]:
def stack_sentiment(count_vec, pos_score, neg_score, pos_std, neg_std):
    columns = [str(i) for i in range(count_vec.shape[1])]
    stack = pd.DataFrame(count_vec.toarray(), columns=columns)
    stack['pos_score'] = pos_score.to_numpy()
    stack['neg_score'] = neg_score.to_numpy()
    stack['pos_std'] = pos_std.to_numpy()
    stack['neg_std'] = neg_std.to_numpy()
    return stack

In [57]:
X_train_stack = stack_sentiment(count_vec_train, X_train['pos_score'], X_train['neg_score'], X_train['pos_std'], X_train['neg_std'])

In [58]:
X_test_stack =  stack_sentiment(count_vec_test, X_test['pos_score'], X_test['neg_score'], X_test['pos_std'], X_test['neg_std'])

In [59]:
# Multinomial NB
mnb = MultinomialNB(alpha=0.2).fit(X_train_stack, y_train)
print(f"AUC: {roc_auc_score(y_test, mnb.predict_proba(X_test_stack)[:, 1])}")
print(f"Accuracy: {mnb.score(X_test_stack, y_test)}")
print(f"F1 score: {f1_score(y_test, mnb.predict(X_test_stack))}")

AUC: 0.945377076083497
Accuracy: 0.877
F1 score: 0.9023034154090547


In [60]:
# Complement NB
cnb = ComplementNB(alpha=0.2).fit(X_train_stack, y_train)
print(f"AUC: {roc_auc_score(y_test, cnb.predict_proba(X_test_stack)[:, 1])}")
print(f"Accuracy: {cnb.score(X_test_stack, y_test)}")
print(f"F1 score: {f1_score(y_test, cnb.predict(X_test_stack))}")

AUC: 0.9453490548381059
Accuracy: 0.88
F1 score: 0.904153354632588
