In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
import re
import nltk
# Downloads every NLTK resource. The code visible in this notebook only uses the
# "stopwords" corpus and WordNetLemmatizer.
# NOTE(review): consider narrowing this to the resources actually needed — confirm
# no other cell relies on additional corpora first.
nltk.download("all")

In [None]:
pip install beautifulsoup4



In [None]:
# Load the complete dataset (all label columns) from the comma-separated file.
train = pd.read_csv("Sarcasm_Dataset.csv", sep=",")

In [None]:
# Display the raw table as loaded, including the per-category label columns.
train

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
3463,3463,The population spike in Chicago in 9 months is...,0,,,,,,
3464,3464,You'd think in the second to last English clas...,0,,,,,,
3465,3465,I’m finally surfacing after a holiday to Scotl...,0,,,,,,
3466,3466,Couldn't be prouder today. Well done to every ...,0,,,,,,


# Binary Classification

In [None]:
# For the binary task only the tweet text and the `sarcastic` label are read.
col_list = ["tweet", "sarcastic"]
df = pd.read_csv("Sarcasm_Dataset.csv", usecols=col_list)

df

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


In [None]:
# Print the raw text of the tweet with index label 0
print(df.loc[0, 'tweet'])

The only thing I got from college is a caffeine addiction


# Text Pre-Processing Pipeline

In [None]:
# Import the corpus reader under an alias so the name `stopwords` is bound only
# once: to the list of English stopwords that clean_tweets uses as its default.
from nltk.corpus import stopwords as nltk_stopwords
stopwords=nltk_stopwords.words("english")

# Shared lemmatizer instance used by clean_tweets.
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
def clean_tweets(raw_text,stopwords=stopwords):
    '''Clean one raw tweet and return it as a space-separated string of tokens.

    Steps, in order: strip HTML markup, replace every character that is not an
    ASCII letter with a space, lowercase and split on whitespace, drop
    stopwords, lemmatize each remaining token as a verb, and finally drop
    one-character tokens.

    Parameters
    ----------
    raw_text : str
        The raw tweet text.
    stopwords : iterable of str
        Words to discard (compared against lowercased tokens). Defaults to the
        module-level stopword list.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    '''
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed alongside BeautifulSoup.
    html_removed_text = BeautifulSoup(raw_text, "html.parser").get_text()

    # Digits, punctuation, emoji, etc. all become spaces.
    character_only_text = re.sub("[^a-zA-Z]", " ", html_removed_text)

    tokens = character_only_text.lower().split()

    # A set gives constant-time membership checks while filtering.
    stopword_set = set(stopwords)
    kept_tokens = [token for token in tokens if token not in stopword_set]

    # 'v' asks the lemmatizer to treat every token as a verb.
    lemmatized_tokens = [wordnet_lemmatizer.lemmatize(token, 'v') for token in kept_tokens]

    # Filter the *lemmatized* tokens. The earlier version filtered the
    # pre-lemmatization list here, which silently threw the lemmas away.
    return " ".join(token for token in lemmatized_tokens if len(token) > 1)

In [None]:
# Sanity check: clean the first two tweets and show the result for index 0
sample_cleaned = df.loc[:1,"tweet"].apply(clean_tweets)
sample_cleaned[0]

'thing got college caffeine addiction'

In [None]:
# Original (uncleaned) tweet at index 0, for comparison with the cleaned text
df.loc[0,"tweet"]

'The only thing I got from college is a caffeine addiction'

In [None]:
# Drop rows with a missing tweet, modifying df in place, before the cleaner is
# applied to the whole column.
# NOTE(review): in-place mutation makes this cell's effect invisible on re-display
# of earlier outputs; kept as-is to avoid changing downstream behavior.
df.dropna(subset = ["tweet"], inplace=True)

In [None]:
# Run the cleaner over every tweet and store the result in a new column
df['clean_tweet'] = df['tweet'].map(clean_tweets)
df

  ' that document to Beautiful Soup.' % decoded_markup


Unnamed: 0,tweet,sarcastic,clean_tweet
0,The only thing I got from college is a caffein...,1,thing got college caffeine addiction
1,I love it when professors draw a big question ...,1,love professors draw big question mark next an...
2,Remember the hundred emails from companies whe...,1,remember hundred emails companies covid starte...
3,Today my pop-pop told me I was not “forced” to...,1,today pop pop told forced go college okay sure...
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,volphancarol littlewhitty mysticalmanatee also...
...,...,...,...
3463,The population spike in Chicago in 9 months is...,0,population spike chicago months ridiculous
3464,You'd think in the second to last English clas...,0,think second last english class year prof woul...
3465,I’m finally surfacing after a holiday to Scotl...,0,finally surfacing holiday scotland difficult d...
3466,Couldn't be prouder today. Well done to every ...,0,prouder today well done every student got gcse...


In [None]:
from collections import Counter

# Token frequencies across all cleaned tweets
word_counter = Counter(
    token
    for tweet in df['clean_tweet']
    for token in tweet.split()
)

In [None]:
# Four most frequent tokens over all cleaned tweets
word_counter.most_common(4)

[('co', 295), ('https', 282), ('like', 271), ('love', 217)]

In [None]:
# Token frequencies for tweets labelled sarcastic (sarcastic == 1);
# kept under the name `negative_word_counter`
negative_word_counter = Counter(
    token
    for tweet in df.loc[df['sarcastic']==1,'clean_tweet']
    for token in tweet.split()
)

# Token frequencies for tweets labelled non-sarcastic (sarcastic == 0);
# kept under the name `positive_word_counter`
positive_word_counter = Counter(
    token
    for tweet in df.loc[df['sarcastic']==0,'clean_tweet']
    for token in tweet.split()
)

In [None]:
# Ten most frequent tokens in this counter (built from rows where sarcastic == 1)
negative_word_counter.most_common(10)

[('love', 84),
 ('like', 70),
 ('get', 59),
 ('day', 59),
 ('one', 44),
 ('time', 44),
 ('people', 42),
 ('co', 41),
 ('really', 41),
 ('https', 39)]

In [None]:
# Ten most frequent tokens in this counter (built from rows where sarcastic == 0)
positive_word_counter.most_common(10)

[('co', 254),
 ('https', 243),
 ('like', 201),
 ('one', 164),
 ('time', 156),
 ('get', 152),
 ('people', 148),
 ('love', 133),
 ('day', 114),
 ('really', 107)]

# Baseline Model
# Here we see a high overlap in the top unigrams of the two categories (sarcastic vs. non-sarcastic, named "negative" and "positive" in the code above).
# The next thing to try is therefore bigrams or trigrams.

# Bag of Words - Model
# Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

Split the data

In [None]:
# Predictor: cleaned tweet text. Target: binary `sarcastic` label.
X, y = df['clean_tweet'], df['sarcastic']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
def create_vector(vectorizer,data):
    '''Vectorize ``data`` and return the result as an array.

    ``data`` is converted with ``data.tolist()``, passed to
    ``vectorizer.transform``, and the transformed result's ``toarray()``
    output is returned. The vectorizer is only used, never fitted, here.
    '''
    return vectorizer.transform(data.tolist()).toarray()

In [None]:
# Bag-of-words vectorizer capped at 1000 features. It is fitted on the training
# split only, so the test split does not influence the vocabulary.
vectorizer = CountVectorizer(max_features=1000)
vectorizer.fit(X_train.tolist())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=1000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [None]:
# Vectorize both splits with the same fitted vectorizer (train first, then test)
X_train_vector, X_test_vector = (
    create_vector(vectorizer, split) for split in (X_train, X_test)
)

In [None]:
# Shapes of the vectorized test and train splits, in that order
X_test_vector.shape, X_train_vector.shape

((1145, 1000), (2322, 1000))

# Create ML Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Seed the forest so the fitted model and the report below are reproducible
# across re-runs; 42 is the same seed used for the train/test split.
forest=RandomForestClassifier(random_state=42)
forest.fit(X_train_vector,y_train)

# Evaluate on the held-out split and print per-class precision/recall/F1.
y_pred=forest.predict(X_test_vector)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       840
           1       0.36      0.19      0.25       305

    accuracy                           0.69      1145
   macro avg       0.55      0.53      0.53      1145
weighted avg       0.65      0.69      0.66      1145



# TFIDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF re-weighting applied on top of the count vectors, with IDF smoothing
# switched off via smooth_idf=False.
transformer = TfidfTransformer(smooth_idf=False)

In [None]:
# Fit the transformer on the training count vectors and transform them in one call
tfidf = transformer.fit_transform(X_train_vector)