In [1]:
import pandas as pd, numpy as np, re, time
from nltk.stem.porter import PorterStemmer
import joblib

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Loading data from json file
data = pd.read_json('data/Sarcasm_Headlines_Dataset.json', lines = True)

In [4]:
data

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
...,...,...,...
26704,https://www.huffingtonpost.com/entry/american-...,american politics in moral free-fall,0
26705,https://www.huffingtonpost.com/entry/americas-...,america's best 20 hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations and obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0


In [5]:
print(data.isnull().any(axis = 0))

article_link    False
headline        False
is_sarcastic    False
dtype: bool


In [6]:
# Relacing special symbols and digits in headline column
# re stands for Regular Expression
data['headline'] = data['headline'].apply(lambda s : re.sub('[^a-zA-Z]', ' ', s))

In [7]:
# getting features and labels
features = data['headline']
labels = data['is_sarcastic']

In [8]:
# Stemming our data
ps = PorterStemmer()
features = features.apply(lambda x: x.split())
features = features.apply(lambda x : ' '.join([ps.stem(word) for word in x]))

In [9]:
# vectorizing the data with maximum of 5000 features
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features = 5000)
features = list(features)
features = tv.fit_transform(features).toarray()

In [10]:
# getting training and testing data
features_train, features_test, labels_train, labels_test = train_test_split(features, 
                                                                            labels, test_size = .05, random_state = 0)

In [11]:
# model 1:-
# Using linear support vector classifier
lsvc = LinearSVC()
# training the model
lsvc.fit(features_train, labels_train)
# getting the score of train and test data
print('Linear Support Vector Classifier: Training  Score')
print(lsvc.score(features_train, labels_train)) # 90.93
print('Linear Support Vector Machine Classifier: Test Score')
print(lsvc.score(features_test, labels_test))   # 83.75
# model 2:-
# Using Gaussuan Naive Bayes
gnb = GaussianNB()
print('\nGaussian Naive Bayes: Traning Score')
gnb.fit(features_train, labels_train)
print(gnb.score(features_train, labels_train))  # 78.86
print('Gaussian Naive Bayes: Test Score')
print(gnb.score(features_test, labels_test))    # 73.80
# model 3:-
# Logistic Regression
lr = LogisticRegression()
lr.fit(features_train, labels_train)
print('\nLogistic Regression: Training Score')
print(lr.score(features_train, labels_train))   # 88.16
print('Logistic Regression: Test Score')
print(lr.score(features_test, labels_test))     # 83.08
# model 4:-
# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators = 10, random_state = 0)
rfc.fit(features_train, labels_train)
print('\nRandom Forest Classifier: Training Score')
print(rfc.score(features_train, labels_train))  # 98.82
print('Forest Classifier: Test Score')
print(rfc.score(features_test, labels_test))    # 79.71

Linear Support Vector Classifier: Training  Score
0.9093524612777362
Linear Support Vector Machine Classifier: Test Score
0.8375748502994012

Gaussian Naive Bayes: Traning Score
0.7886335868836952
Gaussian Naive Bayes: Test Score
0.7380239520958084

Logistic Regression: Training Score
0.8816458440074094
Logistic Regression: Test Score
0.8308383233532934

Random Forest Classifier: Training Score
0.9882946439128207
Forest Classifier: Test Score
0.7971556886227545


In [12]:
model = lr

In [14]:
filename = 'data/lr_model_logreg.sav'
joblib.dump(model, filename)

['data/lr_model_logreg.sav']

In [15]:
loaded_model = joblib.load(filename)

In [16]:
tfidf_model = tv

In [19]:
joblib.dump(tfidf_model.vocabulary_, 'data/tfidf.pkl')

['data/tfidf.pkl']

In [20]:
joblib.dump(tfidf_model, 'data/tfidf_not_vocab.pkl')

['data/tfidf_not_vocab.pkl']