In [1]:
import pandas as pd
import numpy as np
import itertools
import xgboost
import tensorflow as tf
from scipy.sparse import hstack
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.models import Model
from keras.callbacks import EarlyStopping
from tqdm import tqdm
tqdm.pandas()
import pickle
import joblib

  from pandas import Panel


## Data input

In [2]:
df1 = pd.read_csv('data/twitter_sarcasm.csv').reset_index(drop=True)
df1.dropna(subset=['comment'], inplace=True)

## EDA and Visualization

In [3]:
df1.loc[df['label'] == 1, 'comment'].str.len().apply(np.log1p).hist(label='sarcastic', alpha=.5)
df1.loc[df['label'] == 0, 'comment'].str.len().apply(np.log1p).hist(label='normal', alpha=.5)
plt.legend()

NameError: name 'df' is not defined

In [None]:
from wordcloud import WordCloud, STOPWORDS
wordcloud = WordCloud(background_color='black', stopwords = STOPWORDS,
                max_words = 200, max_font_size = 100, 
                random_state = 1, width=800, height=400)

In [None]:
plt.figure(figsize=(16, 12))
wordcloud.generate(str(df1.loc[df1['label'] == 1, 'comment']))
plt.imshow(wordcloud)

In [None]:
plt.figure(figsize=(16, 12))
wordcloud.generate(str(df1.loc[df1['label'] == 0, 'comment']))
plt.imshow(wordcloud)

In [None]:
def plot_confusion_matrix(actual, predicted, classes,                          
                        normalize=False,
                          title='Confusion matrix', figsize=(7,7),
                          cmap=plt.cm.Blues, path_to_save_fig=None):

    cm = confusion_matrix(actual, predicted).T
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    plt.figure(figsize=figsize)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Predicted label')
    plt.xlabel('True label')
    
    if path_to_save_fig:
        plt.savefig(path_to_save_fig, dpi=300, bbox_inches='tight')
        

## Test train split

In [None]:
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(df1['comment'],df1['label'], random_state=17)

## Logistic regression with TFIDF vectorizer - content based approach

In [None]:
tf_idf1 = TfidfVectorizer(ngram_range=(1, 2), max_features=50000, min_df=2)

logit1 = LogisticRegression(C=1, n_jobs=4, solver='lbfgs', 
                           random_state=17, verbose=1)

tfidf_logit_pipeline1 = Pipeline([('tf_idf', tf_idf1), 
                                 ('logit', logit1)])
tfidf_logit_pipeline1.fit(X_train1, y_train1)

In [None]:
valid_pred_x = tfidf_logit_pipeline1.predict(X_valid1)
accuracy_score(y_valid1, valid_pred_x)

In [None]:

plot_confusion_matrix(y_valid1, valid_pred_x, 
                      tfidf_logit_pipeline1.named_steps['logit'].classes_, figsize=(8, 8))


## XGBoost Classifier with TFIDF vectorizer

In [None]:
xgboost1 = xgboost.XGBClassifier()

xgboost_pipeline1 = Pipeline([('tf_idf', tf_idf1), 
                                 ('xgboost', xgboost1)])
xgboost_pipeline1.fit(X_train1, y_train1)

In [None]:
valid_pred_y = xgboost_pipeline1.predict(X_valid1)
accuracy_score(y_valid1, valid_pred_y)

## Recurrent Neural Networks using with TFIDF vectorizer

In [None]:

X1 = df1.comment
Y1 = df1.label
le = LabelEncoder() 
Y1 = le.fit_transform(Y1)
Y1 = Y1.reshape(-1,1)

In [None]:
X_train2,X_test2,Y_train2,Y_test2 = train_test_split(X1,Y1,test_size=0.2)

In [None]:

max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train2)
sequences = tok.texts_to_sequences(X_train2)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [None]:

def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [None]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [None]:

model.fit(sequences_matrix,Y_train2,batch_size=100,epochs=5,
          validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

In [None]:
test_sequences_x = tok.texts_to_sequences(X_test2)
test_sequences_matrix_x = sequence.pad_sequences(test_sequences_x,maxlen=max_len)

In [None]:
accr1 = model.evaluate(test_sequences_matrix_x,Y_test2)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr1[0],accr1[1]))