In [97]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline, FeatureUnion, make_union, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import RobustScaler, Normalizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords


ImportError: No module named matplotlib.pyplot

In [2]:
#Load corpora
sarcastic = pd.read_csv('../data/sarcastic.csv') 
non_sarcastic = pd.read_csv('../data/non_sarcastic.csv') 

In [3]:
                                                    #Pre-processing
       
#Remove '/s' annotation
sarcastic['body'] = sarcastic['body'].apply(lambda x: x.replace('/s', ''))

#Construct dataset
dataset = pd.DataFrame()
dataset['comment'] = sarcastic['body'].append(non_sarcastic['body'])
dataset['subreddit'] = pd.Series(sarcastic['subreddit']).append(pd.Series(non_sarcastic['subreddit']))
dataset['score'] = pd.Series(sarcastic['score']).append(pd.Series(non_sarcastic['score']))
sarcastic_labels = pd.Series(np.full(len(sarcastic), 1.0))
non_sarcastic_labels = pd.Series(np.full(len(non_sarcastic), 0.0))
dataset['target'] = sarcastic_labels.append(non_sarcastic_labels)

#Remove deleted
dataset = dataset[dataset.comment != '[deleted]']

#Reset index
dataset = dataset.reset_index()

dataset


Unnamed: 0,index,comment,subreddit,score,target
0,0,Having sex with my girlfriend at least 5 times...,ForeverAlone,15,1.0
1,1,"Awesome case, plus those blue LEDs make your f...",pcmasterrace,2,1.0
2,2,I don't know man. [This](http://www.reddit.com...,WTF,3,1.0
3,3,because he is famous\n\nEdit: oh yeah,ChivalryGame,0,1.0
4,4,&gt; My deputies did their job to the fullest ...,news,1,1.0
5,5,Because what better way to woo affection from ...,cringepics,15,1.0
6,6,Slightly over half.. last I checked the world ...,TheBluePill,3,1.0
7,7,I know right. Reddit is a key component in the...,AskReddit,1,1.0
8,8,&gt; I still don't understand why anyone votes...,ukpolitics,2,1.0
9,9,yea that'll get them back in the game,nba,2,1.0


In [4]:
                                                #Pre-processing
#Remove non-English comments

def calculate_ratios(text):
    languages_ratios = {}

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) 

    return languages_ratios


def is_English(text):
    ratios = calculate_ratios(text)

    most_rated_language = max(ratios, key=ratios.get)

    return True if most_rated_language == 'english' else False

dataset = dataset[dataset.apply(lambda x: is_English(x['comment']), axis=1)]



In [5]:
#Convert numerical columns to numbers
dataset.apply(lambda x: pd.to_numeric(x, errors='ignore'))

#Split data (50%/30%/20%)
msk = np.random.rand(len(dataset)) < 0.5
training_data = dataset[msk]
test_data = dataset[~msk]
msk = np.random.rand(len(test_data)) < 0.3
validation_data = test_data[msk]
test_data = test_data[~msk]

In [6]:
                                                #Custom Transformers

class ItemSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
    
class ArrayCaster(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, data):
        print data.shape
        print np.transpose(np.matrix(data)).shape
        return np.transpose(np.matrix(data))
    
    


In [92]:
#algorithm = MultinomialNB()
#algorithm = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
#algorithm = LogisticRegression(C=1e5)

n = 2
classifier = MultinomialNB()

ngram_count_vectorizer = Pipeline([
        ('extract', ItemSelector(key='comment')),
        ('ngram_tf_idf', TfidfVectorizer(ngram_range=(1, n), min_df=1))
    ])

subreddit_vectorizer = Pipeline([
        ('extract', ItemSelector(key='subreddit')),
        ('ngram_tf_idf', TfidfVectorizer(ngram_range=(1, n), min_df=1))
    ])

#N-grams and subreddit
pipeline2 = Pipeline([
  ('features', FeatureUnion([                   
    ('ngrams', ngram_count_vectorizer),
    ('subreddits', subreddit_vectorizer),
  ])),
  ('classifier', classifier)
])


text_clf = pipeline2.fit(training_data, training_data.target)

predicted = text_clf.predict(validation_data)

In [93]:
#                                                 #Model training

# #algorithm = MultinomialNB()
# #algorithm = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
# #algorithm = LogisticRegression(C=1e5)

# n = 3
# classifier = LogisticRegression(C=1e5)

# # ngram_count_vectorizer = Pipeline([
# #         ('extract', ItemSelector(key='comment')),
# #         ('ngram_tf_idf', TfidfVectorizer(ngram_range=(1, n), min_df=1))
# #     ])

# # pipeline0 = Pipeline([
# #          ('ngram_tf_idf', TfidfVectorizer(ngram_range=(1, n), min_df=1)),
# #          ('clf', classifier)
# #     ])

# # text_clf = pipeline0.fit(training_data.comment, training_data.target)
# # predicted = text_clf.predict(validation_data.comment)
# #np.mean(predicted == test_data.target) 

# score_vectorizer = Pipeline([
#         ('extract', ItemSelector(key='score')),
#         ('caster', ArrayCaster()),
#         ('scale', Normalizer())
#     ])


# subreddit_vectorizer = Pipeline([
#         ('extract', ItemSelector(key='subreddit_id')),
#         ('caster', ArrayCaster())
#     ])


# #N-grams and score
# # pipeline1 = Pipeline([
# #   ('features', FeatureUnion([                   
# #     ('ngrams', ngram_count_vectorizer),
# #     ('normalized_scores', score_vectorizer),
# #   ])),
# #   ('classifier', classifier)
# # ])

# # #N-grams and subreddit
# pipeline2 = Pipeline([
#   ('features', FeatureUnion([                   
#     ('ngrams', ngram_count_vectorizer),
#     ('subreddits', subreddit_vectorizer),
#   ])),
#   ('classifier', classifier)
# ])

# # #N-grams, score and subreddit
# # pipeline3 = Pipeline([
# #   ('features', FeatureUnion([                   
# #     ('ngrams', ngram_count_vectorizer),
# #     ('normalized_scores', score_vectorizer),
# #     ('subreddits', subreddit_vectorizer),
# #   ])),
# #   ('classifier', classifier)
# # ])

# text_clf = pipeline2.fit(training_data, training_data.target)

# predicted = text_clf.predict(validation_data)
# # np.mean(predicted == test_data.target) 

In [101]:
                                                #Results
    
target_names = ['sarcastic', 'non-sarcastic']
print(classification_report(validation_data.target, predicted, target_names=target_names))
accuracy_score(validation_data.target, predicted)
tn, fp, fn, tp = confusion_matrix(validation_data.target, predicted).ravel()
tn, fp, fn, tp

               precision    recall  f1-score   support

    sarcastic       0.79      0.70      0.74      3738
non-sarcastic       0.75      0.83      0.79      4083

  avg / total       0.77      0.77      0.76      7821



(2600, 1138, 697, 3386)