In [1]:
import io
import sys
import os
import numpy as np
import pandas as pd
import nltk
import gensim
import csv, collections
from textblob import TextBlob
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle



Define a class to load the SentimentWordnet and write methods to calculate the scores

In [2]:
class load_senti_word_net(object):
    """
    constructor to load the file and read the file as CSV
    6 columns - pos, ID, PosScore, NegScore, synsetTerms, gloss
    synsetTerms can have multiple similar words like abducting#1 abducent#1 and will read each one and calculaye the scores
    """
    
    def __init__(self):
        sent_scores = collections.defaultdict(list)
        with io.open("/home/surajr/Downloads/SentiWordNet_3.0.0_20130122.txt") as fname:
            file_content = csv.reader(fname, delimiter='\t',quotechar='"')
            
            for line in file_content:                
                if line[0].startswith('#') :
                    continue                    
                pos, ID, PosScore, NegScore, synsetTerms, gloss = line
                for terms in synsetTerms.split(" "):
                    term = terms.split("#")[0]
                    term = term.replace("-","").replace("_","")
                    key = "%s/%s"%(pos,term.split("#")[0])
                    try:
                        sent_scores[key].append((float(PosScore),float(NegScore)))
                    except:
                        sent_scores[key].append((0,0))
                    
        for key, value in sent_scores.iteritems():
            sent_scores[key] = np.mean(value,axis=0)
        
        self.sent_scores = sent_scores    
     
    """
    For a word,
    nltk.pos_tag(["Suraj"])
    [('Suraj', 'NN')]
    """
    
    def score_word(self, word):
        pos = nltk.pos_tag([word])[0][1]
        return self.score(word, pos)
    
    def score(self,word, pos):
        """
        Identify the type of POS, get the score from the senti_scores and return the score
        """
        
        if pos[0:2] == 'NN':
            pos_type = 'n'
        elif pos[0:2] == 'JJ':
            pos_type = 'a'
        elif pos[0:2] =='VB':
            pos_type='v'
        elif pos[0:2] =='RB':
            pos_type = 'r'
        else:
            pos_type =  0
            
        if pos_type != 0 :    
            loc = pos_type+'/'+word
            score = self.sent_scores[loc]
            if len(score)>1:
                return score
            else:
                return np.array([0.0,0.0])
        else:
            return np.array([0.0,0.0])
        
    """
    Repeat the same for a sentence
    nltk.pos_tag(word_tokenize("My name is Suraj"))
    [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Suraj', 'NNP')]    
    """    
        
    def score_sentencce(self, sentence):
        pos = nltk.pos_tag(sentence)
        print (pos)
        mean_score = np.array([0.0, 0.0])
        for i in range(len(pos)):
            mean_score += self.score(pos[i][0], pos[i][1])
            
        return mean_score
    
    def pos_vector(self, sentence):
        pos_tag = nltk.pos_tag(sentence)
        vector = np.zeros(4)
        
        for i in range(0, len(pos_tag)):
            pos = pos_tag[i][1]
            if pos[0:2]=='NN':
                vector[0] += 1
            elif pos[0:2] =='JJ':
                vector[1] += 1
            elif pos[0:2] =='VB':
                vector[2] += 1
            elif pos[0:2] == 'RB':
                vector[3] += 1
                
        return vector
            

Now let's extract the features

In [3]:
porter = nltk.PorterStemmer()
sentiments = load_senti_word_net()

In [4]:
import replace_emoji

In [5]:
def gram_features(features,sentence):
    sentence_rep = replace_emoji.replace_reg(str(sentence))
    token = nltk.word_tokenize(sentence_rep)
    #token = [porter.stem(i.lower()) for i in token]        
    
    bigrams = nltk.bigrams(token)
    bigrams = [tup[0] + ' ' + tup[1] for tup in bigrams]
    grams = token + bigrams
    #print (grams)
    #for t in grams:
        #features['contains(%s)'%t]=1.0
    

In [6]:
import string
def sentiment_extract(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)    
    #token = [porter.stem(i.lower()) for i in token]   
    mean_sentiment = sentiments.score_sentencce(token)
    features["Positive Sentiment"] = mean_sentiment[0]
    features["Negative Sentiment"] = mean_sentiment[1]
    features["sentiment"] = mean_sentiment[0] - mean_sentiment[1]
    #print(mean_sentiment[0], mean_sentiment[1])
    
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in token]).strip())
        features["Blob Polarity"] = text.sentiment.polarity
        features["Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["Blob Polarity"] = 0
        features["Blob Subjectivity"] = 0
        print("do nothing")
        
    
    first_half = token[0:len(token)/2]    
    mean_sentiment_half = sentiments.score_sentencce(first_half)
    features["positive Sentiment first half"] = mean_sentiment_half[0]
    features["negative Sentiment first half"] = mean_sentiment_half[1]
    features["first half sentiment"] = mean_sentiment_half[0]-mean_sentiment_half[1]
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in first_half]).strip())
        features["first half Blob Polarity"] = text.sentiment.polarity
        features["first half Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["first Blob Polarity"] = 0
        features["first Blob Subjectivity"] = 0
        print("do nothing")
    
    second_half = token[len(token)/2:]
    mean_sentiment_sechalf = sentiments.score_sentencce(second_half)
    features["positive Sentiment second half"] = mean_sentiment_sechalf[0]
    features["negative Sentiment second half"] = mean_sentiment_sechalf[1]
    features["second half sentiment"] = mean_sentiment_sechalf[0]-mean_sentiment_sechalf[1]
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in second_half]).strip())
        features["second half Blob Polarity"] = text.sentiment.polarity
        features["second half Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["second Blob Polarity"] = 0
        features["second Blob Subjectivity"] = 0
        print("do nothing")  
    
    
    

In [7]:
features = {}
sentiment_extract(features,"a long narrow opening")

[('a', 'DT'), ('long', 'JJ'), ('narrow', 'JJ'), ('opening', 'NN')]
[('a', 'DT'), ('long', 'JJ')]
[('narrow', 'JJ'), ('opening', 'NN')]


In [8]:
def pos_features(features,sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    #token = [ porter.stem(each.lower()) for each in token]
    pos_vector = sentiments.pos_vector(token)
    for j in range(len(pos_vector)):
        features['POS_'+str(j+1)] = pos_vector[j]
    print ("done")
    

In [9]:
features = {}
pos_features(features,"a long narrow opening")

done


In [10]:
def capitalization(features,sentence):
    count = 0
    for i in range(len(sentence)):
        count += int(sentence[i].isupper())
    features['Capitalization'] = int(count > 3)
    print (count)

In [11]:
features = {}
capitalization(features,"A LoNg NArrow opening")

5


In [15]:
import topic
topic_mod = topic.topic(nbtopic=200,alpha='symmetric')

In [16]:
topic_mod = topic.topic(model=os.path.join('topics.tp'),dicttp=os.path.join('topics_dict.tp'))

In [17]:
def topic_feature(features,sentence,topic_modeler):    
    topics = topic_modeler.transform(sentence)    
    for j in range(len(topics)):
        features['Topic :'] = topics[j][1]
    

In [18]:
topic_feature(features,"A LoNg NArrow opening",topic_mod)

In [19]:
def get_features(sentence, topic_modeler):
    features = {}
    gram_features(features,sentence)
    pos_features(features,sentence)
    sentiment_extract(features, sentence)
    capitalization(features,sentence)
    topic_feature(features, sentence,topic_modeler)
    return features

In [17]:
df = pd.DataFrame()
df = pd.read_csv("dataset_csv.csv", header=0, sep='\t')
df.head()

Unnamed: 0,tweets,label
0,I love working midnights tweet,1
1,I hate when I buy a bag of air and there's chi...,1
2,my grandad always sounds so ill when i speak t...,0
3,"I realize I'm annoying to everyone, so I won't...",0
4,I love when I find these dudes on vine!! #Foll...,1


In [1]:
featureset = []

In [17]:
import re

for i in range(0,df.size):
    temp = str(df["tweets"][i])
    temp = re.sub(r'[^\x00-\x7F]+','',temp)
    featureset.append((get_features(temp,topic_mod), df["label"][i]))

done
[('I', 'PRP'), ('love', 'VBP'), ('working', 'VBG'), ('midnights', 'NNS'), ('tweet', 'NN')]
[('I', 'PRP'), ('love', 'VBP')]
[('working', 'VBG'), ('midnights', 'NNS'), ('tweet', 'NN')]
1
done
[('I', 'PRP'), ('hate', 'VBP'), ('when', 'WRB'), ('I', 'PRP'), ('buy', 'VBP'), ('a', 'DT'), ('bag', 'NN'), ('of', 'IN'), ('air', 'NN'), ('and', 'CC'), ('there', 'EX'), ("'s", 'VBZ'), ('chips', 'NNS'), ('in', 'IN'), ('it', 'PRP'), ('#', '#'), ('not', 'RB')]
[('I', 'PRP'), ('hate', 'VBP'), ('when', 'WRB'), ('I', 'PRP'), ('buy', 'VBP'), ('a', 'DT'), ('bag', 'NN'), ('of', 'IN')]
[('air', 'NN'), ('and', 'CC'), ('there', 'EX'), ("'s", 'VBZ'), ('chips', 'NNS'), ('in', 'IN'), ('it', 'PRP'), ('#', '#'), ('not', 'RB')]
2
done
[('my', 'PRP$'), ('grandad', 'NN'), ('always', 'RB'), ('sounds', 'VBZ'), ('so', 'RB'), ('ill', 'JJ'), ('when', 'WRB'), ('i', 'NN'), ('speak', 'VBP'), ('to', 'TO'), ('him', 'PRP'), ('on', 'IN'), ('the', 'DT'), ('phone', 'NN')]
[('my', 'PRP$'), ('grandad', 'NN'), ('always', 'RB'), ('s

KeyError: 1994

In [20]:
c = []
for i in range(0,len(featureset)):
    c.append(pd.DataFrame(featureset[i][0],index=[i]))

result = pd.concat(c)

In [22]:
result.insert(loc=0,column="label",value='0')

In [23]:
for i in range(0, len(featureset)):
    result["label"].loc[i] = featureset[i][1]   
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [24]:
featureset[39][1]

1

In [25]:
result.to_csv('feature_dataset.csv')

In [20]:
df = pd.DataFrame()
df = pd.read_csv("feature_dataset.csv", header=0)
df.head()

Unnamed: 0.1,Unnamed: 0,label,Blob Polarity,Blob Subjectivity,Capitalization,Negative Sentiment,POS_1,POS_2,POS_3,POS_4,...,first half Blob Subjectivity,first half sentiment,negative Sentiment first half,negative Sentiment second half,positive Sentiment first half,positive Sentiment second half,second half Blob Polarity,second half Blob Subjectivity,second half sentiment,sentiment
0,0,1,0.5,0.6,0,0.03125,2.0,0.0,2.0,0.0,...,0.6,0.59375,0.03125,0.0,0.625,0.0,0.0,0.0,0.0,0.59375
1,1,1,-0.8,0.9,0,1.444444,3.0,0.0,3.0,1.0,...,0.9,-0.638889,0.819444,0.625,0.180556,0.125,0.0,0.0,-0.5,-1.138889
2,2,0,-0.5,1.0,0,0.725,3.0,1.0,2.0,2.0,...,1.0,-0.6,0.725,0.0,0.125,0.0,0.0,0.0,0.0,-0.6
3,3,0,-0.8,0.9,1,0.778409,1.0,0.0,7.0,3.0,...,0.9,-0.541667,0.625,0.153409,0.083333,0.0625,0.0,0.0,-0.090909,-0.632576
4,4,1,0.78125,0.6,1,0.039062,4.0,0.0,2.0,1.0,...,0.6,0.601562,0.039062,0.0,0.640625,0.0,0.0,0.0,0.0,0.601562


In [22]:
%matplotlib inline

import matplotlib as matplot 
import seaborn

result = df

In [None]:
result.groupby(result['label','Unnamed: 0']).size()

In [47]:
resultset = result #backup

In [55]:
X = result.drop(['label','Unnamed: 0','Topic :'],axis=1).values

In [56]:
Y = result['label']

In [25]:
import pickle
import pefile
import sklearn.ensemble as ek
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression

In [45]:
model = { "DecisionTree":tree.DecisionTreeClassifier(max_depth=10),
         "RandomForest":ek.RandomForestClassifier(n_estimators=50),
         "Adaboost":ek.AdaBoostClassifier(n_estimators=50),
         "GradientBoosting":ek.GradientBoostingClassifier(n_estimators=50),
         "GNB":GaussianNB(),
         "Logistic Regression":LinearRegression()   
}

In [57]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y ,test_size=0.2)

In [58]:
X_train = pd.DataFrame(X_train)
X_train = X_train.fillna(X_train.mean())

X_test = pd.DataFrame(X_test)
X_test = X_test.fillna(X_test.mean())

In [59]:
results_algo = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train,y_train.astype(int))
    score = clf.score(X_test,y_test.astype(int))
    print ("%s : %s " %(algo, score))
    results_algo[algo] = score
    

RandomForest : 0.729323308271 
GradientBoosting : 0.769423558897 
GNB : 0.679197994987 
DecisionTree : 0.729323308271 
Adaboost : 0.719298245614 
Logistic Regression : 0.263130400753 


In [60]:
winner = max(results_algo, key=results_algo.get)

In [32]:
clf = model[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))

False positive rate : 19.211823 %
False negative rate : 25.510204 %


In [33]:
mt

array([[164,  39],
       [ 50, 146]])

In [34]:
test_data = "public meetings are awkard for me as I can insult people but I choose not to and that is something that I find difficult to live with"

In [101]:
test_data="I purchased this product 4.47 billion years ago and when I opened it today, it was half empty."

In [82]:
test_data="when people see me eating and ask me are you eating? No no I'm trying to choke myself to death #sarcastic"

In [102]:
test_feature = []
test_feature.append((get_features(test_data,topic_mod)))

done
[('I', 'PRP'), ('purchased', 'VBD'), ('this', 'DT'), ('product', 'NN'), ('4.47', 'CD'), ('billion', 'CD'), ('years', 'NNS'), ('ago', 'RB'), ('and', 'CC'), ('when', 'WRB'), ('I', 'PRP'), ('opened', 'VBD'), ('it', 'PRP'), ('today', 'NN'), (',', ','), ('it', 'PRP'), ('was', 'VBD'), ('half', 'JJ'), ('empty', 'JJ'), ('.', '.')]
[('I', 'PRP'), ('purchased', 'VBD'), ('this', 'DT'), ('product', 'NN'), ('4.47', 'CD'), ('billion', 'CD'), ('years', 'NNS'), ('ago', 'RB'), ('and', 'CC'), ('when', 'WRB')]
[('I', 'PRP'), ('opened', 'VBD'), ('it', 'PRP'), ('today', 'NN'), (',', ','), ('it', 'PRP'), ('was', 'VBD'), ('half', 'JJ'), ('empty', 'JJ'), ('.', '.')]
2


In [104]:
test_feature

[{'Blob Polarity': -0.13333333333333333,
  'Blob Subjectivity': 0.3333333333333333,
  'Capitalization': 0,
  'Negative Sentiment': 0.125,
  'POS_1': 3.0,
  'POS_2': 2.0,
  'POS_3': 3.0,
  'POS_4': 1.0,
  'Positive Sentiment': 0.375,
  'Topic :': 0.50250000000000306,
  'first half Blob Polarity': 0.0,
  'first half Blob Subjectivity': 0.0,
  'first half sentiment': 0.0,
  'negative Sentiment first half': 0.0,
  'negative Sentiment second half': 0.125,
  'positive Sentiment first half': 0.0,
  'positive Sentiment second half': 0.375,
  'second half Blob Polarity': -0.13333333333333333,
  'second half Blob Subjectivity': 0.3333333333333333,
  'second half sentiment': 0.25,
  'sentiment': 0.25}]

In [105]:
c = []

c.append(pd.DataFrame(test_feature[0],index=[i]))

test_result = pd.concat(c)
test_result = test_result.drop(['Topic :'],axis=1).values

In [106]:
res= clf.predict(test_result)

In [107]:
res

array([ 0.33290802])

In [108]:
if a > 0.35:
    print ("Not Sarcastic")
else:
    print ("Sarcastic")

Sarcastic
