In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
import numpy as np

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/mac/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
def text_preprocess(text,process_list):
    
    #tokenization
    tokens = nltk.word_tokenize(text)
    
    #remove punctuation
    if('punc_removal' in process_list):
        tokens = [word for word in tokens if (word.isalpha())]
        tokens = [word.lower() for word in tokens]
    
    #stemming
    if('stemming' in process_list):
        p_stemmer = PorterStemmer()
        stemedList = []
        for word in tokens:
            stemedList.append(p_stemmer.stem(word))
        tokens = stemedList
    
    #Lemmatization
    if('lemmatization' in process_list):
        wordnet_lemmatizer = WordNetLemmatizer()
        lemmaList = []
        for word in tokens:
            lemmaList.append(wordnet_lemmatizer.lemmatize(word))
        tokens = lemmaList
        
    #Filter stopword
    if('stopword' in process_list):
        filtered_text = []  
        stop_words = set(stopwords.words("english"))
        for word in tokens:  
            if word not in stop_words:  
                filtered_text.append(word)
        tokens = filtered_text
    return " ".join(tokens)

def corpus_preprocess(corpus,process_list):
    new_corpus = []
    for i in range(len(corpus)):
        new_corpus.append(text_preprocess(corpus[i],process_list))
    return new_corpus

# spam-ham

In [4]:
spam_df = pd.read_csv('spam.csv')

In [5]:
spam_df.dropna(axis=1,inplace = True)

In [6]:
spam_df.rename(columns = {'v1' : 'type', 'v2' : 'message'}, inplace = True)

In [7]:
spam_df['length'] = spam_df['message'].apply(lambda x: len(x))
spam_df['count?'] = spam_df['message'].apply(lambda x: x.count('?'))
spam_df['count!'] = spam_df['message'].apply(lambda x: x.count('!'))

In [8]:
spam_df

Unnamed: 0,type,message,length,count?,count!
0,ham,"Go until jurong point, crazy.. Available only ...",111,0,0
1,ham,Ok lar... Joking wif u oni...,29,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,0,0
3,ham,U dun say so early hor... U c already then say...,49,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,0,0
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,0,1
5568,ham,Will �_ b going to esplanade fr home?,37,1,0
5569,ham,"Pity, * was in mood for that. So...any other s...",57,1,0
5570,ham,The guy did some bitching but I acted like i'd...,125,0,0


In [9]:
x_mail_train,x_mail_test,y_mail_train,y_mail_test = train_test_split(spam_df[['message','length','count?','count!']],spam_df['type'],test_size=0.2,random_state=42)

## preprocess, bow

In [10]:
process_list = [[],['punc_removal'],['stemming'],['lemmatization'],
                ['stopword'],['punc_removal','lemmatization','stopword'],['lemmatization','stopword']]
for process in process_list:
    print(process)
    x_train = corpus_preprocess(x_mail_train['message'].values,process)
    x_test = corpus_preprocess(x_mail_test['message'].values,process)
    bow_transformer = CountVectorizer(lowercase=False).fit(x_train)
    messages_bow_train = bow_transformer.transform(x_train)
    messages_bow_test = bow_transformer.transform(x_test)
    clf = MultinomialNB().fit(messages_bow_train,y_mail_train)
    y_pred = clf.predict(messages_bow_test)
    print(classification_report(y_mail_test,y_pred,digits=4))

[]
              precision    recall  f1-score   support

         ham     0.9827    0.9979    0.9902       965
        spam     0.9852    0.8867    0.9333       150

    accuracy                         0.9830      1115
   macro avg     0.9839    0.9423    0.9618      1115
weighted avg     0.9830    0.9830    0.9826      1115

['punc_removal']
              precision    recall  f1-score   support

         ham     0.9806    0.9948    0.9877       965
        spam     0.9632    0.8733    0.9161       150

    accuracy                         0.9785      1115
   macro avg     0.9719    0.9341    0.9519      1115
weighted avg     0.9783    0.9785    0.9780      1115

['stemming']
              precision    recall  f1-score   support

         ham     0.9826    0.9959    0.9892       965
        spam     0.9708    0.8867    0.9268       150

    accuracy                         0.9812      1115
   macro avg     0.9767    0.9413    0.9580      1115
weighted avg     0.9810    0.9812    0.98

## preprocess, tf-idf

In [11]:
x_train = corpus_preprocess(x_mail_train['message'].values,['stopword'])
x_test = corpus_preprocess(x_mail_test['message'].values,['stopword'])
bow_transformer = CountVectorizer(lowercase=False).fit(x_train)
messages_bow_train = bow_transformer.transform(x_train)
tfidf_transformer = TfidfTransformer().fit(messages_bow_train)
messages_tfidf_train = tfidf_transformer.transform(messages_bow_train)
messages_bow_test = bow_transformer.transform(x_test)
messages_tfidf_test = tfidf_transformer.transform(messages_bow_test)
clf = MultinomialNB().fit(messages_tfidf_train,y_mail_train)
y_pred = clf.predict(messages_tfidf_test)
print(classification_report(y_mail_test,y_pred,digits=4))

              precision    recall  f1-score   support

         ham     0.9660    1.0000    0.9827       965
        spam     1.0000    0.7733    0.8722       150

    accuracy                         0.9695      1115
   macro avg     0.9830    0.8867    0.9274      1115
weighted avg     0.9705    0.9695    0.9678      1115



## preprocess, bow, added feautures

In [12]:
train_features = np.hstack((messages_bow_train.toarray(),x_mail_train[['length','count?','count!']].to_numpy()))
test_features = np.hstack((messages_bow_test.toarray(),x_mail_test[['length','count?','count!']].to_numpy()))
clf = MultinomialNB().fit(train_features,y_mail_train)
y_pred = clf.predict(test_features)
print(classification_report(y_mail_test,y_pred,digits=4))

              precision    recall  f1-score   support

         ham     0.9777    1.0000    0.9887       965
        spam     1.0000    0.8533    0.9209       150

    accuracy                         0.9803      1115
   macro avg     0.9889    0.9267    0.9548      1115
weighted avg     0.9807    0.9803    0.9796      1115



##  added feautures

In [13]:
train_features = x_mail_train[['length','count?','count!']]
test_features = x_mail_test[['length','count?','count!']]
clf = MultinomialNB().fit(train_features,y_mail_train)
y_pred = clf.predict(test_features)
print(classification_report(y_mail_test,y_pred,digits=4))

              precision    recall  f1-score   support

         ham     0.8704    0.9948    0.9284       965
        spam     0.5833    0.0467    0.0864       150

    accuracy                         0.8673      1115
   macro avg     0.7268    0.5207    0.5074      1115
weighted avg     0.8317    0.8673    0.8152      1115



# Liar

In [14]:
train_df = pd.read_csv('SentimentalLIAR-master/train_final.csv')
test_df = pd.read_csv('SentimentalLIAR-master/test_final.csv')
liar_train = train_df.copy()[['statement','label','sentiment','anger','fear','joy','disgust','sad']]
liar_test = test_df.copy()[['statement','label','sentiment','anger','fear','joy','disgust','sad']]                    

In [15]:
liar_train['label'] = train_df['label'].replace({'barely-true':0,'pants-fire':0,'false':0,'half-true':1,'mostly-true':1,'true':1})
liar_train['sentiment'] = train_df['sentiment'].replace({'NEGATIVE':0,'POSITIVE':1})

In [16]:
liar_test['label'] = test_df['label'].replace({'barely-true':0,'pants-fire':0,'false':0,'half-true':1,'mostly-true':1,'true':1})
liar_test['sentiment'] = test_df['sentiment'].replace({'NEGATIVE':0,'POSITIVE':1})

In [17]:
liar_train['length'] = liar_train['statement'].apply(lambda x: len(x))
liar_train['count?'] = liar_train['statement'].apply(lambda x: x.count('?'))
liar_train['count!'] = liar_train['statement'].apply(lambda x: x.count('!'))
features = ['anger','fear','joy','disgust','sad']
for feature in features:
    liar_train[feature] = liar_train[feature].apply(lambda x: round(x*100))

In [18]:
liar_train

Unnamed: 0,statement,label,sentiment,anger,fear,joy,disgust,sad,length,count?,count!
0,Says the Annies List political group supports ...,0,0.0,12,1,3,26,53,82,0,0
1,When did the decline of coal start? It started...,1,0.0,10,12,19,2,10,141,1,0
2,"Hillary Clinton agrees with John McCain ""by vo...",1,0.0,4,2,50,45,5,105,0,0
3,Health care reform legislation is likely to ma...,0,0.0,0,19,38,2,38,78,0,0
4,The economic turnaround started at the end of ...,1,,4,22,22,5,27,54,0,0
...,...,...,...,...,...,...,...,...,...,...,...
10231,There are a larger number of shark attacks in ...,1,0.0,35,29,2,18,22,90,0,0
10232,Democrats have now become the party of the [At...,1,,19,8,5,20,41,78,0,0
10233,Says an alternative to Social Security that op...,1,0.0,12,21,15,10,22,176,0,0
10234,On lifting the U.S. Cuban embargo and allowing...,0,,11,7,31,24,8,62,0,0


In [19]:
liar_test['length'] = liar_test['statement'].apply(lambda x: len(x))
liar_test['count?'] = liar_test['statement'].apply(lambda x: x.count('?'))
liar_test['count!'] = liar_test['statement'].apply(lambda x: x.count('!'))
features = ['anger','fear','joy','disgust','sad']
for feature in features:
    liar_test[feature] = liar_test[feature].apply(lambda x: round(x*100))

## preprocess

In [20]:
process_list = [[],['punc_removal'],['stemming'],['lemmatization'],
                ['stopword'],['stopword','stemming'],['stemming','lemmatization']]
for process in process_list:
    print(process)
    x_train = corpus_preprocess(liar_train['statement'].values,process)
    x_test = corpus_preprocess(liar_test['statement'].values,process)
    bow_transformer = CountVectorizer(lowercase=False).fit(x_train)
    messages_bow_train = bow_transformer.transform(x_train)
    messages_bow_test = bow_transformer.transform(x_test)
    clf = ComplementNB().fit(messages_bow_train,liar_train['label'])
    y_pred = clf.predict(messages_bow_test)
    print(classification_report(liar_test['label'],y_pred,digits=4))

[]
              precision    recall  f1-score   support

           0     0.5660    0.5118    0.5375       553
           1     0.6480    0.6961    0.6712       714

    accuracy                         0.6156      1267
   macro avg     0.6070    0.6039    0.6043      1267
weighted avg     0.6122    0.6156    0.6128      1267

['punc_removal']
              precision    recall  f1-score   support

           0     0.5562    0.5099    0.5321       553
           1     0.6434    0.6849    0.6635       714

    accuracy                         0.6085      1267
   macro avg     0.5998    0.5974    0.5978      1267
weighted avg     0.6054    0.6085    0.6061      1267

['stemming']
              precision    recall  f1-score   support

           0     0.5633    0.5389    0.5508       553
           1     0.6545    0.6765    0.6653       714

    accuracy                         0.6164      1267
   macro avg     0.6089    0.6077    0.6081      1267
weighted avg     0.6147    0.6164    0.61

## preprocess, tf-idf

In [21]:
x_train = corpus_preprocess(liar_train['statement'].values,['lemmatization'])
x_test = corpus_preprocess(liar_test['statement'].values,['lemmatization'])
bow_transformer = CountVectorizer(lowercase=False).fit(x_train)
messages_bow_train = bow_transformer.transform(x_train)
tfidf_transformer = TfidfTransformer().fit(messages_bow_train)
messages_tfidf_train = tfidf_transformer.transform(messages_bow_train)
messages_bow_test = bow_transformer.transform(liar_test['statement'])
messages_tfidf_test = tfidf_transformer.transform(messages_bow_test)
clf = ComplementNB().fit(messages_tfidf_train,liar_train['label'])
y_pred = clf.predict(messages_bow_test)
print(classification_report(liar_test['label'],y_pred,digits=4))

              precision    recall  f1-score   support

           0     0.5924    0.3363    0.4291       553
           1     0.6149    0.8207    0.7031       714

    accuracy                         0.6093      1267
   macro avg     0.6036    0.5785    0.5661      1267
weighted avg     0.6051    0.6093    0.5835      1267



In [22]:
corr = liar_train.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,label,sentiment,anger,fear,joy,disgust,sad,length,count?,count!
label,1.0,0.043842,-0.060934,0.02614,0.016214,-0.050029,0.055575,0.041418,-0.006885,-0.02362
sentiment,0.043842,1.0,-0.178887,-0.092071,0.349332,-0.152665,-0.20075,-0.056388,-0.029659,0.042919
anger,-0.060934,-0.178887,1.0,0.077972,-0.398449,0.315868,-0.004062,0.028581,0.016014,0.01704
fear,0.02614,-0.092071,0.077972,1.0,-0.243583,-0.116576,0.157458,-0.001854,0.006811,-0.011252
joy,0.016214,0.349332,-0.398449,-0.243583,1.0,-0.392577,-0.328222,-0.014797,-0.024881,0.039898
disgust,-0.050029,-0.152665,0.315868,-0.116576,-0.392577,1.0,-0.06441,0.060148,-0.011531,0.000827
sad,0.055575,-0.20075,-0.004062,0.157458,-0.328222,-0.06441,1.0,0.069889,-0.008188,-0.039079
length,0.041418,-0.056388,0.028581,-0.001854,-0.014797,0.060148,0.069889,1.0,0.025607,0.005626
count?,-0.006885,-0.029659,0.016014,0.006811,-0.024881,-0.011531,-0.008188,0.025607,1.0,0.034767
count!,-0.02362,0.042919,0.01704,-0.011252,0.039898,0.000827,-0.039079,0.005626,0.034767,1.0


## added feature without bow

In [23]:
clf = ComplementNB().fit(liar_train[['length','count?','count!']+features],liar_train['label'])
y_pred = clf.predict(liar_test[['length','count?','count!']+features])
print(classification_report(liar_test['label'],y_pred,digits=4))

              precision    recall  f1-score   support

           0     0.4907    0.4792    0.4849       553
           1     0.6039    0.6148    0.6093       714

    accuracy                         0.5556      1267
   macro avg     0.5473    0.5470    0.5471      1267
weighted avg     0.5545    0.5556    0.5550      1267



## bow,pre-process,added-features

In [24]:
train_features = np.hstack((messages_bow_train.toarray(),liar_train[['length','count?','count!']+features].to_numpy()))
test_features = np.hstack((messages_bow_test.toarray(),liar_test[['length','count?','count!']+features].to_numpy()))
clf = ComplementNB().fit(train_features,liar_train['label'])
y_pred = clf.predict(test_features)
print(classification_report(liar_test['label'],y_pred,digits=4))

              precision    recall  f1-score   support

           0     0.5437    0.5515    0.5476       553
           1     0.6487    0.6415    0.6451       714

    accuracy                         0.6022      1267
   macro avg     0.5962    0.5965    0.5963      1267
weighted avg     0.6029    0.6022    0.6025      1267



In [25]:
for feature in features:
    print(feature)
    train_features = np.concatenate((messages_bow_train.toarray(),liar_train[feature].to_numpy().reshape(-1, 1)), axis=1)
    test_features = np.concatenate((messages_bow_test.toarray(),liar_test[feature].to_numpy().reshape(-1, 1)), axis=1)
    clf = ComplementNB().fit(train_features,liar_train['label'])
    y_pred = clf.predict(test_features)
    print(classification_report(liar_test['label'],y_pred,digits=4))
    

anger
              precision    recall  f1-score   support

           0     0.5648    0.5986    0.5812       553
           1     0.6740    0.6429    0.6581       714

    accuracy                         0.6235      1267
   macro avg     0.6194    0.6207    0.6196      1267
weighted avg     0.6264    0.6235    0.6245      1267

fear
              precision    recall  f1-score   support

           0     0.5720    0.5389    0.5549       553
           1     0.6582    0.6877    0.6726       714

    accuracy                         0.6227      1267
   macro avg     0.6151    0.6133    0.6138      1267
weighted avg     0.6206    0.6227    0.6212      1267

joy
              precision    recall  f1-score   support

           0     0.5725    0.5425    0.5571       553
           1     0.6595    0.6863    0.6726       714

    accuracy                         0.6235      1267
   macro avg     0.6160    0.6144    0.6149      1267
weighted avg     0.6215    0.6235    0.6222      1267

disg