## Load Libraries
* Conda base environment used

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import nltk
import ssl

# Some Python installs cannot validate the certificate of the NLTK data server,
# which makes the named downloads in this cell fail. Fall back to an unverified
# HTTPS context when the interpreter exposes one.
# NOTE(review): this turns off certificate verification for every HTTPS request
# made through the default context in this kernel, not only the NLTK downloads -
# confirm that is acceptable before running on an untrusted network.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This interpreter has no such hook; leave the default HTTPS context alone.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# The bare `nltk.download()` call was removed: without arguments it opens the
# interactive NLTK downloader and stalls "Restart & Run All" until it is closed.
# The resources this notebook uses are downloaded by name later in this cell.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
import string
import nltk
from nltk import PorterStemmer
import re
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
# import gensim
# from gensim import corpora

from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/bansal.rahul/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bansal.rahul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bansal.rahul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load data

In [2]:
import os

dire = "" ## enter your directory path

# Column names applied to every split; the files are read without a header row.
DATA_COLUMNS = ['id','label','statement','subject','speaker', 'job',
                'state','party','barely_true_c','false_c','half_true_c',
                'mostly_true_c','pants_on_fire_c','venue']


def _read_split(file_name):
    """Read one tab-separated split located in `dire` into a DataFrame.

    Parameters
    ----------
    file_name : str
        File name of the split, e.g. "train.tsv".

    Returns
    -------
    pandas.DataFrame
        The split with `DATA_COLUMNS` as its column names.
    """
    # os.path.join copes with `dire` given with or without a trailing separator;
    # with the default empty `dire` it resolves to the bare file name as before.
    return pd.read_csv(os.path.join(dire, file_name), names=DATA_COLUMNS, sep='\t')


df_train = _read_split("train.tsv")

df_test = _read_split("test.tsv")

df_valid = _read_split("valid.tsv")

In [3]:
# Sanity check: display the 14 column names that were assigned while reading the training split.
df_train.columns

Index(['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state',
       'party', 'barely_true_c', 'false_c', 'half_true_c', 'mostly_true_c',
       'pants_on_fire_c', 'venue'],
      dtype='object')

In [4]:
# Stack the validation rows under the training rows and renumber the rows from 0,
# so the combined frame carries a fresh consecutive index.
df = pd.concat([df_train, df_valid], ignore_index=True)

In [5]:
# Preview the first rows of the combined train + validation frame.
df.head()

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely_true_c,false_c,half_true_c,mostly_true_c,pants_on_fire_c,venue
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


## Fetching Subject List
* Fetching the 100 most frequent subjects, ranked by number of occurrences

In [6]:
### get subject
def _split_subjects(raw_subject):
    """Split a comma-separated subject string into a list of subject names.

    Parameters
    ----------
    raw_subject : str or missing value
        Raw content of the `subject` column for one row.

    Returns
    -------
    list of str
        The individual subjects; an empty list when the value is missing, so a
        missing subject is not turned into a fake subject called "nan".
    """
    if pd.isna(raw_subject):
        return []
    return str(raw_subject).split(",")


df["subject_list"] = df['subject'].apply(_split_subjects)
df_test["subject_list"] = df_test['subject'].apply(_split_subjects)

# Flatten the per-row lists into one long list of subject mentions.
multi_list = df['subject_list'].values
single_list = [item for sublist in multi_list for item in sublist]

# Keep the 100 subjects mentioned most often (value_counts sorts by descending count).
df_subject = pd.DataFrame(single_list, columns =['subject'])
subject = df_subject['subject'].value_counts().index[:100].tolist()

In [7]:
# Preview again to confirm the new subject_list column holds the split subjects.
df.head()

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely_true_c,false_c,half_true_c,mostly_true_c,pants_on_fire_c,venue,subject_list
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,[abortion]
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"[energy, history, job-accomplishments]"
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,[foreign-policy]
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,[health-care]
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"[economy, jobs]"


## Fetching speaker, job, state, party and venue list

In [8]:
## get speaker, job, state, party, venue
# For each categorical column keep only its most frequent values; value_counts
# orders by descending count, so slicing the index keeps the top entries.
speaker = df['speaker'].value_counts().index[:100].tolist()
job = df['job'].value_counts().index[:50].tolist()
state = df['state'].value_counts().index[:50].tolist()
party = df['party'].value_counts().index[:20].tolist()
venue = df['venue'].value_counts().index[:100].tolist()

In [9]:
# The two frames that receive identical feature engineering: the combined
# train + validation frame and the test frame. The following cells loop over
# this list and add columns to these frame objects in place.
liar_list = [df,df_test]

## Encoding Subject

In [10]:
## subject encoding
# One boolean column per retained subject: True when the row's subject list
# contains that subject. The column is named after the subject itself.
for frame in liar_list:
    for topic in subject:
        frame[topic] = frame["subject_list"].map(lambda topics, topic=topic: topic in topics)

## Encoding Speaker, Job, State, Party and Venue

In [11]:
## get speaker, job, state, party, venue encoding
# Pairs of (source column, retained values for that column).
to_encode = [
    ('speaker', speaker),
    ('party', party),
    ('state', state),
    ('venue', venue),
    ('job', job),
]
# One boolean column per retained value, True where the source column equals it.
# NOTE(review): columns are named by the raw value, so a value shared by two
# categories (or equal to an existing column name) would overwrite the earlier
# column - confirm no such collisions exist in the data.
for frame in liar_list:
    for column, top_values in to_encode:
        for top_value in top_values:
            frame[top_value] = frame[column] == top_value

## Mapping labels

In [12]:
# Collapse the six text labels into a binary target (1 / 0). All values are
# ints so the resulting column has an integer dtype rather than being promoted
# to float by a single stray 1.0.
truth = {
    'false': 0,
    'half-true': 1,
    'mostly-true': 1,
    'true': 1,
    'pants-fire': 0,
    'barely-true': 0,
}

for frame in liar_list:
    # Direct dictionary indexing on purpose: an unexpected or missing label
    # raises KeyError instead of silently becoming NaN.
    frame['numer_truth'] = frame['label'].apply(lambda label: truth[label])

## Calculating sentiment of the sentence
* Using the VADER `SentimentIntensityAnalyzer` from NLTK

In [13]:
# VADER sentiment scorer; called by sentiment() defined in this cell.
sid = SentimentIntensityAnalyzer()

# Porter stemmer; called by clean_stem() defined in this cell.
ps = PorterStemmer()
# WordNet lemmatizer; not referenced by any other cell visible in this notebook.
wn = nltk.WordNetLemmatizer()

# Set copy of the stop-word list so each membership test is a hash lookup
# instead of a scan over the whole list.
_STOPWORD_SET = frozenset(stopwords)


def clean_stem (sent, seq=False):
    """Tokenise a sentence, drop stop words and stem the remaining tokens.

    Parameters
    ----------
    sent : str
        Text to process.
    seq : bool, optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    list of str
        Stemmed tokens in their original order. Because the text is split on
        runs of non-word characters, an empty-string token appears when the
        text starts or ends with such a character.

    Notes
    -----
    NOTE(review): the stop-word test compares tokens as-is, so it is
    case-sensitive - confirm whether capitalised stop words should be kept.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', sent)
    return [ps.stem(token) for token in tokens if token not in _STOPWORD_SET]
def sentiment(x):
    """Return the polarity scores that the VADER analyzer `sid` computes for text `x`."""
    return sid.polarity_scores(x)

In [14]:
for frame in liar_list:
    # Clean and stem every statement, glue the tokens back into one string,
    # then score that string; each row ends up holding a dict of scores.
    cleaned_statements = frame['statement'].apply(lambda text: ' '.join(clean_stem(text)))
    frame['sentiment'] = cleaned_statements.apply(sentiment)

In [15]:
def _expand_sentiment(frame):
    """Replace the dict-valued `sentiment` column with one column per score.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose `sentiment` column holds one dict of scores per row.

    Returns
    -------
    pandas.DataFrame
        A new frame without `sentiment` and with one extra column per dict key.
        If `sentiment` is absent the frame is returned unchanged, so re-running
        this cell does not fail.
    """
    if "sentiment" not in frame.columns:
        # Already expanded (e.g. the cell was run before); nothing left to do.
        return frame
    # Reuse the frame's own index so the score columns line up row-for-row
    # whatever index the frame carries.
    scores = pd.DataFrame(frame["sentiment"].tolist(), index=frame.index)
    return pd.concat([frame.drop(columns="sentiment"), scores], axis=1)


df = _expand_sentiment(df)
df_test = _expand_sentiment(df_test)

# df and df_test are new objects now; re-point the shared list at them so it
# does not keep referring to the pre-expansion frames.
liar_list = [df, df_test]


In [16]:
# Names of the score columns produced from the sentiment dicts.
sentiment_features = ["neg","neu","pos","compound"]
# Full feature list; the order here fixes the column order of the model matrices.
# NOTE(review): a value present in more than one of these lists would appear
# twice - confirm the category values are distinct across lists.
features_of_interest = [*state, *subject, *speaker, *party, *venue, *job, *sentiment_features]

## Train and Test Features
* Features are created using subject, speaker, job, state, party, venue and sentiment score

In [17]:
# Materialise model inputs as NumPy arrays (copies, independent of the frames):
# the selected feature columns as the matrix, the binary label as the target.
train_features = df[features_of_interest].to_numpy(copy=True)
train_labels = df["numer_truth"].to_numpy(copy=True)
test_features = df_test[features_of_interest].to_numpy(copy=True)
test_labels = df_test["numer_truth"].to_numpy(copy=True)

## Random Forest Classifier
* Tree-based methods work well with categorical data

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 1000 trees, each capped at depth 30; the fixed seed keeps the
# fitted model repeatable between runs.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=30,
    random_state=42,
)
# Fit on the train + validation features; the trailing semicolon hides the
# estimator repr in the cell output.
rf.fit(train_features, train_labels);

In [19]:
# Predict the binary truth label for every row of the test feature matrix.
predictions = rf.predict(test_features)

## Metric

In [27]:
# Accuracy in percent: share of test rows whose predicted label equals the true label.
n_correct = (predictions == test_labels).sum()
accuracy = n_correct * 100 / len(predictions)
print("Accuracy of the model is: ", round(accuracy, 2))

Accuracy of the model is:  63.3
