In [90]:
import pandas as pd
import numpy as np

# import machine learning libraries from sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [91]:
liar_train = pd.read_excel("liar_dataset/liar_train.xlsx", header=None, usecols=[1, 2])
liar_test = pd.read_excel("liar_dataset/liar_test.xlsx", header=None, usecols=[1, 2])
liar_valid = pd.read_excel("liar_dataset/liar_valid.xlsx", header=None, usecols=[1, 2])

In [92]:
liar_datasets = [liar_train, liar_test, liar_valid]
df = pd.concat(liar_datasets)

In [93]:
df.columns= ['label', 'text']

In [94]:
df.describe()

Unnamed: 0,label,text
count,12791,12791
unique,6,12765
top,half-true,On a cap-and-trade plan.
freq,2627,3


In [95]:
df.label.unique()

array([False, 'half-true', 'mostly-true', True, 'barely-true', 'pants-fire'], dtype=object)

In [96]:
# encode labels as numbers
df.loc[df['label'] == "pants-fire", 'label'] = 0
df.loc[df['label'] == False, 'label'] = 0
df.loc[df['label'] == "mostly-true", 'label'] = 1
df.loc[df['label'] == True, 'label'] = 1

# drop half-true
df = df.drop(df[df['label'] == "half-true"].index)
df = df.drop(df[df['label'] == "barely-true"].index)

In [97]:
# convert type to int for sklearn model
y = df.label.astype(int) 
y.shape

(6601,)

In [98]:
# check that the shapes of y and df match
df.drop("label", axis=1)
df.shape

(6601, 2)

In [131]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.33, random_state=42, shuffle=False)
print('Shape of test & train sets:')
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Shape of test & train sets:
(4422,) (2179,) (4422,) (2179,)


### Vectorizers

In [100]:
# Bag of Words Vectorizer

# Initialize the 'count_vectorizer'
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the training data 
count_train = count_vectorizer.fit_transform(X_train) 

# Transform the test set 
count_test = count_vectorizer.transform(X_test)

In [122]:
count_vectorizer.get_feature_names()

['00',
 '000',
 '02',
 '024',
 '033',
 '04',
 '05',
 '07',
 '09',
 '10',
 '100',
 '100th',
 '103',
 '10315',
 '104',
 '105',
 '106',
 '107',
 '1070',
 '109',
 '10932',
 '10th',
 '11',
 '110',
 '11023',
 '11191',
 '111th',
 '112',
 '112th',
 '114',
 '115',
 '11th',
 '12',
 '120',
 '12189',
 '124',
 '125',
 '12670',
 '128',
 '12853',
 '12th',
 '13',
 '130',
 '131',
 '133',
 '135',
 '136',
 '137',
 '13th',
 '14',
 '140',
 '141',
 '143',
 '144',
 '145',
 '14th',
 '15',
 '150',
 '1508',
 '154',
 '155',
 '156',
 '15th',
 '16',
 '160',
 '163',
 '165',
 '168',
 '168k',
 '16th',
 '17',
 '170',
 '172',
 '174',
 '176',
 '1789',
 '1790',
 '1792',
 '1798',
 '17th',
 '18',
 '180',
 '1800s',
 '181',
 '1835',
 '1888',
 '18th',
 '19',
 '190',
 '1912',
 '1915',
 '1917',
 '1920s',
 '1928',
 '1930',
 '1930s',
 '194',
 '1947',
 '195',
 '1950',
 '1954',
 '1956',
 '1958',
 '1960',
 '1960s',
 '1961',
 '1964',
 '1968',
 '1969',
 '1970s',
 '1972',
 '1973',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 

In [101]:
# TF-IDF Vectorizer

# Initialize the 'tfidf_vectorizer'
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 

# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

### Random Forest Classifier

In [102]:
rf = RandomForestClassifier()

rf.fit(count_train, y_train)
pred1 = rf.predict(count_test)
score = accuracy_score(y_test, pred1)
print("Bag of Words & Random Forest Classifier accuracy: %0.3f" % score)

rf.fit(tfidf_train, y_train)
pred2 = rf.predict(tfidf_test)
score = accuracy_score(y_test, pred2)
print("TF-IDF & Random Forest Classifier accuracy: %0.3f" % score)

Bag of Words & Random Forest Classifier accuracy: 0.589
TF-IDF & Random Forest Classifier accuracy: 0.601


In [129]:
rf.feature_importances = zip(count_vectorizer.get_feature_names(), rf.feature_importances_)

In [147]:
aaa = pd.DataFrame()
aaa['feature'] = count_vectorizer.get_feature_names()
aaa['importance'] = rf.feature_importances_

In [150]:
aaa.sort_values('importance',ascending=False)

Unnamed: 0,feature,importance
6385,says,0.009728
5330,percent,0.009236
1358,care,0.006114
4965,obama,0.005764
7928,year,0.005726
946,barack,0.005691
5591,president,0.004841
5327,people,0.004068
1,000,0.003853
5426,plan,0.003833


### Naive Bayes Classifier

In [103]:
clf = MultinomialNB() 

clf.fit(count_train, y_train)
pred3 = clf.predict(count_test)
score = accuracy_score(y_test, pred3)
print("Bag of Words & Naive Bayes Classifier accuracy: %0.3f" % score)

clf.fit(tfidf_train, y_train)
pred4 = clf.predict(tfidf_test)
score = accuracy_score(y_test, pred4)
print("TF-IDF & Naive Bayes Classifier accuracy: %0.3f" % score)

Bag of Words & Naive Bayes Classifier accuracy: 0.612
TF-IDF & Naive Bayes Classifier accuracy: 0.613


In [113]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [115]:
show_most_informative_features(count_vectorizer, clf)

	-9.6428	00             		-5.4593	says           
	-9.6428	02             		-5.4626	percent        
	-9.6428	04             		-5.9386	state          
	-9.6428	07             		-6.0439	years          
	-9.6428	1070           		-6.0564	000            
	-9.6428	10932          		-6.0961	tax            
	-9.6428	110            		-6.0969	year           
	-9.6428	111th          		-6.1238	states         
	-9.6428	112th          		-6.2972	million        
	-9.6428	115            		-6.3014	people         
	-9.6428	12189          		-6.3554	health         
	-9.6428	128            		-6.4081	country        
	-9.6428	133            		-6.4309	obama          
	-9.6428	137            		-6.4479	jobs           
	-9.6428	13th           		-6.4580	texas          
	-9.6428	140            		-6.5088	care           
	-9.6428	156            		-6.5335	taxes          
	-9.6428	17th           		-6.5406	new            
	-9.6428	1888           		-6.5503	president      
	-9.6428	190            		-6.5861	average        


In [104]:
results = pd.DataFrame(X_test)
results['pred'] = pred1
results['actual'] = y_test

In [105]:
results.head()

Unnamed: 0,text,pred,actual
7527,Go look on the West Point website and youll se...,0,1
5780,"$120,000 will be spent by taxpayers on Charlie...",1,1
76,What the facts say is ...the best scenario for...,0,0
7129,"Says according to the FBI, more people are kil...",1,1
1822,Says Marco Rubio said that felons should not h...,0,0


In [106]:
results[(results['actual'] == 0) & (results['pred'] == 0)].count()/results[results['actual'] == 0].count()

text      0.505102
pred      0.505102
actual    0.505102
dtype: float64