In [1]:
import itertools
import pandas as pd
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from gensim.parsing.preprocessing import remove_stopwords
import re

ModuleNotFoundError: No module named 'gensim'

In [2]:
# Load the first 5000 rows of the training set. `nrows` stops the parser
# early instead of reading the whole CSV and slicing it afterwards.
df = pd.read_csv('train.csv', nrows=5000)
# Rows without article text cannot be vectorized, so drop them up front.
df = df.dropna(subset=['text']).reset_index(drop=True)
# Strip every character that is not an ASCII letter, digit or space.
# The vectorized `.str.replace` applies the same regex as the former
# row-wise `apply(..., axis=1)` without a Python-level call per row.
df['text'] = df['text'].astype('str').str.replace('[^A-Za-z0-9 ]+', '', regex=True)
# Show (rows, columns) after cleaning
df.shape

(4989, 5)

In [3]:
# Load the first 1000 rows of the test file
df_test = pd.read_csv('test.csv', nrows=1000)

# Keep only the article text, drop rows where it is missing, and renumber.
req_df = df_test[['text']].dropna(subset=['text']).reset_index(drop=True)
# Same character clean-up as the training frame, followed by gensim's
# stop-word removal. Series-level operations replace the row-wise
# `apply(..., axis=1)`, which is slow and breaks on an empty frame.
# NOTE(review): the training text is not passed through `remove_stopwords`
# (it relies on the vectorizer's own stop-word list only) - confirm that
# this difference in preprocessing is intended.
req_df['text'] = (
    req_df['text']
    .astype('str')
    .str.replace('[^A-Za-z0-9 ]+', '', regex=True)
    .map(remove_stopwords)
)
# Report (rows, columns) of the prepared frame
print(req_df.shape)

(997, 1)


In [4]:
# Preview the first five rows of the cleaned training frame
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide We Didnt Even See Comeys Letter...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,Why the Truth Might Get You Fired October 29 2...,1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print An Iranian woman has been sentenced to s...,1


In [5]:
# Replace the numeric labels with readable class names (1 -> FAKE, 0 -> REAL).
# `replace` returns a new column instead of writing strings into the existing
# numeric column through `.loc`, which pandas >= 2.1 flags as a deprecated
# silent upcast. Any value other than 0/1 is left untouched, so running this
# cell a second time changes nothing.
df['label'] = df['label'].replace({1: 'FAKE', 0: 'REAL'})
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide We Didnt Even See Comeys Letter...,FAKE
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,REAL
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,Why the Truth Might Get You Fired October 29 2...,FAKE
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,FAKE
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print An Iranian woman has been sentenced to s...,FAKE


In [6]:
# Pull the target column out as its own Series and preview it
labels = df['label']
labels.head()

0    FAKE
1    REAL
2    FAKE
3    FAKE
4    FAKE
Name: label, dtype: object

In [7]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    df['text'].values.astype('str'),
    labels,
    test_size=0.2,
    random_state=7,
)

In [8]:
# Create the TF-IDF vectorizer, using scikit-learn's built-in English stop-word list
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [9]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the held-out texts
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [10]:

# Build the Passive-Aggressive classifier and train it on the TF-IDF training matrix
pa_classifier = PassiveAggressiveClassifier(
    max_iter=50,
    random_state=0,
)
pa_classifier.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50, random_state=0)

In [11]:
# Accuracy of the Passive-Aggressive classifier on the split it was trained on
y_pred = pa_classifier.predict(tfidf_train)
score = accuracy_score(y_train, y_pred)
print(f'Accuracy: {round(score * 100, 2)}%')

Accuracy: 100.0%


In [12]:
# Accuracy of the Passive-Aggressive classifier on the held-out split
y_pred = pa_classifier.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score * 100, 2)}%')

Accuracy: 93.99%


In [13]:
# SELF INPUT
# Classify sample articles from the prepared test frame with the
# Passive-Aggressive classifier. Edit the tuple to try other rows
# (any position between 0 and 996).
for news_index_num in (0, 10):
    tfidf_self_input = tfidf_vectorizer.transform([req_df['text'].iloc[news_index_num]])
    print(req_df.text.iloc[news_index_num])
    print('\nThe news is:', pa_classifier.predict(tfidf_self_input)[0])

PALO ALTO Calif After years scorning political process Silicon Valley leapt fray The prospect President Donald J Trump pushing tech community traditional role donors embrace new existence agitators activists A distinguished venture capital firm emblazoned corporate home page earthy epithet One prominent tech chieftain says consequences Mr Trumps election range disastrous terrible Another compares dictator And nearly 150 tech leaders signed open letter decrying Mr Trump campaign anger bigotry Not action Peter Thiel founder PayPal Palantir outside investor Facebook spoke Republican convention July The New York Times reported Saturday Mr Thiel giving 1 25 million support Mr Trumps candidacy supporters flee He recently gave 1 million super PAC supports Senator Rob Portman Republican freshman running Ohio Getting involved politics seen clashing Silicon Valleys value You transform world making problems obsolete solving Washington Nor entrepreneurs want alienate segment customers agree politi

In [14]:
# new_news = ['Donald Trump died yesterday']
# new_news_tfidf=tfidf_vectorizer.transform(new_news)
# print('Donald Trump died yesterday:',pa_classifier.predict(new_news_tfidf)[0])

# new_news = ['FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart']
# new_news_tfidf=tfidf_vectorizer.transform(new_news)
# print('FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart:',pa_classifier.predict(new_news_tfidf)[0])

In [15]:
# Build confusion matrix
# confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])


In [16]:
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score
# from sklearn.metrics import f1_score
# from sklearn.metrics import confusion_matrix

# # accuracy: (tp + tn) / (p + n)
# accuracy = accuracy_score(y_test, y_pred)
# print('Accuracy: %f' % accuracy)
# # precision tp / (tp + fp)
# precision = precision_score(y_test, y_pred)
# print('Precision: %f' % precision)
# # recall: tp / (tp + fn)
# recall = recall_score(y_test, y_pred)
# print('Recall: %f' % recall)
# # f1: 2 tp / (2 tp + fp + fn)
# f1 = f1_score(y_test, y_pred)
# print('F1 score: %f' % f1)

In [17]:
# importing required libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(random_state=0)

# Fit directly on the TF-IDF matrices. The raw-text arrays `x_train` /
# `x_test` are deliberately NOT rebound to the matrices: the vectorizer cell
# reads them as text, and overwriting them makes that cell fail on re-run.
model.fit(tfidf_train, y_train)

# Uncomment to inspect the fitted parameters
# print('Coefficient of model :', model.coef_)
# print('Intercept of model', model.intercept_)

# Accuracy on the training split
predict_train = model.predict(tfidf_train)
accuracy_train = accuracy_score(y_train, predict_train)
print('accuracy_score on train dataset : ', accuracy_train)

# Accuracy on the held-out split
predict_test = model.predict(tfidf_test)
accuracy_test = accuracy_score(y_test, predict_test)
print('accuracy_score on test dataset : ', accuracy_test)

Coefficient of model : [[-0.03061568  3.02209045  0.0235154  ... -0.00833129  0.00373136
  -0.01901582]]
Intercept of model [-1.07918804]
accuracy_score on train dataset :  0.9754447506890503
accuracy_score on test dataset :  0.9338677354709419


In [18]:
# SELF INPUT
# Classify sample articles from the prepared test frame with whatever
# classifier is currently bound to `model`. Edit the tuple to try other
# rows (any position between 0 and 996).
for news_index_num in (0, 10):
    tfidf_self_input = tfidf_vectorizer.transform([req_df['text'].iloc[news_index_num]])
    print(req_df.text.iloc[news_index_num])
    print('\nThe news is:', model.predict(tfidf_self_input)[0])

PALO ALTO Calif After years scorning political process Silicon Valley leapt fray The prospect President Donald J Trump pushing tech community traditional role donors embrace new existence agitators activists A distinguished venture capital firm emblazoned corporate home page earthy epithet One prominent tech chieftain says consequences Mr Trumps election range disastrous terrible Another compares dictator And nearly 150 tech leaders signed open letter decrying Mr Trump campaign anger bigotry Not action Peter Thiel founder PayPal Palantir outside investor Facebook spoke Republican convention July The New York Times reported Saturday Mr Thiel giving 1 25 million support Mr Trumps candidacy supporters flee He recently gave 1 million super PAC supports Senator Rob Portman Republican freshman running Ohio Getting involved politics seen clashing Silicon Valleys value You transform world making problems obsolete solving Washington Nor entrepreneurs want alienate segment customers agree politi

In [19]:
# new_news = ['Donald Trump died yesterday']
# new_news_tfidf=tfidf_vectorizer.transform(new_news)
# print('Donald Trump died yesterday:',model.predict(new_news_tfidf)[0])

# new_news = ['Donald Trump is the new president of America']
# new_news_tfidf=tfidf_vectorizer.transform(new_news)
# print('Donald Trump is the new president of America:',model.predict(new_news_tfidf)[0])

In [20]:
# importing required libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier(random_state=0)

# Fit directly on the TF-IDF matrices. The raw-text arrays `x_train` /
# `x_test` are deliberately NOT rebound to the matrices: the vectorizer cell
# reads them as text, and overwriting them makes that cell fail on re-run.
model.fit(tfidf_train, y_train)

# Depth reached by the fitted tree
print('Depth of the Decision Tree :', model.get_depth())

# Accuracy on the training split
predict_train = model.predict(tfidf_train)
accuracy_train = accuracy_score(y_train, predict_train)
print('accuracy_score on train dataset : ', accuracy_train)

# Accuracy on the held-out split
predict_test = model.predict(tfidf_test)
accuracy_test = accuracy_score(y_test, predict_test)
print('accuracy_score on test dataset : ', accuracy_test)

Depth of the Decision Tree : 61
accuracy_score on train dataset :  1.0
accuracy_score on test dataset :  0.8607214428857716


In [21]:
# SELF INPUT
# Classify sample articles from the prepared test frame with whatever
# classifier is currently bound to `model`. Edit the tuple to try other
# rows (any position between 0 and 996).
for news_index_num in (0, 10):
    tfidf_self_input = tfidf_vectorizer.transform([req_df['text'].iloc[news_index_num]])
    print(req_df.text.iloc[news_index_num])
    print('\nThe news is:', model.predict(tfidf_self_input)[0])

PALO ALTO Calif After years scorning political process Silicon Valley leapt fray The prospect President Donald J Trump pushing tech community traditional role donors embrace new existence agitators activists A distinguished venture capital firm emblazoned corporate home page earthy epithet One prominent tech chieftain says consequences Mr Trumps election range disastrous terrible Another compares dictator And nearly 150 tech leaders signed open letter decrying Mr Trump campaign anger bigotry Not action Peter Thiel founder PayPal Palantir outside investor Facebook spoke Republican convention July The New York Times reported Saturday Mr Thiel giving 1 25 million support Mr Trumps candidacy supporters flee He recently gave 1 million super PAC supports Senator Rob Portman Republican freshman running Ohio Getting involved politics seen clashing Silicon Valleys value You transform world making problems obsolete solving Washington Nor entrepreneurs want alienate segment customers agree politi

In [22]:
# importing required libraries
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

model = SVC(random_state=0)

# Aliases for the TF-IDF matrices and their label vectors
train_x, test_x = tfidf_train, tfidf_test
train_y, test_y = y_train, y_test

# Train the support-vector classifier
model.fit(train_x, train_y)

# Accuracy on the training split
predict_train = model.predict(train_x)
accuracy_train = accuracy_score(train_y, predict_train)
print('accuracy_score on train dataset : ', accuracy_train)

# Accuracy on the held-out split
predict_test = model.predict(test_x)
accuracy_test = accuracy_score(test_y, predict_test)
print('accuracy_score on test dataset : ', accuracy_test)

accuracy_score on train dataset :  0.9994988724630418
accuracy_score on test dataset :  0.935871743486974


In [23]:
# SELF INPUT
# Classify sample articles from the prepared test frame with whatever
# classifier is currently bound to `model`. Edit the tuple to try other
# rows (any position between 0 and 996).
for news_index_num in (0, 10):
    tfidf_self_input = tfidf_vectorizer.transform([req_df['text'].iloc[news_index_num]])
    print(req_df.text.iloc[news_index_num])
    print('\nThe news is:', model.predict(tfidf_self_input)[0])

PALO ALTO Calif After years scorning political process Silicon Valley leapt fray The prospect President Donald J Trump pushing tech community traditional role donors embrace new existence agitators activists A distinguished venture capital firm emblazoned corporate home page earthy epithet One prominent tech chieftain says consequences Mr Trumps election range disastrous terrible Another compares dictator And nearly 150 tech leaders signed open letter decrying Mr Trump campaign anger bigotry Not action Peter Thiel founder PayPal Palantir outside investor Facebook spoke Republican convention July The New York Times reported Saturday Mr Thiel giving 1 25 million support Mr Trumps candidacy supporters flee He recently gave 1 million super PAC supports Senator Rob Portman Republican freshman running Ohio Getting involved politics seen clashing Silicon Valleys value You transform world making problems obsolete solving Washington Nor entrepreneurs want alienate segment customers agree politi

In [24]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# k-nearest-neighbours classifier with scikit-learn's default settings
model = KNeighborsClassifier()

# Aliases for the TF-IDF matrices and their label vectors
train_x, test_x = tfidf_train, tfidf_test
train_y, test_y = y_train, y_test

# Train the classifier
model.fit(train_x, train_y)

# Report how many neighbours vote on each prediction
print('\nThe number of neighbors used to predict the target : ', model.n_neighbors)

# Accuracy on the training split
predict_train = model.predict(train_x)
accuracy_train = accuracy_score(train_y, predict_train)
print('accuracy_score on train dataset : ', accuracy_train)

# Accuracy on the held-out split
# NOTE(review): the recorded run scored about 0.52 (train) / 0.54 (test),
# far below the other classifiers in this notebook - worth investigating.
predict_test = model.predict(test_x)
accuracy_test = accuracy_score(test_y, predict_test)
print('accuracy_score on test dataset : ', accuracy_test)


The number of neighbors used to predict the target :  5
accuracy_score on train dataset :  0.5199198195940867
accuracy_score on test dataset :  0.5350701402805611


In [25]:
# SELF INPUT
# Classify sample articles from the prepared test frame with whatever
# classifier is currently bound to `model`. Edit the tuple to try other
# rows (any position between 0 and 996).
for news_index_num in (0, 10):
    tfidf_self_input = tfidf_vectorizer.transform([req_df['text'].iloc[news_index_num]])
    print(req_df.text.iloc[news_index_num])
    print('\nThe news is:', model.predict(tfidf_self_input)[0])

PALO ALTO Calif After years scorning political process Silicon Valley leapt fray The prospect President Donald J Trump pushing tech community traditional role donors embrace new existence agitators activists A distinguished venture capital firm emblazoned corporate home page earthy epithet One prominent tech chieftain says consequences Mr Trumps election range disastrous terrible Another compares dictator And nearly 150 tech leaders signed open letter decrying Mr Trump campaign anger bigotry Not action Peter Thiel founder PayPal Palantir outside investor Facebook spoke Republican convention July The New York Times reported Saturday Mr Thiel giving 1 25 million support Mr Trumps candidacy supporters flee He recently gave 1 million super PAC supports Senator Rob Portman Republican freshman running Ohio Getting involved politics seen clashing Silicon Valleys value You transform world making problems obsolete solving Washington Nor entrepreneurs want alienate segment customers agree politi

In [26]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest of 128 trees with a fixed seed
model = RandomForestClassifier(n_estimators=128, random_state=0)

# Aliases for the TF-IDF matrices and their label vectors
train_x, test_x = tfidf_train, tfidf_test
train_y, test_y = y_train, y_test

# Train the forest
model.fit(train_x, train_y)

# Report the configured number of trees
print('Number of Trees used : ', model.n_estimators)

# Accuracy on the training split
predict_train = model.predict(train_x)
accuracy_train = accuracy_score(train_y, predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)

# Accuracy on the held-out split
predict_test = model.predict(test_x)
accuracy_test = accuracy_score(test_y, predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)

Number of Trees used :  128

accuracy_score on train dataset :  1.0

accuracy_score on test dataset :  0.9248496993987976


In [27]:
# SELF INPUT
# Classify sample articles from the prepared test frame with whatever
# classifier is currently bound to `model`. Edit the tuple to try other
# rows (any position between 0 and 996).
for news_index_num in (0, 10):
    tfidf_self_input = tfidf_vectorizer.transform([req_df['text'].iloc[news_index_num]])
    print(req_df.text.iloc[news_index_num])
    print('\nThe news is:', model.predict(tfidf_self_input)[0])

PALO ALTO Calif After years scorning political process Silicon Valley leapt fray The prospect President Donald J Trump pushing tech community traditional role donors embrace new existence agitators activists A distinguished venture capital firm emblazoned corporate home page earthy epithet One prominent tech chieftain says consequences Mr Trumps election range disastrous terrible Another compares dictator And nearly 150 tech leaders signed open letter decrying Mr Trump campaign anger bigotry Not action Peter Thiel founder PayPal Palantir outside investor Facebook spoke Republican convention July The New York Times reported Saturday Mr Thiel giving 1 25 million support Mr Trumps candidacy supporters flee He recently gave 1 million super PAC supports Senator Rob Portman Republican freshman running Ohio Getting involved politics seen clashing Silicon Valleys value You transform world making problems obsolete solving Washington Nor entrepreneurs want alienate segment customers agree politi

In [32]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

train_x = tfidf_train
test_x = tfidf_test
train_y = y_train
test_y = y_test

model = GaussianNB()

# The model is given dense arrays here, so the sparse TF-IDF matrices are
# converted with `.toarray()`. Convert the training matrix once and reuse it
# for both fitting and scoring instead of building the dense copy twice.
train_dense = train_x.toarray()

# fit the model with the training data
model.fit(train_dense, train_y)

# Accuracy on the training split
predict_train = model.predict(train_dense)
# Release the dense copy as soon as it is no longer needed
del train_dense
accuracy_train = accuracy_score(train_y, predict_train)
print('accuracy_score on train dataset : ', accuracy_train)

# Accuracy on the held-out split
predict_test = model.predict(test_x.toarray())
accuracy_test = accuracy_score(test_y, predict_test)
print('accuracy_score on test dataset : ', accuracy_test)

accuracy_score on train dataset :  0.9797043347531947
accuracy_score on test dataset :  0.7945891783567134


In [35]:
# SELF INPUT
# Classify sample articles from the prepared test frame with whatever
# classifier is currently bound to `model`; the TF-IDF row is converted to
# a dense array before prediction. Edit the tuple to try other rows
# (any position between 0 and 996).
for news_index_num in (0, 10):
    tfidf_self_input = tfidf_vectorizer.transform([req_df['text'].iloc[news_index_num]])
    print(req_df.text.iloc[news_index_num])
    print('\nThe news is:', model.predict(tfidf_self_input.toarray())[0])

PALO ALTO Calif After years scorning political process Silicon Valley leapt fray The prospect President Donald J Trump pushing tech community traditional role donors embrace new existence agitators activists A distinguished venture capital firm emblazoned corporate home page earthy epithet One prominent tech chieftain says consequences Mr Trumps election range disastrous terrible Another compares dictator And nearly 150 tech leaders signed open letter decrying Mr Trump campaign anger bigotry Not action Peter Thiel founder PayPal Palantir outside investor Facebook spoke Republican convention July The New York Times reported Saturday Mr Thiel giving 1 25 million support Mr Trumps candidacy supporters flee He recently gave 1 million super PAC supports Senator Rob Portman Republican freshman running Ohio Getting involved politics seen clashing Silicon Valleys value You transform world making problems obsolete solving Washington Nor entrepreneurs want alienate segment customers agree politi