In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [44]:
# Load the news dataset from disk, report its (rows, columns) shape,
# and preview the first few records.
df = pd.read_csv("news.csv")
print(df.shape)
df.head()

(6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [45]:
# Target column for the classifiers (the preview shows 'FAKE' / 'REAL' values).
labels = df["label"]
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [46]:
# Hold out 20% of the articles for testing; the remaining 80% is used for training.
# The fixed random_state keeps the shuffle, and therefore the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    labels,
    test_size=0.2,
    random_state=7,
)

In [47]:
# TF-IDF vectorizer that strips English stop words and ignores any term that
# occurs in more than 70% of the documents (max_df is a document-frequency proportion).
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words="english")

In [48]:
# Learn the vocabulary and IDF weights from the training text only, then encode
# the test text with that same fitted vectorizer (no refitting on test data).
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [49]:
 # get the first vector out (for the first document)
first_vector_tfidfvectorizer = tfidf_train[0]

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back
# to the old one so the cell still runs on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# place tf-idf values in a pandas data frame
# NOTE: stored under its own name instead of `df`, so the raw dataset loaded
# earlier is not overwritten and the earlier cells stay safe to re-run.
# .toarray() yields a plain (n_features, 1) ndarray rather than np.matrix.
first_doc_tfidf = pd.DataFrame(
    first_vector_tfidfvectorizer.T.toarray(),
    index=feature_names,
    columns=["tfidf"],
)
first_doc_tfidf.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
vicino,0.621315
underground,0.229076
denver,0.182741
elite,0.167129
shelters,0.141529
global,0.133873
apocalyptic,0.125240
underneath,0.124509
imminent,0.114538
survival,0.102683


In [66]:
# Passive-Aggressive classifier trained on the TF-IDF features.
# The estimator shuffles the training data between epochs, so random_state is
# pinned (same seed as the train/test split) to make the reported accuracy and
# confusion matrix reproducible across kernel restarts.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

# Predict on the held-out articles and report plain accuracy.
y_pred = pac.predict(tfidf_test)
print(y_pred)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

# Rows are true labels and columns are predicted labels, both ordered FAKE, REAL.
confusion_matrix(y_test, y_pred, labels=['FAKE','REAL'])

['REAL' 'FAKE' 'REAL' ..., 'REAL' 'FAKE' 'REAL']
Accuracy: 92.74%


array([[588,  50],
       [ 42, 587]])

In [67]:
# Logistic regression trained on the same TF-IDF features, for comparison.
# NOTE: the variable is still called `pac` for compatibility with the rest of
# the notebook, but from here on it holds a LogisticRegression model.
# random_state is pinned because the liblinear/sag/saga solvers shuffle the
# data; it has no effect on solvers that do not use randomness.
pac = LogisticRegression(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

# Predict on the held-out articles and report plain accuracy.
y_pred = pac.predict(tfidf_test)
print(y_pred)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

# Rows are true labels and columns are predicted labels, both ordered FAKE, REAL.
confusion_matrix(y_test, y_pred, labels=['FAKE','REAL'])

['REAL' 'FAKE' 'REAL' ..., 'REAL' 'FAKE' 'REAL']
Accuracy: 91.71%


array([[600,  38],
       [ 67, 562]])

In [68]:
# Linear model fitted with stochastic gradient descent on the same TF-IDF features.
# NOTE: the variable is still called `pac` for compatibility with the rest of
# the notebook, but from here on it holds an SGDClassifier.
# SGD shuffles the training data each epoch, so random_state is pinned (same
# seed as the train/test split) to make the results reproducible.
pac = SGDClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

# Predict on the held-out articles and report plain accuracy.
y_pred = pac.predict(tfidf_test)
print(y_pred)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

# Rows are true labels and columns are predicted labels, both ordered FAKE, REAL.
confusion_matrix(y_test, y_pred, labels=['FAKE','REAL'])

['REAL' 'FAKE' 'REAL' ..., 'REAL' 'FAKE' 'REAL']
Accuracy: 92.74%


array([[597,  41],
       [ 51, 578]])