In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [2]:
df=pd.read_csv('news.csv')
print(df.shape)
df.head()

(6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
labels=df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [4]:
#splitting training 80% and testing 20% 
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2, random_state=7)

In [5]:
#frequent words above 0.7 are not considered and stop words are removed
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7) 

In [6]:
#creating tfidf matrix of docs
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [7]:
 # get the first vector out (for the first document)
first_vector_tfidfvectorizer=tfidf_train[0]
 
# place tf-idf values in a pandas data frame
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
vicino,0.621315
underground,0.229076
denver,0.182741
elite,0.167129
shelters,0.141529
...,...
fatalities,0.000000
fatality,0.000000
fatally,0.000000
fatcats,0.000000


In [8]:
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
y_pred=pac.predict(tfidf_test)
print(y_pred)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')
print(confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
f1_score(y_test,y_pred, pos_label="FAKE")

['REAL' 'FAKE' 'REAL' ... 'REAL' 'FAKE' 'REAL']
Accuracy: 93.13%
[[591  47]
 [ 40 589]]


0.9314420803782506

In [9]:
pac=LogisticRegression(max_iter=50)
pac.fit(tfidf_train,y_train)
y_pred=pac.predict(tfidf_test)
print(y_pred)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')
print(confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
f1_score(y_test,y_pred, pos_label="FAKE")

['REAL' 'FAKE' 'REAL' ... 'REAL' 'FAKE' 'REAL']
Accuracy: 91.71%
[[600  38]
 [ 67 562]]


0.9195402298850576

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
import numpy as np
from sklearn import preprocessing
# model.fit(tfidf_train,y_train)
model = LogisticRegression()
# param_grid = {'n_estimators': [30,50,100]}  #5 folds x 4 params = 20 models
# Create regularization penalty space
max_iter = [30,50,100]

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, max_iter=max_iter)
cv = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
grid = GridSearchCV(model, hyperparameters, cv = cv, 
                    return_train_score=True, scoring='accuracy')
nor=Normalizer()
tfidf_train=nor.transform(tfidf_train)
# tfidf_test=preprocessing.scale(tfidf_test)
grid.fit(tfidf_train, y_train)
print("Best Parameter: {}".format(grid.best_params_))
print("Best Cross Vlidation Score: {}".format(grid.best_score_))
bestModel = grid.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Best Parameter: {'C': 464.15888336127773, 'max_iter': 30}
Best Cross Vlidation Score: 0.9352794344137649


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
y_train_hat  = bestModel.predict(tfidf_train)
y_test_hat  = bestModel.predict(tfidf_test)

in_sample_acc = accuracy_score(y_train,y_train_hat, normalize = True) * 100
out_of_sample_acc = accuracy_score(y_test,y_test_hat, normalize = True) * 100
print("In-sample Accuracy: ", in_sample_acc)
print("Out-of-sample Accuracy: ", out_of_sample_acc)
print(confusion_matrix(y_test,y_test_hat, labels=['FAKE','REAL']))
f1_score(y_test,y_test_hat, pos_label="FAKE")

In-sample Accuracy:  100.0
Out-of-sample Accuracy:  92.73875295974744
[[596  42]
 [ 50 579]]


0.9283489096573209

In [10]:
pac=SGDClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
y_pred=pac.predict(tfidf_test)
print(y_pred)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')
print(confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
f1_score(y_test,y_pred, pos_label="FAKE")

['REAL' 'FAKE' 'REAL' ... 'REAL' 'FAKE' 'REAL']
Accuracy: 92.98%
[[600  38]
 [ 51 578]]


0.9309542280837859

In [11]:
from sklearn.tree import DecisionTreeClassifier
pac = DecisionTreeClassifier()
pac.fit(tfidf_train, y_train)
# pac=PassiveAggressiveClassifier(max_iter=50)
# pac.fit(tfidf_train,y_train)
y_pred=pac.predict(tfidf_test)
print(y_pred)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')
print(confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
f1_score(y_test,y_pred, pos_label="FAKE")

['REAL' 'FAKE' 'REAL' ... 'REAL' 'FAKE' 'FAKE']
Accuracy: 80.35%
[[514 124]
 [125 504]]


0.8050117462803446

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# model.fit(tfidf_train,y_train)
model = RandomForestClassifier()
param_grid = {'n_estimators': [30,50,100]}  #5 folds x 4 params = 20 models
cv = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
grid = GridSearchCV(model, param_grid, cv = cv, 
                    return_train_score=True, scoring='accuracy')
grid.fit(tfidf_train, y_train)
print("Best Parameter: {}".format(grid.best_params_))
print("Best Cross Vlidation Score: {}".format(grid.best_score_))
bestModel = grid.best_estimator_
# let's check in-sample and out-of-sample accuracy
# y_train_hat  = bestModel.predict(tfidf_train)
# y_test_hat  = bestModel.predict(tfidf_test)

# in_sample_acc = accuracy_score(y_train,y_train_hat, normalize = True) * 100
# out_of_sample_acc = accuracy_score(y_test,y_test_hat, normalize = True) * 100
# print("In-sample Accuracy: ", in_sample_acc)
# print("Out-of-sample Accuracy: ", out_of_sample_acc)
# print(confusion_matrix(y_test,y_test_hat, labels=['FAKE','REAL']))
# f1_score(y_test,y_test_hat, pos_label="FAKE")

Best Parameter: {'n_estimators': 100}
Best Cross Vlidation Score: 0.9009501724134574


In [18]:
y_train_hat  = bestModel.predict(tfidf_train)
y_test_hat  = bestModel.predict(tfidf_test)

in_sample_acc = accuracy_score(y_train,y_train_hat, normalize = True) * 100
out_of_sample_acc = accuracy_score(y_test,y_test_hat, normalize = True) * 100
print("In-sample Accuracy: ", in_sample_acc)
print("Out-of-sample Accuracy: ", out_of_sample_acc)
print(confusion_matrix(y_test,y_test_hat, labels=['FAKE','REAL']))
f1_score(y_test,y_test_hat, pos_label="FAKE")

In-sample Accuracy:  100.0
Out-of-sample Accuracy:  89.66061562746646
[[568  70]
 [ 61 568]]


0.8966061562746646

In [34]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=7)
classifier.fit(tfidf_train, y_train)
y_pred=classifier.predict(tfidf_test)
print(y_pred)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')
print(confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
f1_score(y_test,y_pred, pos_label="FAKE")

['FAKE' 'FAKE' 'FAKE' ..., 'FAKE' 'FAKE' 'FAKE']
Accuracy: 55.25%
[[638   0]
 [567  62]]


0.6923494302767228

In [35]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(tfidf_train, y_train)
y_pred=svclassifier.predict(tfidf_test)
print(y_pred)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')
print(confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
f1_score(y_test,y_pred, pos_label="FAKE")
# classification_report(y_test,y_pred)

['REAL' 'FAKE' 'REAL' ..., 'REAL' 'FAKE' 'REAL']
Accuracy: 93.05%
[[598  40]
 [ 48 581]]


0.93146417445482865