In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.pipeline import Pipeline

In [2]:
news=pd.read_csv("final.csv")

In [3]:
news.head()

Unnamed: 0,title,news
0,donald trump sends embarrassing new year eve m...,0
1,drunk bragging trump staffer started russian c...,0
2,sheriff david clarke becomes internet joke thr...,0
3,trump obsessed even obama coded website image,0
4,pope francis called donald trump christmas speech,0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(news["title"], news['news'], test_size=0.25, random_state=42)

In [5]:
pipe1 = Pipeline([('vect', TfidfVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier())])

In [6]:
model1 = pipe1.fit(X_train, y_train)
prediction1 = model1.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction1)*100,2)))

accuracy: 89.76%


In [7]:
print(confusion_matrix(y_test,prediction1))

[[5282  637]
 [ 513 4796]]


In [8]:
print(classification_report(y_test,prediction1))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      5919
           1       0.88      0.90      0.89      5309

    accuracy                           0.90     11228
   macro avg       0.90      0.90      0.90     11228
weighted avg       0.90      0.90      0.90     11228



In [9]:
pipe2 = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 2))),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier())])

In [10]:
model2 = pipe2.fit(X_train, y_train)
prediction2 = model2.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction2)*100,2)))

accuracy: 84.96%


In [11]:
print(confusion_matrix(y_test,prediction2))

[[5102  817]
 [ 872 4437]]


In [12]:
print(classification_report(y_test,prediction2))

              precision    recall  f1-score   support

           0       0.85      0.86      0.86      5919
           1       0.84      0.84      0.84      5309

    accuracy                           0.85     11228
   macro avg       0.85      0.85      0.85     11228
weighted avg       0.85      0.85      0.85     11228



In [13]:
pipe3 = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 3))),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier())])

In [14]:
model3 = pipe3.fit(X_train, y_train)
prediction3 = model3.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction3)*100,2)))

accuracy: 83.96%


In [15]:
print(confusion_matrix(y_test,prediction3))

[[5107  812]
 [ 989 4320]]


In [16]:
print(classification_report(y_test,prediction3))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      5919
           1       0.84      0.81      0.83      5309

    accuracy                           0.84     11228
   macro avg       0.84      0.84      0.84     11228
weighted avg       0.84      0.84      0.84     11228

