In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.pipeline import Pipeline

In [2]:
news=pd.read_csv("final.csv")

In [3]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44910 entries, 0 to 44909
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   44910 non-null  object
 1   news    44910 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.8+ KB


In [4]:
X_train, X_test, y_train, y_test = train_test_split(news["title"], news['news'], test_size=0.25, random_state=42)

# Model 1

In [5]:
pipe1 = Pipeline([('vect', TfidfVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LinearSVC())])

In [6]:
model1 = pipe1.fit(X_train, y_train)
prediction1 = model1.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction1)*100,2)))

accuracy: 94.48%


In [7]:
print(confusion_matrix(y_test,prediction1))

[[5617  302]
 [ 318 4991]]


In [8]:
print(classification_report(y_test,prediction1))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      5919
           1       0.94      0.94      0.94      5309

    accuracy                           0.94     11228
   macro avg       0.94      0.94      0.94     11228
weighted avg       0.94      0.94      0.94     11228



# Model 2

In [9]:
pipe2 = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 2))),
                 ('tfidf', TfidfTransformer()),
                 ('model', LinearSVC())])

In [10]:
model2 = pipe2.fit(X_train, y_train)
prediction2 = model2.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction2)*100,2)))

accuracy: 95.38%


In [11]:
print(confusion_matrix(y_test,prediction2))

[[5700  219]
 [ 300 5009]]


In [12]:
print(classification_report(y_test,prediction2))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      5919
           1       0.96      0.94      0.95      5309

    accuracy                           0.95     11228
   macro avg       0.95      0.95      0.95     11228
weighted avg       0.95      0.95      0.95     11228



# Model 3

In [13]:
pipe3 = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 3))),
                 ('tfidf', TfidfTransformer()),
                 ('model', LinearSVC())])

In [14]:
model3 = pipe3.fit(X_train, y_train)
prediction3 = model3.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction3)*100,2)))

accuracy: 95.25%


In [15]:
print(confusion_matrix(y_test,prediction3))

[[5699  220]
 [ 313 4996]]


In [16]:
print(classification_report(y_test,prediction3))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      5919
           1       0.96      0.94      0.95      5309

    accuracy                           0.95     11228
   macro avg       0.95      0.95      0.95     11228
weighted avg       0.95      0.95      0.95     11228

