In [18]:
import os

import pandas as pd
import spacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Location of the labelled news CSV. The default is the path this notebook was
# originally run with; set the NEWS_CSV environment variable to load the file
# from another location without editing this cell.
NEWS_CSV = os.environ.get("NEWS_CSV", "C:\\Users\\SHARIB\\Downloads\\balanced_news.csv")
news = pd.read_csv(NEWS_CSV)

In [3]:
# Preview the first rows of the loaded frame to sanity-check its columns.
news.head()

Unnamed: 0,text,label
0,,0
1,The Senate voted 51-48 this afternoon to proce...,0
2,So much for the SCOTUS not being political Che...,0
3,White House counselor Kellyanne Conway crawled...,0
4,Donald Trump may have decided that Russia is g...,0


In [4]:
# Non-null counts per column for the rows whose label is 0.
news.loc[news['label'].eq(0)].count()

text     5000
label    5000
dtype: int64

In [6]:
# Non-null counts per column for the rows whose label is 1.
news.loc[news['label'].eq(1)].count()

text     5000
label    5000
dtype: int64

In [7]:
# Overall size of the frame as (rows, columns).
news.shape

(10000, 2)

In [8]:
# Hold out 20% of the raw texts for evaluation. The split is stratified on the
# label and seeded so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    news.text,
    news.label,
    test_size=0.2,
    random_state=2022,
    stratify=news.label,
)
X_train.shape

(8000,)

In [9]:
# Size of the held-out split produced by train_test_split above.
X_test.shape

(2000,)

In [10]:
# Model 1: counts of 1- to 3-grams fed to a 10-nearest-neighbour classifier
# that measures closeness with Euclidean distance.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('KNN', KNeighborsClassifier(n_neighbors=10, metric='euclidean')),
])
clf.fit(X_train, y_train)

In [12]:
# Score the Euclidean KNN pipeline on the held-out texts.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.69      0.66      0.67      1000
           1       0.67      0.70      0.69      1000

    accuracy                           0.68      2000
   macro avg       0.68      0.68      0.68      2000
weighted avg       0.68      0.68      0.68      2000



In [13]:
# Model 2: same 1- to 3-gram counts and k=10 as the first KNN pipeline, but
# neighbours are ranked by cosine distance instead of Euclidean distance.
clf2 = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('KNN', KNeighborsClassifier(n_neighbors=10, metric='cosine')),
])
clf2.fit(X_train, y_train)

In [15]:
# Score the cosine KNN pipeline on the held-out texts.
y_pred2 = clf2.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred2))

              precision    recall  f1-score   support

           0       0.57      0.97      0.72      1000
           1       0.91      0.27      0.42      1000

    accuracy                           0.62      2000
   macro avg       0.74      0.62      0.57      2000
weighted avg       0.74      0.62      0.57      2000



In [16]:
# Model 3: trigram counts fed to a random forest.
# RandomForestClassifier is randomized, so without a seed every re-run fits a
# different forest and the reported metrics drift. random_state is fixed to the
# same seed used for train_test_split so the cell is reproducible.
clf3 = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(3, 3))),
    ('RandomForest', RandomForestClassifier(random_state=2022)),
])
clf3.fit(X_train, y_train)

In [17]:
# Score the trigram random-forest pipeline on the held-out texts.
y_pred3 = clf3.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred3))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      1000
           1       0.95      0.92      0.93      1000

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.93      2000
weighted avg       0.93      0.93      0.93      2000



In [19]:
# Model 4: trigram counts fed to multinomial Naive Bayes with smoothing
# parameter alpha=0.75.
clf4 = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(3, 3))),
    ('NaiveBayes', MultinomialNB(alpha=0.75)),
])
clf4.fit(X_train, y_train)

In [20]:
# Score the trigram Naive Bayes pipeline on the held-out texts.
y_pred4 = clf4.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred4))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      1000
           1       0.94      0.99      0.96      1000

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



In [21]:
# Model 5: unigram + bigram counts fed to multinomial Naive Bayes with the
# same alpha=0.75 as the trigram variant.
clf5 = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('NaiveBayes', MultinomialNB(alpha=0.75)),
])
clf5.fit(X_train, y_train)

In [22]:
# Score the unigram + bigram Naive Bayes pipeline on the held-out texts.
y_pred5 = clf5.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred5))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1000
           1       0.96      0.96      0.96      1000

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



In [23]:
# Load the spaCy pipeline named 'en_core_web_sm'; preprocess() below calls it
# through this module-level `nlp` object.
# NOTE(review): preprocess() only reads is_stop / is_punct / lemma_, so some
# pipeline components may be unnecessary — confirm before disabling any.
nlp=spacy.load('en_core_web_sm')

In [24]:
def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize the rest.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped, and the lemmas of the
    remaining tokens are joined with single spaces.

    Args:
        text: The string to clean.

    Returns:
        A space-separated string of the retained tokens' lemmas.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [26]:
# Clean every article with preprocess() and keep the result in a new column
# alongside the raw text.
news['preprocessed_text'] = news['text'].map(preprocess)

In [27]:
# Re-inspect the frame to compare raw `text` with the new `preprocessed_text` column.
news.head()

Unnamed: 0,text,label,preprocessed_text
0,,0,
1,The Senate voted 51-48 this afternoon to proce...,0,Senate vote 51 48 afternoon proceed resolution...
2,So much for the SCOTUS not being political Che...,0,scotus political check comment equality woman ...
3,White House counselor Kellyanne Conway crawled...,0,White House counselor Kellyanne Conway crawl c...
4,Donald Trump may have decided that Russia is g...,0,Donald Trump decide Russia go America s new bf...


In [28]:
# Redo the split on the preprocessed column, with the same test size, seed and
# stratification as the raw-text split. This rebinds X_train / X_test /
# y_train / y_test, so the models fitted below see preprocessed text.
X_train, X_test, y_train, y_test = train_test_split(
    news.preprocessed_text,
    news.label,
    test_size=0.2,
    random_state=2022,
    stratify=news.label,
)

In [29]:
# Model 11: trigram counts of the preprocessed text fed to a random forest.
# RandomForestClassifier is randomized, so without a seed every re-run fits a
# different forest and the reported metrics drift. random_state is fixed to the
# same seed used for train_test_split so the cell is reproducible.
clf11 = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(3, 3))),
    ('RandomForest', RandomForestClassifier(random_state=2022)),
])
clf11.fit(X_train, y_train)

In [30]:
# Score the trigram random forest trained on preprocessed text.
y_pred11 = clf11.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred11))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1000
           1       0.96      0.79      0.87      1000

    accuracy                           0.88      2000
   macro avg       0.89      0.88      0.88      2000
weighted avg       0.89      0.88      0.88      2000



In [31]:
# Model 12: 1- to 3-gram counts of the preprocessed text fed to a random forest.
# RandomForestClassifier is randomized, so without a seed every re-run fits a
# different forest and the reported metrics drift. random_state is fixed to the
# same seed used for train_test_split so the cell is reproducible.
clf12 = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('RandomForest', RandomForestClassifier(random_state=2022)),
])
clf12.fit(X_train, y_train)

In [32]:
# Score the 1- to 3-gram random forest trained on preprocessed text.
y_pred12 = clf12.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred12))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      1000
           1       0.95      0.99      0.97      1000

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000



In [33]:
# clf12 was fitted on the `preprocessed_text` column, so new text must go
# through the same preprocess() step before prediction; feeding raw text gives
# the vectorizer tokens (stop words, inflected forms) it never saw in training.
clf12.predict([
    preprocess("A team of astronomers has announced the discovery of a new planet located in a distant galaxy that is made entirely of diamonds. This remarkable finding could change the way we understand planetary formation and wealth distribution in the universe.")
])

array([0])

In [34]:
# clf12 was fitted on the `preprocessed_text` column, so new text must go
# through the same preprocess() step before prediction; feeding raw text gives
# the vectorizer tokens (stop words, inflected forms) it never saw in training.
clf12.predict([
    preprocess("NASA scientists have confirmed the presence of water on Mars, a breakthrough that could pave the way for future human exploration. The discovery was made using data collected from the Mars Reconnaissance Orbiter.")
])

array([0])