In [25]:
import pandas as pd
import spacy
nlp=spacy.load('en_core_web_lg')
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Read the news dataset from the JSON file into a DataFrame, then display it.
df = pd.read_json('news_dataset.json')
df

Unnamed: 0,text,category
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS
3,This Richard Sherman Interception Literally Sh...,SPORTS
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS
...,...,...
7495,Sex Offender Registries Are Not Really Keeping...,CRIME
7496,'Stockbroker's Bible' Just Told Oil Industry T...,BUSINESS
7497,Want to Change It? Scale It!,BUSINESS
7498,"How To Make A Billion Dollar Drug In 1961, new...",BUSINESS


In [3]:
# Count the rows per category label to check class balance.
df['category'].value_counts()

CRIME       2500
SPORTS      2500
BUSINESS    2500
Name: category, dtype: int64

In [5]:
# Encode each category label as an integer class id in a new column.
df['category_new'] = df['category'].map(
    {'CRIME': 0, 'SPORTS': 1, 'BUSINESS': 2}
)

In [6]:
# Display the DataFrame to inspect the `category_new` column.
df

Unnamed: 0,text,category,category_new
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,0
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,0
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1
3,This Richard Sherman Interception Literally Sh...,SPORTS,1
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,2
...,...,...,...
7495,Sex Offender Registries Are Not Really Keeping...,CRIME,0
7496,'Stockbroker's Bible' Just Told Oil Industry T...,BUSINESS,2
7497,Want to Change It? Scale It!,BUSINESS,2
7498,"How To Make A Billion Dollar Drug In 1961, new...",BUSINESS,2


In [7]:
# Embed every headline as its spaCy document vector.
# nlp.pipe streams the texts through the pipeline in batches, which avoids the
# per-call overhead of invoking nlp() once per row; the resulting vectors are
# the same and are assigned back in row order.
df['vector'] = [doc.vector for doc in nlp.pipe(df['text'])]

In [10]:
# Hold out 20% of the rows for testing, stratified on the integer class id.
x_train, x_test, y_train, y_test = train_test_split(
    df['vector'].values,
    df['category_new'],
    test_size=0.2,
    random_state=42,
    stratify=df['category_new'],
)

In [11]:
# Stack the per-row vectors of each split into a single 2-D array.
x_train_2d, x_test_2d = (np.stack(split) for split in (x_train, x_test))

In [12]:
# Min-max scale the features, then fit a multinomial naive Bayes classifier.
# The final expression evaluates the fitted pipeline on the held-out split.
model = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('naive bayes', MultinomialNB()),
]).fit(x_train_2d, y_train)
model.score(x_test_2d, y_test)

0.692

In [13]:
# Min-max scale the features, then fit a k-nearest-neighbours classifier.
model1 = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier()),
]).fit(x_train_2d, y_train)

# Print the test-set score, followed by the per-class report.
print(model1.score(x_test_2d, y_test))
y_pred1 = model1.predict(x_test_2d)
print(classification_report(y_test, y_pred1))

0.752
              precision    recall  f1-score   support

           0       0.72      0.82      0.77       500
           1       0.73      0.71      0.72       500
           2       0.81      0.73      0.76       500

    accuracy                           0.75      1500
   macro avg       0.75      0.75      0.75      1500
weighted avg       0.75      0.75      0.75      1500



In [14]:
def preprocess(text):
    """Reduce `text` to the lemmas of its content tokens, joined by single spaces.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words, punctuation, or whitespace are dropped; the lemma
    of every remaining token is kept in its original order.

    Args:
        text: The raw string to normalise.

    Returns:
        The surviving lemmas joined with one space each; an empty string when
        no token survives the filter.
    """
    doc = nlp(text)
    # Whitespace-only tokens are neither stop words nor punctuation, so they
    # have to be excluded explicitly; otherwise their blank lemmas would be
    # joined into the output as stray extra spaces.
    return ' '.join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )

In [15]:
# Run every headline through preprocess() and store the result in a new column.
df['text_preprocessed'] = df['text'].map(preprocess)

In [16]:
# Display the DataFrame to inspect the `text_preprocessed` column.
df

Unnamed: 0,text,category,category_new,vector,text_preprocessed
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,0,"[-0.62528795, 0.30613044, -1.1252857, -1.52891...",Larry Nassar blame victim say victimize newly ...
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,0,"[0.07858125, -1.8231387, 0.43582, -0.15502499,...",woman Beats Cancer die fall horse
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1,"[-2.1691844, 1.8227255, -2.8935153, 0.37569532...",vegas taxpayer spend Record $ 750 million New ...
3,This Richard Sherman Interception Literally Sh...,SPORTS,1,"[-1.1437662, -1.0418652, 0.021148713, -0.85042...",Richard Sherman Interception literally shake W...
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,2,"[0.12745197, 0.47146398, -1.4749851, -0.184787...",7 thing totally kill Weed Legalization Buzz
...,...,...,...,...,...
7495,Sex Offender Registries Are Not Really Keeping...,CRIME,0,"[-2.4190354, 1.174435, -2.1933095, 1.2168986, ...",sex offender registry keep child safe problem ...
7496,'Stockbroker's Bible' Just Told Oil Industry T...,BUSINESS,2,"[0.39541775, 1.2807539, -0.03717802, 0.2906668...",Stockbroker Bible tell Oil Industry accept dem...
7497,Want to Change It? Scale It!,BUSINESS,2,"[2.0343125, 4.0026126, -4.4549384, -1.3082138,...",want change scale
7498,"How To Make A Billion Dollar Drug In 1961, new...",BUSINESS,2,"[-2.8487475, -1.0849923, -0.92757547, -1.17545...",billion Dollar Drug 1961 newspaper world run s...


In [17]:
# Embed every preprocessed headline as its spaCy document vector.
# nlp.pipe streams the texts through the pipeline in batches, which avoids the
# per-call overhead of invoking nlp() once per row; the resulting vectors are
# the same and are assigned back in row order.
df['vector_preprocessed'] = [
    doc.vector for doc in nlp.pipe(df['text_preprocessed'])
]

In [19]:
# Redo the 80/20 stratified split, this time on the vectors of the preprocessed text.
x_train, x_test, y_train, y_test = train_test_split(
    df['vector_preprocessed'].values,
    df['category_new'],
    test_size=0.2,
    random_state=42,
    stratify=df['category_new'],
)

In [22]:
# Stack the per-row vectors of each split into a single 2-D array.
x_train_2d, x_test_2d = (np.stack(split) for split in (x_train, x_test))

In [23]:
# Min-max scale the features, then fit a multinomial naive Bayes classifier.
# The final expression evaluates the fitted pipeline on the held-out split.
model = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('naive bayes', MultinomialNB()),
]).fit(x_train_2d, y_train)
model.score(x_test_2d, y_test)

0.8166666666666667

In [24]:
# Min-max scale the features, then fit a k-nearest-neighbours classifier.
model1 = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier()),
]).fit(x_train_2d, y_train)

# Print the test-set score, followed by the per-class report.
print(model1.score(x_test_2d, y_test))
y_pred1 = model1.predict(x_test_2d)
print(classification_report(y_test, y_pred1))

0.8653333333333333
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       500
           1       0.90      0.83      0.86       500
           2       0.88      0.86      0.87       500

    accuracy                           0.87      1500
   macro avg       0.87      0.87      0.87      1500
weighted avg       0.87      0.87      0.87      1500



In [26]:
# Min-max scale the features, then fit a gradient-boosted tree classifier.
# random_state is pinned (to the same seed used for the train/test split) so
# that re-running the notebook reproduces the same fitted model and score.
model2=Pipeline([
    ('scaler',MinMaxScaler()),
    ('gadient boosting',GradientBoostingClassifier(random_state=42))
])
model2.fit(x_train_2d,y_train)

# Print the test-set score, followed by the per-class report.
print(model2.score(x_test_2d,y_test))
y_pred2=model2.predict(x_test_2d)
print(classification_report(y_test,y_pred2))

0.886
              precision    recall  f1-score   support

           0       0.89      0.88      0.89       500
           1       0.88      0.89      0.88       500
           2       0.89      0.89      0.89       500

    accuracy                           0.89      1500
   macro avg       0.89      0.89      0.89      1500
weighted avg       0.89      0.89      0.89      1500

