In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Newsgroup name -> short display label for the 11 groups used in this notebook.
categories = {
    'comp.graphics': 'Graphics',
    'misc.forsale': 'ForSale',
    'rec.autos': 'Auto',
    'rec.motorcycles': 'Motorcycle',
    'rec.sport.baseball': 'Baseball',
    'rec.sport.hockey': 'Hockey',
    'sci.electronics': 'Electronics',
    'sci.med': 'Medical',
    'sci.space': 'Space',
    'talk.politics.guns': 'Guns',
    'talk.politics.misc': 'Politics',
}

In [3]:
# Training split, restricted to the newsgroups named by the keys of `categories`.
train_data = fetch_20newsgroups(subset='train', categories=list(categories))

In [4]:
# Held-out test split for the same newsgroups (the keys of `categories`).
test_data = fetch_20newsgroups(subset='test', categories=list(categories))

In [5]:
# Check what kind of container the loader returned.
type(train_data)

sklearn.utils.Bunch

In [7]:
# Peek at the integer class labels of the training split.
train_data.target

array([ 7, 10,  0, ...,  3,  2,  6], dtype=int64)

In [8]:
# Pull the documents and their integer labels out of each split.
x_train, y_train = train_data.data, train_data.target
x_test, y_test = test_data.data, test_data.target

In [9]:
# Show the first raw training document.
x_train[0]

"From: Daniel.Prince@f129.n102.z1.calcom.socal.com (Daniel Prince)\nSubject: Placebo effects\nLines: 17\n\nI know that the placebo effect is where a patient feels better or \neven gets better because of his/her belief in the medicine and \nthe doctor administering it.  Is there also an anti-placebo \neffect where the patient dislikes/distrusts doctors and medicine \nand therefore doesn't get better or feel better in spite of the \nmedicine?\n\nIs there an effect where the doctor believes so strongly in a \nmedicine that he/she sees improvement where the is none or sees \nmore improvement than there is?  If so, what is this effect \ncalled?  Is there a reverse of the above effect where the doctor \ndoesn't believe in a medicine and then sees less improvement than \nthere is?  What would this effect be called?  Have these effects \never been studied?  How common are these effects?  Thank you in \nadvance for all replies. \n\n... Information is very valuable but dis-information is MUCH mo

In [10]:
# Integer label of the first training document.
y_train[0]

7

In [11]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np



In [12]:
def text_processing(df):
    """Normalise raw documents into space-joined, lemmatised token strings.

    Each document is lower-cased, stripped of ASCII punctuation, tokenised
    with ``word_tokenize``, filtered against NLTK's English stop-word list
    and lemmatised as a verb with ``WordNetLemmatizer``.

    Parameters
    ----------
    df : iterable of str
        Raw documents. Despite the name this is a plain sequence of strings,
        not a DataFrame. It is left unmodified.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. An empty
        input yields an empty list.
    """
    table = str.maketrans('', '', string.punctuation)
    # A set gives O(1) membership tests; the check runs once per token.
    eng_stopwords = frozenset(stopwords.words('english'))
    wnet = WordNetLemmatizer()

    wordsList = []
    for doc in df:
        # Bind the cleaned text to a local name instead of writing it back
        # into `df`, so the caller's raw documents survive the call.
        cleaned = doc.lower().translate(table)
        # NOTE: punctuation is stripped before stop-word filtering, so
        # apostrophe-less contractions such as "doesnt" are not filtered.
        words = [
            wnet.lemmatize(token, pos='v')
            for token in word_tokenize(cleaned)
            if token not in eng_stopwords
        ]
        wordsList.append(' '.join(words))

    return wordsList

In [13]:
# Clean the training documents.
# NOTE: `x_train` is rebound here, so re-running this cell feeds already-cleaned text back in.
x_train = text_processing(x_train)

In [14]:
# Inspect the first training document after cleaning.
x_train[0]

'danielprincef129n102z1calcomsocalcom daniel prince subject placebo effect line 17 know placebo effect patient feel better even get better hisher belief medicine doctor administer also antiplacebo effect patient dislikesdistrusts doctor medicine therefore doesnt get better feel better spite medicine effect doctor believe strongly medicine heshe see improvement none see improvement effect call reverse effect doctor doesnt believe medicine see less improvement would effect call effect ever study common effect thank advance reply information valuable disinformation much common'

In [15]:
# Learn the vocabulary and IDF weights from the training documents only.
tfidf = TfidfVectorizer()
# Keep the result sparse: a dense 6347 x 73431 float64 array needs several GB,
# and MultinomialNB accepts sparse matrices directly.
x_train = tfidf.fit_transform(x_train)

In [16]:
# Shape of the TF-IDF training matrix.
x_train.shape

(6347, 73431)

In [17]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB()

In [19]:
# Apply the same cleaning, then project onto the vocabulary fitted on the
# training data (transform only, never re-fit on test documents).
x_test = text_processing(x_test)
# Leave the matrix sparse; the classifier predicts on sparse input and a
# dense copy would only cost memory.
x_test = tfidf.transform(x_test)
y_pred = nb.predict(x_test)

In [20]:
# Fraction of test documents whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.8958826313298628

In [21]:
# Classify a hand-written snippet. Fresh names are used so the `test_data`
# split loaded earlier is not overwritten, which would break re-running the
# cell that derives `x_test` / `y_test` from it.
sample_docs = ["After launching a bunch of low-speed electric scooters, Joy E-Bike has launched Skyline (Rs 2.29 lakh), Hurricane (Rs 2.33 lakh), Thunderbolt (Rs 2.33 lakh) and Beast (Rs 2.42 lakh), making these the costliest EVs on sale in the country at the moment. The Skyline and the Thunderbolt are the brand’s fully faired offerings while the Beast and the Hurricane cater to the streetfighter enthusiasts. Just like with the Kabira KM 3000 and the KM 4000, these bikes look like blatant ripoffs of some popular ICE motorcycles."]

sample_clean = text_processing(sample_docs)
# The sparse matrix is passed straight to the classifier; no dense copy is needed.
sample_features = tfidf.transform(sample_clean)
# The result is an integer class index into the loader's target names.
nb.predict(sample_features)

array([3], dtype=int64)