In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit

In [4]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,f1_score,precision_score,precision_recall_curve

In [28]:
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,LabelBinarizer

In [32]:
import os
import pickle

In [7]:
# List the files available under ./data.
os.listdir('./data')

['preprocesseddata_40000.csv', 'preprocesseddata_10000.csv']

In [10]:
# Load the preprocessed dataset; fields in this file are separated by '|'.
# NOTE(review): head() below shows an 'Unnamed: 0' column, i.e. the saved index is
# read back as a data column -- consider index_col=0 if that index is a plain 0..N-1 range.
data=pd.read_csv('./data/preprocesseddata_40000.csv',sep='|')

In [11]:
# Peek at the first rows to confirm the columns parsed as expected.
data.head()

Unnamed: 0,text,topic
0,footbal canada artist grant produc ident footb...,sport
1,ucla stanford footbal stanford quarterback tan...,news
2,it s natur footbal fan m newcastl fan watch b...,sport
3,footbal area roundup oct friday night s hamme...,news
4,it s natur footbal fan m newcastl fan watch b...,sport


In [12]:
# Class distribution of the target; the counts are heavily skewed towards 'news'.
data['topic'].value_counts()

news             26004
business          2887
finance           2073
economics         1709
science           1496
tech              1399
sport             1248
entertainment      999
gaming             609
world              580
politics           320
energy             187
music              131
travel             127
beauty             126
food                94
opinion             11
Name: topic, dtype: int64

In [16]:
# Model input is the `text` column; the target is the `topic` column.
X,y=data['text'],data['topic']

In [15]:
# Single stratified 70/30 split. The seed is fixed so the split -- and every
# number computed from it -- is reproducible after a kernel restart.
ss=StratifiedShuffleSplit(n_splits=1,test_size=0.3,random_state=42)

In [23]:
# split() yields *positional* row indices, so select with .iloc; label-based .loc
# only gives the same rows while the frame keeps its default 0..N-1 index.
# The splitter is configured with n_splits=1, so this loop body runs once.
for train_indexes,test_indexes in ss.split(X,y):
    train_x,train_y=X.iloc[train_indexes],y.iloc[train_indexes]
    test_x,test_y=X.iloc[test_indexes],y.iloc[test_indexes]

In [24]:
# Sanity-check the size of the training split (features and labels must match).
train_x.shape,train_y.shape

((28000,), (28000,))

In [25]:
# Learn the vocabulary and IDF weights from the training split only, and
# vectorise that split in the same pass (fit_transform == fit followed by transform).
# NOTE: this rebinds train_x from raw text to a sparse matrix, so the cell cannot be
# re-run on its own -- re-run the split cell first.
tfidf=TfidfVectorizer(max_features=10000)
train_x=tfidf.fit_transform(train_x)

In [42]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed; the directory is created if it does not exist yet.
os.makedirs('./models',exist_ok=True)
with open('./models/tfidfvectorizer.pkl','wb') as fh:
    pickle.dump(tfidf,fh)

In [26]:
# Rows = training documents, columns = TF-IDF features (capped by max_features).
train_x.shape

(28000, 10000)

In [29]:
# Fit the encoder on the training topics, then map them to integer class ids.
le=LabelEncoder().fit(train_y)
train_y_enc=le.transform(train_y)

In [30]:
# Class names known to the encoder; position in this array is the integer code.
le.classes_

array(['beauty', 'business', 'economics', 'energy', 'entertainment',
       'finance', 'food', 'gaming', 'music', 'news', 'opinion',
       'politics', 'science', 'sport', 'tech', 'travel', 'world'],
      dtype=object)

In [34]:
# Integer class id -> topic name, in the encoder's class order.
mydict=dict(enumerate(le.classes_))
# Save next to the other artefacts: the cells that read the mapper back load it
# from ./models/classmapper.pkl, not from the working directory.
os.makedirs('./models',exist_ok=True)
with open('./models/classmapper.pkl','wb') as fh:
    pickle.dump(mydict,fh)

In [35]:
# Show the id -> class-name mapping that was just built.
mydict

{0: 'beauty',
 1: 'business',
 2: 'economics',
 3: 'energy',
 4: 'entertainment',
 5: 'finance',
 6: 'food',
 7: 'gaming',
 8: 'music',
 9: 'news',
 10: 'opinion',
 11: 'politics',
 12: 'science',
 13: 'sport',
 14: 'tech',
 15: 'travel',
 16: 'world'}

In [43]:
# Persist the fitted label encoder; the context manager closes the file handle.
with open('./models/labelencoder.pkl','wb') as fh:
    pickle.dump(le,fh)

In [36]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
multinb=MultinomialNB()

In [37]:
# Train on the TF-IDF matrix and the integer-encoded topic labels.
multinb.fit(train_x,train_y_enc)

MultinomialNB()

In [40]:
# Mean accuracy on the *training* data only -- an optimistic estimate.
# NOTE(review): test_x/test_y from the split are not used in the cells visible here;
# evaluate on them (after tfidf.transform / le.transform) for a held-out score.
multinb.score(train_x,train_y_enc)

0.6908571428571428

In [45]:
# Persist the trained classifier; the context manager closes the file handle.
with open('./models/multinbmodel.pkl','wb') as fh:
    pickle.dump(multinb,fh)

In [47]:
import nltk
import json
import pickle
from nltk.stem import PorterStemmer
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
import sys
import re

In [63]:
def text_preprocessor(text):
    """Clean a raw document and convert it to a TF-IDF row vector.

    Pipeline: sentence/word tokenisation, removal of English stop words and
    one-character tokens, stripping of non-letter characters, Porter stemming,
    then ``transform`` with the vectorizer pickled at
    ``./models/tfidfvectorizer.pkl``.

    Parameters
    ----------
    text : str
        Raw input document.

    Returns
    -------
    The output of the pickled vectorizer's ``transform`` for the single
    cleaned document (one row).
    """
    stemmer=PorterStemmer()
    # Build the stop-word lookup once instead of re-reading the list for every token.
    # NOTE(review): the membership test is case-sensitive, so capitalised stop
    # words (e.g. "The") pass through -- confirm this matches how the training
    # data was preprocessed before changing it.
    stop_words=set(stopwords.words('english'))
    sents=' '.join(sent_tokenize(text))
    words=[word for word in word_tokenize(sents) if word not in stop_words]
    words=[word for word in words if len(word)>1]
    # Strip everything that is not an ASCII letter (digits, punctuation, symbols).
    words=[re.sub('[^a-zA-Z]+', '',word) for word in words]
    words=' '.join(stemmer.stem(word) for word in words)
    # pickle can execute arbitrary code on load: only load artefacts this notebook wrote.
    with open('./models/tfidfvectorizer.pkl','rb') as fh:
        vectorizer=pickle.load(fh)
    return vectorizer.transform([words])
        

In [64]:
# Sample news snippet used to smoke-test the preprocessing + prediction helpers.
mytext="While Elon Musk has traditionally shown his support for the cryptocurrency Dogecoin, his most recent tweet has some crypto supporters seeking out other currencies as the post seemed to resemble another coin mascot – a bunny.(_/)( •_•)/ > _— Elon Musk (@elonmusk) October 17, 2021 The nondescript tweet from Musk caused bunny-themed coins to jump, including Bunny Park, Pancake Bunny, Little Angry Bunny V2, and Rewards Bunny, all gaining in value, according to Benzinga.Currency BunnyRocket was up 88."

In [65]:
# Clean and vectorise the sample text with the saved TF-IDF vectorizer.
trans_text=text_preprocessor(mytext)

In [66]:
# Inspect the resulting sparse row vector.
trans_text

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 30 stored elements in Compressed Sparse Row format>

In [80]:
def predictor(transformed_text):
    """Predict the topic name for an already vectorised document.

    Loads the pickled model and the id -> class-name mapping from ``./models``
    on every call.

    Parameters
    ----------
    transformed_text
        Feature matrix accepted by the pickled model's ``predict``; only the
        prediction for the first row is used.

    Returns
    -------
    The class name mapped to the predicted label, or ``None`` when the label
    is not present in the class mapper.
    """
    # Context managers close the file handles that the bare open() calls leaked.
    with open('./models/multinbmodel.pkl','rb') as fh:
        model=pickle.load(fh)
    with open('./models/classmapper.pkl','rb') as fh:
        classmapper=pickle.load(fh)
    pred=model.predict(transformed_text)[0]
    return classmapper.get(pred)

In [81]:
# Predict the topic of the vectorised sample text.
predictor(trans_text)

'news'

In [71]:
# Reload the id -> class-name mapping; the context manager closes the file handle.
with open('./models/classmapper.pkl','rb') as fh:
    classmapper=pickle.load(fh)

In [73]:
# Spot-check: look up the class name stored for label 0.
classmapper.get(0)

'beauty'