In [17]:
# import necessary libraries
import pandas as pd 
import numpy as np
import nltk
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
from textblob import TextBlob,Word
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import pickle

Dataset Link: https://www.kaggle.com/yufengdev/bbc-fulltext-and-category

In [2]:
# read the csv file
# The original absolute path stays the default so the author's setup keeps
# working. On any other machine, set the BBC_TEXT_CSV environment variable to
# the location of bbc-text.csv instead of editing this cell.
import os
DATA_PATH = os.environ.get(
    'BBC_TEXT_CSV',
    r'C:\Users\MiMs\Desktop\fileorg-master\dataset\bbc-text.csv',
)
df=pd.read_csv(DATA_PATH)
df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [3]:
# Per-column flag: True if the column holds at least one missing value
df.isna().any(axis=0)

category    False
text        False
dtype: bool

In [4]:
# Replace the category names in the target column with integer codes.
# The fitted encoder is kept in `le` so codes can be mapped back to names.
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])
df

Unnamed: 0,category,text
0,4,tv future in the hands of viewers with home th...
1,0,worldcom boss left books alone former worldc...
2,3,tigers wary of farrell gamble leicester say ...
3,3,yeading face newcastle in fa cup premiership s...
4,1,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,0,cars pull down us retail figures us retail sal...
2221,2,kilroy unveils immigration policy ex-chatshow ...
2222,1,rem announce new glasgow concert us band rem h...
2223,2,how political squabbles snowball it s become c...


## Label encoding
`LabelEncoder` assigns the integer codes in alphabetical order of the category names:

business-0|
entertainment-1|
politics-2|
sport-3|
tech-4

In [6]:
# count the stopwords
stop = stopwords.words('english')
def stop_word(df, stop_words=None):
    """Add a ``stopwords_count`` column to ``df`` and print a preview.

    Each ``text`` value is split on whitespace and the tokens that appear in
    the stop-word collection are counted. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``text`` column.
    stop_words : iterable of str, optional
        Words to count. When omitted, the module-level ``stop`` list is used,
        which is what the original one-argument call did.

    Returns
    -------
    None
        The first five rows of ``text`` / ``stopwords_count`` are printed.
    """
    # A set gives constant-time membership tests; scanning the list for every
    # token of every document was needlessly slow.
    lookup = set(stop if stop_words is None else stop_words)
    df['stopwords_count'] = df['text'].apply(
        lambda text: sum(1 for token in text.split() if token in lookup)
    )
    print(df[['text', 'stopwords_count']].head())
stop_word(df)

                                                text  stopwords_count
0  tv future in the hands of viewers with home th...              325
1  worldcom boss  left books alone  former worldc...              108
2  tigers wary of farrell  gamble  leicester say ...              116
3  yeading face newcastle in fa cup premiership s...               74
4  ocean s twelve raids box office ocean s twelve...               81


In [7]:
# count the number of purely numeric tokens
def digits(df):
    """Add a ``count_digits`` column to ``df`` and print the first five rows.

    A token (whitespace-separated piece of ``text``) is counted when
    ``str.isdigit()`` is true for it. ``df`` is modified in place.
    """
    def _numeric_tokens(text):
        total = 0
        for token in text.split():
            if token.isdigit():
                total += 1
        return total

    df['count_digits'] = df['text'].apply(_numeric_tokens)
    preview = df[['text', 'count_digits']].head()
    print(preview)
digits(df)

                                                text  count_digits
0  tv future in the hands of viewers with home th...             3
1  worldcom boss  left books alone  former worldc...             3
2  tigers wary of farrell  gamble  leicester say ...             0
3  yeading face newcastle in fa cup premiership s...             0
4  ocean s twelve raids box office ocean s twelve...             1


In [8]:
# count the number of all-uppercase tokens
def upper_case(df):
    """Add a ``count_uppercase`` column to ``df`` and print the last five rows.

    A whitespace-separated token is counted when ``str.isupper()`` is true
    for it. ``df`` is modified in place.
    """
    def _shouted_tokens(text):
        return sum(1 for token in text.split() if token.isupper())

    df['count_uppercase'] = df['text'].apply(_shouted_tokens)
    preview = df[['text', 'count_uppercase']].tail()
    print(preview)
upper_case(df)
df['count_uppercase'].sum()

                                                   text  count_uppercase
2220  cars pull down us retail figures us retail sal...                0
2221  kilroy unveils immigration policy ex-chatshow ...                0
2222  rem announce new glasgow concert us band rem h...                0
2223  how political squabbles snowball it s become c...                0
2224  souness delight at euro progress boss graeme s...                0


0

In [9]:
# to remove punctuation from the article text
def punctuation_removal(df):
    """Strip punctuation from ``df['text']`` in place and print a preview.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so ``ex-chatshow`` becomes
    ``exchatshow``.
    """
#     ^ : works like NOT operator inside the character class
#     \w : matches any word character (letters, digits, underscore)
#     \s : matches whitespace
    # The raw string avoids invalid-escape warnings for \w and \s.
    # regex=True is stated explicitly because newer pandas versions treat the
    # pattern as a literal string by default, which would silently leave the
    # text unchanged.
    df['text']=df['text'].str.replace(r'[^\w\s]','',regex=True)
    print(df['text'].head())
punctuation_removal(df)

0    tv future in the hands of viewers with home th...
1    worldcom boss  left books alone  former worldc...
2    tigers wary of farrell  gamble  leicester say ...
3    yeading face newcastle in fa cup premiership s...
4    ocean s twelve raids box office ocean s twelve...
Name: text, dtype: object


In [10]:
# removal of stopwords
def remove_stopwords(df, stop_words=None):
    """Drop stop words from ``df['text']`` in place and print a preview.

    Each text is split on whitespace, tokens found in the stop-word
    collection are discarded, and the rest are re-joined with single spaces
    (so runs of whitespace are collapsed as a side effect).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``text`` column.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the module-level ``stop`` list is
        used, which is what the original one-argument call did.
    """
    # Set membership is constant-time; the list was scanned once per token.
    lookup = set(stop if stop_words is None else stop_words)
    df['text'] = df['text'].apply(
        lambda text: " ".join(token for token in text.split() if token not in lookup)
    )
    print(df['text'].head())
remove_stopwords(df)

0    tv future hands viewers home theatre systems p...
1    worldcom boss left books alone former worldcom...
2    tigers wary farrell gamble leicester say rushe...
3    yeading face newcastle fa cup premiership side...
4    ocean twelve raids box office ocean twelve cri...
Name: text, dtype: object


In [11]:
stop_word(df)

                                                text  stopwords_count
0  tv future hands viewers home theatre systems p...                0
1  worldcom boss left books alone former worldcom...                0
2  tigers wary farrell gamble leicester say rushe...                0
3  yeading face newcastle fa cup premiership side...                0
4  ocean twelve raids box office ocean twelve cri...                0


In [12]:
# Ten most common tokens across the whole corpus, with their counts
frequent = (
    pd.Series(' '.join(df['text']).split())
    .value_counts()
    .head(10)
)
frequent

said      7254
mr        3004
would     2577
also      2156
people    2043
new       1970
us        1924
year      1829
one       1763
could     1511
dtype: int64

In [13]:
# removal of frequent words
# `frequent` arrives here as the value_counts() Series of the top tokens; only
# its index (the words themselves) is needed from now on.
# The isinstance guard makes this cell safe to re-run: once `frequent` is a
# list, `frequent.index` is the list method, not a pandas Index, and calling
# list(...) on it would raise.
if isinstance(frequent, pd.Series):
    frequent = list(frequent.index)
def remove_frequent_words(df, words=None):
    """Drop very common words from ``df['text']`` in place and print a preview.

    Each text is split on whitespace, tokens found in the word collection are
    discarded, and the rest are re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``text`` column.
    words : iterable of str, optional
        Words to remove. When omitted, the module-level ``frequent``
        collection is used, which is what the original one-argument call did.
    """
    # Build the lookup once per call; set membership is constant-time.
    unwanted = set(frequent if words is None else words)
    df['text'] = df['text'].apply(
        lambda text: " ".join(token for token in text.split() if token not in unwanted)
    )
    print(df['text'].head())
remove_frequent_words(df)

0    tv future hands viewers home theatre systems p...
1    worldcom boss left books alone former worldcom...
2    tigers wary farrell gamble leicester say rushe...
3    yeading face newcastle fa cup premiership side...
4    ocean twelve raids box office ocean twelve cri...
Name: text, dtype: object


In [14]:
# stemming (preview only)
st = PorterStemmer()
def steamming(df):
    """Return the first five ``text`` entries with every token stemmed by ``st``.

    Nothing is written back to ``df``; the result is only returned for display.
    """
    def _stem_text(text):
        stems = [st.stem(token) for token in text.split()]
        return " ".join(stems)

    first_rows = df['text'][0:5]
    return first_rows.apply(_stem_text)
steamming(df)

0    tv futur hand viewer home theatr system plasma...
1    worldcom boss left book alon former worldcom b...
2    tiger wari farrel gambl leicest say rush make ...
3    yead face newcastl fa cup premiership side new...
4    ocean twelv raid box offic ocean twelv crime c...
Name: text, dtype: object

In [19]:
# lemmatization
def lemmatization(df):
    """Lemmatize every token of ``df['text']`` in place and print a preview.

    Each text is split on whitespace, every token is passed through
    ``Word(token).lemmatize()``, and the results are re-joined with spaces.
    """
    def _lemmatize_text(text):
        lemmas = []
        for token in text.split():
            lemmas.append(Word(token).lemmatize())
        return " ".join(lemmas)

    df['text'] = df['text'].apply(_lemmatize_text)
    print(df['text'].head())
lemmatization(df)

0    tv future hand viewer home theatre system plas...
1    worldcom bos left book alone former worldcom b...
2    tiger wary farrell gamble leicester say rushed...
3    yeading face newcastle fa cup premiership side...
4    ocean twelve raid box office ocean twelve crim...
Name: text, dtype: object


In [20]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
x, y = df['text'], df['category']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [21]:
x_test

384     drive save festive holiday effort made protect...
1983    official respond court row australian tennis t...
985     cup holder man utd visit everton holder manche...
1386    adriano chelsea link rejected adriano agent gi...
1294    driscollgregan lead aid star ireland brian dri...
                              ...                        
2198    asian bank halt dollar slide dollar regained l...
1256    indy buy india paper irish publishing group in...
1637    china aviation seek rescue deal scandalhit jet...
2190    newry fight cup exit court newry city expected...
1581    howard truanted play snooker conservative lead...
Name: text, Length: 668, dtype: object

In [22]:
# Token counts -> TF-IDF weighting -> multinomial Naive Bayes, fitted on the
# training split in one go (fit() returns the pipeline itself).
classi = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(x_train, y_train)
# Quick smoke prediction on a single snippet.
# NOTE(review): this snippet is raw text, while the training text went through
# the punctuation / stop-word / lemmatization cells earlier in the notebook.
# Confirm that the mismatch is intended.
# NOTE(review): x_test / y_test are not scored in the visible cells.
print(classi.predict(['kilroy unveils immigration policy ex-chatshow']))

[2]


In [23]:
# Save the model to a file in the current working directory
with open('classification_model', mode='wb') as model_file:
    pickle.dump(classi, model_file)

In [24]:
# Read the fitted pipeline back from disk to check that it round-trips.
# NOTE: unpickling can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open('classification_model', mode='rb') as model_file:
    pickled_model = pickle.load(model_file)
pickled_model

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [25]:
# Classify three short snippets with the reloaded pipeline
# (codes: business-0, entertainment-1, politics-2, sport-3, tech-4)
pickled_model.predict([
    'tv future in the hands of viewers with home ',
    'barcelona has scored a goal ',
    'worldcom boss left books alone former',
])

array([4, 3, 0])