In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import regex as re
import nltk
nltk.download('stopwords')

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [3]:
df_train = pd.read_csv('/content/BBC News Train.csv')

In [7]:
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [8]:
df_train.isnull().sum()

ArticleId    0
Text         0
Category     0
dtype: int64

In [10]:
df_train['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

**Defined a function for pre-processing**

In [4]:
def preprocessing(m):
  ps = PorterStemmer()
  corpus = []
  
  for i in range(0, len(m)):
    text = re.sub('^[a-zA-Z0-9]', ' ', m[i])
    text = text.lower()
    text = text.split()
  
    text = [ps.stem(word) for word in text if word not in stopwords.words('english')]
    text = ' '.join(text)
    corpus.append(text)
    
  return corpus

In [5]:
text = preprocessing(df_train['Text'])
le = LabelEncoder()
y = le.fit_transform(df_train['Category'])

**Train-Test-Split**

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(text, y, test_size = 0.3)

In [11]:
pipeline_dt = Pipeline([('TFIDF', TfidfVectorizer()),
                        ('Decision', DecisionTreeClassifier())])
pipeline_nb = Pipeline([('TFIDF', TfidfVectorizer()),
                        ('Naive', MultinomialNB())])
pipeline_svc = Pipeline([('TFIDF', TfidfVectorizer()),
                        ('Naive', SVC(kernel='linear'))])
pipelines = [pipeline_dt, pipeline_nb, pipeline_svc]
pipe_dict = {0 : 'decision', 1: 'Naive', 2: 'SVM'} 


In [15]:
# using train_test_split
for pipe in pipelines:
  pipe.fit(x_train,y_train)
  pred = pipe.predict(x_test)

**Displaying the best model from the Pipeline**

In [17]:
best_acc = 0.0
for i, model in enumerate(pipelines):
  if model.score(x_test,y_test) > best_acc:
    best_acc = model.score(x_test,y_test)
    best_model = pipe_dict[i]
    key = i
print(best_model, best_acc)

Naive 0.9753914988814317


**Cross Vaildation**

In [18]:
#using cross validation
from sklearn.model_selection import cross_val_score
for i, pipe in enumerate(pipelines):
  score=cross_val_score(pipe,x_train,y_train, cv=5)
  print(pipe_dict[i], score.mean())

decision 0.8024567537725431
Naive 0.9635535517114464
SVM 0.9712274567537726
