In [3]:
import os
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [6]:
#getting the dataset

os.chdir('Your default directory where you have the unzipped bbc folder')
type = []
news = []

for folder in os.listdir():
    text_files = os.listdir(folder)
    for file in text_files:
        filepath = folder+'/'+file
        data = open(filepath,"r")
        news.append(data.read())
        type.append(folder)

combined_data = {'category': type, 'news': news}

dataset = pd.DataFrame(combined_data)
dataset.to_csv('../dataset.csv', index=False )

In [9]:
#Encoding the target labels
df = pd.read_csv("../dataset.csv")

label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
df['category']= label_encoder.fit_transform(df['category']) 
#df= df.drop("category",axis =1)

print(df['category'].value_counts())


3    511
0    510
2    417
4    401
1    386
Name: category, dtype: int64


In [10]:
#lemmatization

lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in nltk.word_tokenize(text)]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

df["text_lemmatized"] = df["news"].apply(lemmatize_text)

df.head()

Unnamed: 0,category,news,text_lemmatized
0,0,Ad sales boost Time Warner profit\n\nQuarterly...,Ad sale boost Time Warner profit Quarterly pro...
1,0,Dollar gains on Greenspan speech\n\nThe dollar...,Dollar gain on Greenspan speech The dollar ha ...
2,0,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos unit buyer face loan claim The owner of ...
3,0,High fuel prices hit BA's profits\n\nBritish A...,High fuel price hit BA 's profit British Airwa...
4,0,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod takeover talk lift Domecq Shares in UK ...


In [11]:
#Removing stop words
stopword_list = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    filtered_words = [word for word in nltk.word_tokenize(text) if word not in stopword_list]
    filtered_text = ' '.join(filtered_words)    
    return filtered_text

df["filtered_text"] = df["text_lemmatized"].apply(remove_stopwords)

df.head()

Unnamed: 0,category,news,text_lemmatized,filtered_text
0,0,Ad sales boost Time Warner profit\n\nQuarterly...,Ad sale boost Time Warner profit Quarterly pro...,Ad sale boost Time Warner profit Quarterly pro...
1,0,Dollar gains on Greenspan speech\n\nThe dollar...,Dollar gain on Greenspan speech The dollar ha ...,Dollar gain Greenspan speech The dollar ha hit...
2,0,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos unit buyer face loan claim The owner of ...,Yukos unit buyer face loan claim The owner emb...
3,0,High fuel prices hit BA's profits\n\nBritish A...,High fuel price hit BA 's profit British Airwa...,High fuel price hit BA 's profit British Airwa...
4,0,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod takeover talk lift Domecq Shares in UK ...,Pernod takeover talk lift Domecq Shares UK dri...


In [12]:
training_set, test_set, training_labels, test_labels = train_test_split(df["filtered_text"], df["category"], test_size=0.3, random_state=10)


In [13]:
#Bag of words
vectorizer = CountVectorizer(min_df=2)
bow_train_features = vectorizer.fit_transform(training_set)

bow_test_features = vectorizer.transform(test_set)

In [14]:
mnb = MultinomialNB()
mnb.fit(bow_train_features,training_labels)

predictions = mnb.predict(bow_test_features)

score = mnb.score(bow_test_features, test_labels)
print("Bag of words accuracy ",score)

Bag of words accuracy  0.9805389221556886


In [15]:
#Tfidf 
tfidvectorizer = TfidfVectorizer(min_df=6, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True)
tfid_train_features = tfidvectorizer.fit_transform(training_set)

tfid_test_features = tfidvectorizer.transform(test_set)

In [16]:
mnb = MultinomialNB()

mnb.fit(tfid_train_features,training_labels)

predictions = mnb.predict(tfid_test_features)

score = mnb.score(tfid_test_features, test_labels)
print("Tfidf accuracy ",score)

Tfidf accuracy  0.9745508982035929
