In [2]:
import sys

import nltk
import os
import random
from collections import Counter
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier, classify
 
# English stop words from NLTK's stopwords corpus; the feature builders drop any token found here.
# NOTE(review): this is a list, so every `in stoplist` test is a linear scan — a set would be faster.
stoplist = stopwords.words('english')


In [4]:
def train(features, samples_proportion):
    """Split ``features`` into train/test parts and fit a Naive Bayes classifier.

    The split is positional: the leading ``samples_proportion`` share of
    ``features`` (truncated to a whole number of items) becomes the training
    set and everything after it becomes the test set. Nothing is shuffled
    here, so the caller decides the ordering.

    Args:
        features: Sequence of ``(feature_dict, label)`` pairs.
        samples_proportion: Fraction of ``features`` used for training.

    Returns:
        Tuple ``(train_set, test_set, classifier)``.
    """
    cutoff = int(len(features) * samples_proportion)
    train_set = features[:cutoff]
    test_set = features[cutoff:]
    # Report both split sizes before the training step starts.
    print('Training set size = ' + str(len(train_set)) + ' news')
    print('Test set size = ' + str(len(test_set)) + ' news')
    return train_set, test_set, NaiveBayesClassifier.train(train_set)
 

In [5]:
def init_lists(folder):
    """Read every regular file in ``folder`` and return the contents as strings.

    Args:
        folder: Path of the directory to read. A trailing path separator is
            accepted but not required.

    Returns:
        List with one string per file, in ``os.listdir`` order. An empty
        folder yields an empty list.

    Files are opened in text mode with the platform default encoding.
    """
    contents = []
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        # Skip sub-directories (e.g. .ipynb_checkpoints); they cannot be opened as files.
        if not os.path.isfile(path):
            continue
        # `with` closes every handle, not just the last one opened.
        with open(path, 'r') as handle:
            contents.append(handle.read())
    return contents

In [6]:
def preprocess(sentence):
    """Tokenize ``sentence`` and return its lower-cased, lemmatized tokens.

    Every token produced by ``word_tokenize`` is kept; no filtering is
    applied at this stage.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        List of token strings in their original order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    tokens = word_tokenize(sentence)
    return [lemmatize(token.lower()) for token in tokens]

def get_features(text, setting):
    """Turn raw ``text`` into a feature dict with stop words removed.

    Args:
        text: Raw document text; it is passed through ``preprocess``.
        setting: ``'bow'`` maps each token to its occurrence count; any
            other value maps each token to ``True``.

    Returns:
        Dict keyed by token.
    """
    tokens = preprocess(text)
    features = {}
    if setting == 'bow':
        for token, occurrences in Counter(tokens).items():
            if token not in stoplist:
                features[token] = occurrences
    else:
        for token in tokens:
            if token not in stoplist:
                features[token] = True
    return features



In [8]:
def evaluate(train_set, test_set, classifier):
    """Print the classifier's accuracy on both splits and its top features.

    Args:
        train_set: Labelled feature pairs the classifier was trained on.
        test_set: Labelled feature pairs held out from training.
        classifier: Trained classifier to score.

    Returns:
        None. All results are printed.
    """
    print('Accuracy on the training set = ' + str(classify.accuracy(classifier, train_set)))
    print('Accuracy of the test set = ' + str(classify.accuracy(classifier, test_set)))
    # A stray no-op expression (`classify.NaiveBayesClassifier`) used to sit here; it had no effect.
    classifier.show_most_informative_features(20)

In [9]:
# Root folder of the BBC news corpus; each category lives in its own sub-folder.
# NOTE(review): machine-specific absolute path — point this at your own copy of the corpus.
BBC_DIR = 'C:\\Users\\pc\\AppData\\Roaming\\nltk_data\\corpora\\bbc\\'

# Trailing separators are kept so the resulting path strings are exactly the same as before.
enter = init_lists(BBC_DIR + 'entertainment\\')
# The politics path previously contained '\p', an invalid escape sequence; the backslash is now escaped.
politix = init_lists(BBC_DIR + 'politics\\')
sport = init_lists(BBC_DIR + 'sport\\')
tech = init_lists(BBC_DIR + 'tech\\')
# Documents from 'gen' are not given a category label in all_news.
t = init_lists(BBC_DIR + 'gen\\')





In [10]:
# Pair every document with the category it was loaded for.
all_news = [(news, 'entertainment') for news in enter]
all_news += [(news, 'politics') for news in politix]
all_news += [(news, 'sport') for news in sport]
all_news += [(news, 'tech') for news in tech]

# Leading space added before "news" so the message matches the other size reports.
print('Corpus size = ' + str(len(all_news)) + ' news')

# Shuffle with a fixed seed so the train/test split (and the reported accuracy) can be
# reproduced after a kernel restart. A private Random instance leaves the global RNG alone.
# NOTE(review): full reproducibility also assumes the files were listed in the same order.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(all_news)



Corpus size = 1715news


In [11]:
# Convert each (text, label) pair into (feature dict, label), using token counts ('bow') as feature values.
all_features = [(get_features(news, 'bow'), label) for (news, label) in all_news]


In [12]:
# Train on the first 80% of the shuffled features, keep the rest for testing, then print the scores.
train_set, test_set, classifier = train(all_features, 0.8)
evaluate(train_set, test_set, classifier)

Training set size = 1372 news
Test set size = 343 news
Accuracy on the training set = 0.9970845481049563
Accuracy of the test set = 0.9329446064139941
Most Informative Features
                  player = 1               sport : politi =     53.1 : 1.0
               secretary = 1              politi : tech   =     52.4 : 1.0
            conservative = 1              politi : tech   =     49.9 : 1.0
                    tony = 1              politi : sport  =     49.8 : 1.0
                 million = 1                tech : sport  =     49.7 : 1.0
                   prime = 1              politi : sport  =     47.6 : 1.0
                  market = 1                tech : sport  =     46.3 : 1.0
                 liberal = 1              politi : tech   =     46.1 : 1.0
                  shadow = 1              politi : sport  =     44.5 : 1.0
                     via = 1                tech : sport  =     43.8 : 1.0
              government = 1              politi : sport  =     42.9 : 1.

In [13]:

# Show the first document read from the 'gen' folder.
t[0]

'In recent times, we have seen a number of films changing their release dates. After Akshay Kumar agreed to postpone the release date of PadMan to ensure a solo release for Padmaavat, the R Balki directorial will finally hit screens on February 9. The month of February also has a few other interesting films for moviegoers like Aiyaary, Sonu Ke Titu Ki Sweety, Dil Juunglee and Welcome to New York. Here we list the Bollywood films you should to watch out for in February 2018.PadMan – February 9\nAkshay Kumar has been on a winning streak lately. His upcoming film PadMan looks like another hit for the actor. PadMan is inspired by the life of social entrepreneur Arunachalam Muruganantham, who found a way to manufacture affordable sanitary napkins for the women of his village. Directed by R Balki, PadMan also stars Radhika Apte and Sonam Kapoor.\nNeeraj Pandey’s Aiyaary is all set to clash with PadMan on February 9. Aiyaary revolves around a former military colonel (Manoj Bajpayee) who is on

In [14]:
# Tokenize, lower-case and lemmatize the first 'gen' document.
test_list = preprocess(t[0])

In [15]:
# Display the preprocessed tokens (stop words and punctuation are still present at this stage).
test_list

['in',
 'recent',
 'time',
 ',',
 'we',
 'have',
 'seen',
 'a',
 'number',
 'of',
 'film',
 'changing',
 'their',
 'release',
 'date',
 '.',
 'after',
 'akshay',
 'kumar',
 'agreed',
 'to',
 'postpone',
 'the',
 'release',
 'date',
 'of',
 'padman',
 'to',
 'ensure',
 'a',
 'solo',
 'release',
 'for',
 'padmaavat',
 ',',
 'the',
 'r',
 'balki',
 'directorial',
 'will',
 'finally',
 'hit',
 'screen',
 'on',
 'february',
 '9',
 '.',
 'the',
 'month',
 'of',
 'february',
 'also',
 'ha',
 'a',
 'few',
 'other',
 'interesting',
 'film',
 'for',
 'moviegoer',
 'like',
 'aiyaary',
 ',',
 'sonu',
 'ke',
 'titu',
 'ki',
 'sweety',
 ',',
 'dil',
 'juunglee',
 'and',
 'welcome',
 'to',
 'new',
 'york',
 '.',
 'here',
 'we',
 'list',
 'the',
 'bollywood',
 'film',
 'you',
 'should',
 'to',
 'watch',
 'out',
 'for',
 'in',
 'february',
 '2018.padman',
 '–',
 'february',
 '9',
 'akshay',
 'kumar',
 'ha',
 'been',
 'on',
 'a',
 'winning',
 'streak',
 'lately',
 '.',
 'his',
 'upcoming',
 'film',
 'pa

In [17]:
def test_features(text, setting):
    """Build a stop-word-filtered feature dict for a single document.

    Both settings now accept the same input. Previously ``'bow'`` assumed an
    already-tokenized list (a raw string would have been counted character by
    character), while the other setting passed its input to ``preprocess``,
    which tokenizes raw text.

    Args:
        text: Either a raw string, which is run through ``preprocess``, or an
            iterable of already-preprocessed tokens, which is used as is.
        setting: ``'bow'`` maps each token to its occurrence count; any
            other value maps each token to ``True``.

    Returns:
        Dict keyed by token.
    """
    # Normalise once so both branches see a token list.
    tokens = preprocess(text) if isinstance(text, str) else list(text)
    if setting == 'bow':
        return {word: count for word, count in Counter(tokens).items() if word not in stoplist}
    return {word: True for word in tokens if word not in stoplist}


In [19]:
# Count-based features for the already-tokenized 'gen' document.
feat = test_features(test_list,'bow')


In [20]:
# Display the feature dict (token -> count) that will be classified.
feat

{'(': 2,
 ')': 2,
 ',': 11,
 '.': 13,
 '2011': 1,
 '2018.padman': 1,
 '9': 3,
 'actor': 1,
 'affordable': 1,
 'agreed': 1,
 'aiyaary': 3,
 'akshay': 2,
 'aleya': 1,
 'also': 4,
 'another': 1,
 'apte': 1,
 'around': 1,
 'arunachalam': 1,
 'aryan': 1,
 'audience': 1,
 'back': 1,
 'bajpayee': 1,
 'balki': 2,
 'barucha': 1,
 'bollywood': 1,
 'bring': 1,
 'bromance': 1,
 'changing': 1,
 'clash': 1,
 'colonel': 1,
 'conflict': 1,
 'counsellor': 1,
 'date': 2,
 'debut': 1,
 'dil': 2,
 'directed': 1,
 'director': 1,
 'directorial': 1,
 'english': 1,
 'ensure': 1,
 'entrepreneur': 1,
 'fall': 1,
 'favourite': 1,
 'feature': 1,
 'featuring': 1,
 'february': 5,
 'film': 6,
 'filmmaker': 1,
 'finally': 1,
 'former': 2,
 'found': 1,
 'fun': 1,
 'geeky': 1,
 'give': 1,
 'government': 1,
 'ha': 2,
 'hit': 2,
 'hunt': 1,
 'inspired': 1,
 'interest.ad': 1,
 'interesting': 1,
 'juunglee': 2,
 'ka': 1,
 'kapoor': 1,
 'karthik': 1,
 'ke': 2,
 'ki': 2,
 'kumar': 2,
 'lately': 1,
 'life': 1,
 'like': 3,
 'l

In [22]:
def testevaluate(train_set,test_feature):
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    return  classifier.classify(test_feature)



In [23]:
# Predict the category of the 'gen' document; this call trains a classifier from train_set before classifying.
label = testevaluate(train_set,feat)


In [24]:
# Display the predicted category.
label

'entertainment'