In [322]:
import pandas as pd
import numpy as np
import time
import os
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
#read the files form the folder
main_folder = "C:/Users/ruman/Downloads/Sci-Kit_Learning/7071/dataset_classification/bbc/"
file_data = pd.DataFrame(columns=['Category','File_Name','Data'])

category_list =[]
files_list =[]
data_list = []

for category in os.listdir(main_folder):
    print("\n",category)
    subfolder_path = os.path.join(main_folder,category)
    for files in os.listdir(subfolder_path):
        file_path = os.path.join(subfolder_path,files)
        
        #print(file_path)
        category_list.append(category)
        files_list.append(files)
        file_ptr = open(file_path)
        data = file_ptr.read().split('\n')
        data=list(filter(None, data))
        #data = data.split(' ')
        data_list.append(data)
        
file_data['Category'] = category_list
file_data['File_Name'] = files_list
file_data['Data']  = data_list
    


 business

 entertainment

 politics

 sport

 tech


In [3]:
file_data.head()

Unnamed: 0,Category,File_Name,Data
0,business,001.txt,"[Ad sales boost Time Warner profit, Quarterly ..."
1,business,002.txt,"[Dollar gains on Greenspan speech, The dollar ..."
2,business,003.txt,"[Yukos unit buyer faces loan claim, The owners..."
3,business,004.txt,"[High fuel prices hit BA's profits, British Ai..."
4,business,005.txt,"[Pernod takeover talk lifts Domecq, Shares in ..."


In [4]:
file_data.describe()

Unnamed: 0,Category,File_Name,Data
count,2225,2225,2225
unique,5,511,2127
top,sport,270.txt,"[Millions buy MP3 players in US, One in 10 adu..."
freq,511,5,2


In [5]:
file_data['Category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: Category, dtype: int64

In [6]:
file_data.Data[0][:1000]

['Ad sales boost Time Warner profit',
 'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.',
 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.',
 "Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to

In [7]:
# 0 - business, 1 -entertainment, 2 - politics, 3 - sport, 4 - tech
label_encode = LabelEncoder()
file_data['Label'] = label_encode.fit_transform(file_data['Category'])
file_data.sample(5)

Unnamed: 0,Category,File_Name,Data,Label
342,business,343.txt,"[Air Jamaica back in state control, The Jamaic...",0
1175,politics,280.txt,"[Blair sees greater Bush consensus, George W B...",2
384,business,385.txt,"[Tate & Lyle boss bags top award, Tate & Lyle'...",0
2202,tech,379.txt,"[Apple laptop is 'greatest gadget', The Apple ...",4
1195,politics,300.txt,"[Howard denies split over ID cards, Michael Ho...",2


In [8]:
#convert data into an array
data_array = np.array(file_data['Data'])

In [9]:
stop_words = stopwords.words('english')
ps = PorterStemmer()

In [10]:
#convert into tokens, remove stop words and stem the tokens
tokenizer = RegexpTokenizer('[A-Za-z]\w+')
for idx in range(len(data_array)):   
    data_array[idx] = tokenizer.tokenize(str(data_array[idx]))
    
data_array = [[ps.stem(token) for token in doc if token not in stop_words] for doc in data_array]


In [11]:
file_data['Token_Data']=data_array
file_data.sample(10)
            

Unnamed: 0,Category,File_Name,Data,Label,Token_Data
560,entertainment,051.txt,"[Foxx and Swank win US awards, Jamie Foxx and ...",1,"[foxx, swank, win, US, award, jami, foxx, hila..."
665,entertainment,156.txt,"[Usher leads Soul Train shortlist, Chart-toppi...",1,"[usher, lead, soul, train, shortlist, chart, t..."
2085,tech,262.txt,"[Broadband steams ahead in the US, More and mo...",4,"[broadband, steam, ahead, US, more, american, ..."
431,business,432.txt,"[BA to suspend two Saudi services, British Air...",0,"[BA, suspend, two, saudi, servic, british, air..."
972,politics,077.txt,"[Brown names 16 March for Budget, Chancellor G...",2,"[brown, name, march, budget, chancellor, gordo..."
1408,sport,096.txt,"[Van Nistelrooy set to return, Manchester Unit...",3,"[van, nistelrooy, set, return, manchest, unit,..."
774,entertainment,265.txt,"[Abba reunite for musical premiere, The origin...",1,"[abba, reunit, music, premier, the, origin, st..."
700,entertainment,191.txt,"[Little Britain vies for TV trophy, BBC hits L...",1,"[littl, britain, vie, TV, trophi, bbc, hit, li..."
1198,politics,303.txt,"[Election deal faltered over Heath role, The T...",2,"[elect, deal, falter, heath, role, the, tori, ..."
1035,politics,140.txt,"[UK helps raped Rwandan women, Britain is to g...",2,"[UK, help, rape, rwandan, women, britain, give..."


In [12]:
#replace , with space in token list
file_data['Token_Data2'] = [ ' '.join(map(str,tok)) for tok in file_data['Token_Data']]

file_data.head(2)

Unnamed: 0,Category,File_Name,Data,Label,Token_Data,Token_Data2
0,business,001.txt,"[Ad sales boost Time Warner profit, Quarterly ...",0,"[Ad, sale, boost, time, warner, profit, quarte...",Ad sale boost time warner profit quarterli pro...
1,business,002.txt,"[Dollar gains on Greenspan speech, The dollar ...",0,"[dollar, gain, greenspan, speech, the, dollar,...",dollar gain greenspan speech the dollar hit hi...


## split data into test and training dataset

In [302]:
x_train,x_test,y_train,y_test = train_test_split(file_data['Token_Data2'].values,file_data['Label'].values)

In [303]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape


((1668,), (557,), (1668,), (557,))

In [304]:
x_train[0]

'can smith work scottish wonder the worst kept secret scottish footbal reveal thursday walter smith name new nation manag from moment berti vogt miser tenur charg scotland end former ranger everton boss overwhelm favourit post but smith man must one hardest job footbal the year old take time nation side doldrum scotland reach major final sinc world cup reach germani look near imposs pick two point open three game qualifi race and fifa rank see scotland list time low th like estonia ghana angola thailand scotland bless qualiti player experi top level smith get best meagr resourc smith track record make impress read wide respect within game the man alex ferguson assist scotland play world cup seven leagu titl ranger and appoint wide endors mani game top name includ ferguson graem souness took ibrox assist charact like souness ferguson current ibrox manag alex mcleish cite smith experi expans knowledg scottish game much made vogt inabl express player media that certainli case smith the fo

In [305]:
x_test.shape

(557,)

In [306]:
# Form tf-idf vector
vectorizer = TfidfVectorizer()

In [326]:
test_input = ["This is sports column"]
test_input = np.array(test_input)
x_train_vector = vectorizer.fit_transform(x_train)
x_test_vector =vectorizer.transform(x_test)
test_vector = vectorizer.transform(test_input)
pickle.dump(x_train, open("Training_data.npy", 'wb'))

In [309]:
x_train_vector.shape, x_test_vector.shape,test_vector.shape

((1668, 16925), (557, 16925), (1, 16925))

In [310]:
print(list(label_encode.classes_))

['business', 'entertainment', 'politics', 'sport', 'tech']


In [311]:
# Use multiple classifiers and grid search for prediction
def ML_modeling(models, params, X_train, X_test, y_train, y_test):    
    
    if not set(models.keys()).issubset(set(params.keys())):
        raise ValueError('Some estimators are missing parameters')

    for key in models.keys():
    
        model = models[key]
        param = params[key]
        gs = GridSearchCV(model, param, cv=10, error_score=0, refit=True)
        gs.fit(X_train, y_train)
        y_pred = gs.predict(X_test)
        
        # Print scores for the classifier
        print(key, ':', gs.best_params_)
        print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % (accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='macro'), recall_score(y_test, y_pred, average='macro'), f1_score(y_test, y_pred, average='macro')))
    
    return gs

In [312]:
# reference : https://www.kaggle.com/rockystats/bbc-text-classification-word2vec-vs-tf-idf
# Preparing to make a pipeline 
models = {
    'Naive Bayes': MultinomialNB(), 
}

In [313]:
#params
params = {
    'Naive Bayes': { 'alpha': [0.5, 1], 'fit_prior': [True, False] }, 
}


In [314]:
trained_model_NB = ML_modeling(models, params, x_train_vector, x_test_vector, y_train, y_test)
## ML_modeling method also prints performance scores for each classifier

Naive Bayes : {'alpha': 0.5, 'fit_prior': True}
Accuracy: 0.989 	Precision: 0.989 	Recall: 0.989 		F1: 0.989



In [315]:
trained_model_NB.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=10, error_score=0, estimator=MultinomialNB(),
             param_grid={'alpha': [0.5, 1], 'fit_prior': [True, False]})>

In [316]:
# Test the subject classification model:
def vectorize(test_input):
    #stem and  stop words
    global vectorizer
    stop_words = stopwords.words('english')
    ps = PorterStemmer()
    
    #convert into tokens, remove stop words and stem the tokens
    tokenizer = RegexpTokenizer('[A-Za-z]\w+')
    test_input = tokenizer.tokenize(str(test_input))
    
    test_input = [ps.stem(token) for token in test_input if token not in stop_words]      
#     print(test_input)

    test_input =  [' '.join(map(str,test_input))]
    print(test_input)
    test_input = np.array(test_input)
#     print(test_input)
    test_vector = vectorizer.transform(test_input)
#     print(test_vector)    
    
    return test_vector


In [328]:

test_input = "hello World sensex"
# test_input= test_input.replace('"', ' ')
print(type(test_input))
print(test_input)

# test_input = np.array(test_input)
# print(test_input)
# test_vector = vectorizer.transform(test_input)
# print(test_vector)

<class 'str'>
hello World sensex


In [350]:
#predict the subject classifiction
test_vector= vectorize(test_input)
y_pred = trained_model_NB.predict(test_vector)
y_prob = trained_model_NB.predict_proba(test_vector)
y_pred,y_prob,y_prob[0][np.argmax(y_prob)]

['hello world sensex']


(array([3]),
 array([[0.23188525, 0.16243363, 0.16392107, 0.26346985, 0.17829021]]),
 0.2634698489399912)

In [321]:
# save the model to disk
filename = 'subject_Classification_NB.sav'
pickle.dump(trained_model_NB, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.predict(test_vector)
print(label_encode.inverse_transform(result))

['sport']
