In [30]:
#import files
import os
import string
import random
from collections import defaultdict
from nltk import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pickle

In [25]:
# some variables

BASE_DIR = '/Users/saharsh/Documents/Text Classification/bbc'
LABELS = ['business', 'entertainment', 'politics', 'sport', 'tech']


#some stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('said')
stop_words.add('mr')

In [3]:
#function to create data set

def create_data_set():
    with open('data.txt','w',encoding = 'utf8') as outfile:
        for label in LABELS:
            dir = '%s/%s' % (BASE_DIR , label)
            for filename in os.listdir(dir):
                fullfilename = '%s/%s' % (dir, filename)
                print(fullfilename)
                with open(fullfilename , 'rb') as file:
                    text = file.read().decode(errors='replace').replace('\n','')
                    outfile.write('%s\t%s\t%s\n' % (label,filename,text))

In [4]:
create_data_set()

/Users/saharsh/Documents/Text Classification/bbc/business/289.txt
/Users/saharsh/Documents/Text Classification/bbc/business/504.txt
/Users/saharsh/Documents/Text Classification/bbc/business/262.txt
/Users/saharsh/Documents/Text Classification/bbc/business/276.txt
/Users/saharsh/Documents/Text Classification/bbc/business/510.txt
/Users/saharsh/Documents/Text Classification/bbc/business/060.txt
/Users/saharsh/Documents/Text Classification/bbc/business/074.txt
/Users/saharsh/Documents/Text Classification/bbc/business/048.txt
/Users/saharsh/Documents/Text Classification/bbc/business/114.txt
/Users/saharsh/Documents/Text Classification/bbc/business/100.txt
/Users/saharsh/Documents/Text Classification/bbc/business/128.txt
/Users/saharsh/Documents/Text Classification/bbc/business/470.txt
/Users/saharsh/Documents/Text Classification/bbc/business/316.txt
/Users/saharsh/Documents/Text Classification/bbc/business/302.txt
/Users/saharsh/Documents/Text Classification/bbc/business/464.txt
/Users/sah

In [5]:
#funciton to setup document in an list like [ (lable,text) , (lable,text) , ... ]

def setup_docs():
    docs = []  #contianing (lable,text)
    with open('data.txt','r',encoding='utf8') as datafile:
        for row in datafile:
            part = row.split('\t')
            doc = (part[0],part[2].strip())
            docs.append(doc)
    return docs
    

In [7]:
"""
if __name__ == '__main__':
    create_data_set()
    docs = setup_docs()
    print('Done')
"""

"\nif __name__ == '__main__':\n    create_data_set()\n    docs = setup_docs()\n    print('Done')\n"

In [8]:
docs = setup_docs();

In [26]:
# most frequently occurs words for categories
##3
def get_tokens(text):
    # get individual words
    tokens = word_tokenize(text)
    #remove common words that are useless to us
    tokens = [ t for t in tokens if not t in stop_words]
    return tokens

##2
def clean_text(text):
    #remove punctuations
    text = text.translate(str.maketrans('','',string.punctuation))
    #convert into lower case
    text = text.lower()
    return text


##1
def print_frequent_dist(docs):
    tokens = defaultdict(list)

    #creating large list of commond word for each categories
    for doc in docs:
        doc_label = doc[0]
        doc_text = clean_text(doc[1])            ##2 
        # doc_tokens = word_tokenize(doc_text)
        doc_tokens = get_tokens(doc_text)        ##3
        tokens[doc_label].extend(doc_tokens)

    for category_labels, category_tokens in tokens.items():
        print(category_labels)
        fd = FreqDist(category_tokens)
        print(fd.most_common(20))




In [27]:
print_frequent_dist(docs)

business
[('us', 753), ('year', 571), ('would', 463), ('also', 439), ('new', 410), ('market', 400), ('growth', 363), ('company', 362), ('last', 356), ('economy', 327), ('firm', 313), ('could', 311), ('bank', 306), ('economic', 303), ('sales', 302), ('government', 294), ('oil', 287), ('2004', 282), ('years', 263), ('may', 246)]
entertainment
[('film', 698), ('best', 582), ('music', 413), ('also', 398), ('us', 348), ('one', 340), ('years', 326), ('new', 315), ('show', 287), ('first', 250), ('last', 248), ('awards', 246), ('year', 240), ('number', 227), ('award', 220), ('uk', 206), ('films', 203), ('two', 202), ('director', 201), ('tv', 197)]
politics
[('would', 1051), ('government', 635), ('labour', 587), ('people', 584), ('election', 517), ('blair', 495), ('party', 468), ('also', 450), ('new', 425), ('could', 384), ('minister', 381), ('told', 358), ('brown', 324), ('public', 314), ('plans', 289), ('howard', 286), ('uk', 286), ('one', 285), ('prime', 276), ('say', 266)]
sport
[('game', 4

In [40]:
# model training

#spliting data
def get_splits(docs):
    #scrumble docs
    random.shuffle(docs)

    # x --> document
    # y --> corresponding label
    x_train = []   
    y_train = []

    x_test = []
    y_test = []

    pivot = int(.80 * len(docs))

    for i in range(0,pivot):
        x_train.append(docs[i][1])
        y_train.append(docs[i][0])

    for i in range(pivot,len(docs)):
        x_test.append(docs[i][1])
        y_test.append(docs[i][0])

    return x_train,x_test,y_train,y_test


#evaluating Classifier
def evaluate_classifier(title, classifier, vectorizer , x_test, y_test):
    x_test_tfidf = vectorizer.transform(x_test)
    y_pred = classifier.predict(x_test_tfidf)

    precision = metrics.precision_score(y_test, y_pred, average='weighted')
    recall = metrics.recall_score(y_test, y_pred, average='weighted')
    f1 = metrics.f1_score(y_test, y_pred, average='weighted')

    print("%s\t%f\t%f\t%f\n" % (title,precision,recall,f1))


#traning
def train_classifier(docs):
    x_train,x_test,y_train,y_test = get_splits(docs)
    
    #vectorization of documents
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), min_df=3, analyzer='word')

    #creating doc-term matrix
    dtm = vectorizer.fit_transform(x_train)

    #train Naive bayes classifier
    naive_bayes_classifier = MultinomialNB().fit(dtm, y_train)

    evaluate_classifier("Naive Bayes\tTrain\t" , naive_bayes_classifier , vectorizer , x_train, y_train)
    evaluate_classifier("Naive Bayes\tTest \t" , naive_bayes_classifier , vectorizer , x_test , y_test ) 

    #store this classifier
    clf_filename = 'naive_bayes_classifier.pkl'
    pickle.dump(naive_bayes_classifier, open(clf_filename, 'wb'))

    #store vectorizer
    vec_filename = 'count_vectorizer.pkl'
    pickle.dump(vectorizer,open(vec_filename, 'wb'))


In [42]:
train_classifier(docs)

Naive Bayes	Train		0.996088	0.996067	0.996071

Naive Bayes	Test 		0.984537	0.984270	0.984289



In [44]:
## DEPLOYEMENT


new_data = "Google unveils cutting-edge camera technology, revolutionizing image capture with advanced AI capabilities. The innovative camera promises superior image quality, enhanced low-light performance, and seamless integration with Google's ecosystem. Tech enthusiasts anticipate a game-changing leap in photography and video experiences."



In [45]:
#Deployements functions

def classify(text):
    #loading classifier
    clf_filename = 'naive_bayes_classifier.pkl'
    nb_clf = pickle.load(open(clf_filename,'rb'))

    #vectorizer loading
    vec_filename = 'count_vectorizer.pkl'
    vectorizer = pickle.load(open(vec_filename,'rb'))

    pred = nb_clf.predict(vectorizer.transform([text]))

    print(pred[0])



In [46]:
classify(new_data)

tech


In [47]:
data2 = "Breakthrough in Biomedical Research: Scientists at the forefront of biomedical innovation announce a groundbreaking discovery, opening new possibilities for disease treatment and prevention. Their pioneering work showcases the potential to transform healthcare and improve lives through cutting-edge biomedical advancements"

classify(data2)

tech


In [49]:
data3  = "In a significant move, leaders engage in high-stakes negotiations to address pressing global issues. Tensions rise as policy debates unfold, shaping the geopolitical landscape. The political arena witnesses strategic maneuvers and diplomatic efforts as nations navigate complex challenges, impacting the course of international relations"

classify(data3)

politics


In [50]:
data4 = " Renowned designers introduce a sustainable revolution in the textile industry, unveiling cutting-edge fabrics that blend style with eco-consciousness. The latest collection showcases advancements in smart textiles, offering comfort, durability, and a chic aesthetic. Fashion enthusiasts are eager to embrace this eco-friendly trend in clothing"

classify(data4)

tech


In [51]:
data5 = "Adrenaline Soars in Riveting Athletic Showdown: Intense competition unfolds as teams clash in a thrilling event. Spectators witness remarkable displays of skill and determination. The outcome leaves fans captivated, sparking conversations about the electrifying performance in the dynamic arena of physical prowess."
classify(data5)

sport


In [53]:
data6 = "Cinematic Extravaganza Unfolds: The heart of Bollywood pulsates with vibrant storytelling and mesmerizing performances. A wave of cinematic brilliance sweeps through the industry as filmmakers push creative boundaries, delivering captivating narratives that resonate with audiences worldwide. Bollywood continues to be a powerhouse of entertainment, captivating hearts and minds."
classify(data6)

entertainment
