In [1]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import random
import string
import re
#tokenize and remove stopword
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english')) 
from nltk.stem.snowball import SnowballStemmer

In [2]:
# NOTE: absolute, machine-specific location of the CSV; adjust it when running elsewhere.
DATASET_PATH = 'A:/PWR/project/SMA/dataset2.csv'

# Load the raw news records and preview the first rows.
dataset = pd.read_csv(DATASET_PATH)
dataset.head()

Unnamed: 0,id,category,headline,short_description
0,1,business,"u.s. launches auto import probe, china vows to...",the investigation could lead to new u.s. tarif...
1,2,business,starbucks says anyone can now sit in its cafes...,the new policy was unveiled weeks after the co...
2,3,business,seattle passes controversial new tax on city's...,"following the council vote, amazons vice presi..."
3,4,business,uber ends forced arbitration in individual cas...,victims will be free to go to court -- but a f...
4,5,business,"chili's hit by data breach, credit and debit c...",the breach is believed to have occurred betwee...


### PREPROCESSING

In [3]:
# PREPROCESSING
def strip_links(text):
    """Replace each run of characters outside the allowed set with ``', '``.

    Despite the name, the pattern used here does not match URLs. It is a
    single negated character class: the backslash before the first ``]``
    escapes it, so the class only ends at the last ``]``. It matches runs of
    characters that are NOT ASCII letters, digits, whitespace, or one of the
    punctuation marks listed in the class -- in practice backslashes and
    non-ASCII characters.

    The whole matched run is replaced, not just its first character, so
    multi-character runs do not leave stray characters behind.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with every matched run replaced by ``', '``.
    """
    # The literal is kept exactly as written: changing its escaping would
    # change where the character class ends and therefore what it matches.
    link_regex = re.compile("([^-_a-zA-Z0-9!@#%&=,/'\";:~`\$\^\*\(\)\+\[\]\.\{\}\|\?\<\>\\]+|[^\s]+)")
    return link_regex.sub(', ', text)

def drop_digits(in_str):
    """Return ``in_str`` with every ASCII digit (0-9) removed."""
    unwanted = "1234567890"
    return "".join(char for char in in_str if char not in unwanted)

def strip_all_entities(text):
    """Blank out punctuation and drop tokens that start with ``@`` or ``#``.

    Every character in ``string.punctuation`` except ``@`` and ``#`` is
    replaced by a space. The text is then split on whitespace, tokens whose
    first character is ``@`` or ``#`` are discarded, and the remaining
    tokens are joined with single spaces.
    """
    entity_prefixes = ('@', '#')

    for mark in string.punctuation:
        if mark in entity_prefixes:
            continue
        text = text.replace(mark, ' ')

    # split() never yields empty or padded tokens, so no extra stripping is needed.
    kept = [token for token in text.split() if not token.startswith(entity_prefixes)]
    return ' '.join(kept)

def stemmers(text):
    """Stem each whitespace-separated token of ``text`` with the English Snowball stemmer.

    Returns the stems joined by single spaces as one string (not a list).
    """
    snowball = SnowballStemmer('english')
    return " ".join(snowball.stem(token) for token in text.split())

In [4]:
# Headlines: run the character clean-up, remove digits and punctuation; no stemming.
headlines2 = [
    strip_all_entities(drop_digits(strip_links(raw_headline)))
    for raw_headline in dataset['headline']
]

# Descriptions: same cleaning steps, followed by stemming of every token.
short2 = [
    stemmers(strip_all_entities(drop_digits(strip_links(raw_description))))
    for raw_description in dataset['short_description']
]

In [5]:
# Widen the column display so long text cells are not cut off in the preview.
pd.set_option('max_colwidth', 1000)

# One row per article: category label, cleaned headline, cleaned and stemmed description.
preproses = pd.DataFrame({
    'category': dataset.category,
    'headline': headlines2,
    'short_desc': short2,
})
preproses.head()

Unnamed: 0,category,headline,short_desc
0,business,u s launches auto import probe china vows to defend its interests,the investig could lead to new u s tariff similar to those impos on import steel and aluminum in march
1,business,starbucks says anyone can now sit in its cafes even without buying anything,the new polici was unveil week after the controversi arrest of two black men at a philadelphia starbuck
2,business,seattle passes controversial new tax on city s biggest companies to combat housing crisis,follow the council vote amazon vice presid drew herden said the compani has resum construct plan for it so call block project in downtown seattl
3,business,uber ends forced arbitration in individual cases of sexual assault harassment,victim will be free to go to court but a few caveat remain
4,business,chili s hit by data breach credit and debit card information compromised,the breach is believ to have occur between march and april


### TF-IDF

In [6]:
# Fit TF-IDF on the stemmed descriptions, with stop_words='english' filtering enabled.
# NOTE(review): the vectorizer is fitted on every row, including the rows that the
# later split holds out for testing, so the IDF weights have seen the test documents.
news_vectorizer = TfidfVectorizer(stop_words='english')
news_matrix = news_vectorizer.fit_transform(preproses.short_desc)

# A parenthesised single-string print emits the same text on Python 2 and Python 3.
n_documents, n_terms = news_matrix.shape
print('There are ' + str(n_documents) + ' documents over a vocabulary feature space of ' + str(n_terms) + ' terms.')

# Newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
if hasattr(news_vectorizer, 'get_feature_names_out'):
    feature_names = news_vectorizer.get_feature_names_out()
else:
    feature_names = news_vectorizer.get_feature_names()

# Dense documents-by-terms frame, labelled by (headline, category).
# NOTE(review): toarray() materialises the whole matrix in memory.
tfidf = pd.DataFrame(news_matrix.toarray(), index=[preproses.headline, preproses.category], columns=feature_names)
tfidf

There are 7350 documents over a vocabulary feature space of 10291 terms.


Unnamed: 0_level_0,Unnamed: 1_level_0,aaron,abandon,abba,abbey,abbi,abbott,abc,abdelbasit,abdic,abduct,...,zionist,zip,zombi,zone,zoo,zte,zucchini,zuckerberg,zuckerborg,zuzu
headline,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
u s launches auto import probe china vows to defend its interests,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
starbucks says anyone can now sit in its cafes even without buying anything,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
seattle passes controversial new tax on city s biggest companies to combat housing crisis,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
uber ends forced arbitration in individual cases of sexual assault harassment,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
chili s hit by data breach credit and debit card information compromised,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
how uber silences women after sexual assaults,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
how amazon is holding seattle hostage,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
bank of america appears to flip on firearm promise with loan to remington,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
ex volkswagen ceo charged in u s over emissions cheating scandal,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
women describe rampant groping sexual harassment at verizon contracted warehouse,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


### DATA TRAINING & TESTING

In [7]:
# The final TEST_SIZE rows are held out for evaluation; all earlier rows are used for training.
TEST_SIZE = 350

training_data, testing_data = tfidf.iloc[:-TEST_SIZE], tfidf.iloc[-TEST_SIZE:]
training_label, testing_label = (
    preproses.category.iloc[:-TEST_SIZE],
    preproses.category.iloc[-TEST_SIZE:],
)
testing_title = preproses.headline.iloc[-TEST_SIZE:]

### KNN

In [9]:
# scikit-learn pieces used in this cell: the classifier and the metrics helpers.
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier: 100 neighbours, cosine metric.
knn = KNeighborsClassifier(metric='cosine', n_neighbors=100)

# Fit on the training rows, then label the held-out rows.
knn.fit(training_data, training_label)
y_pred = knn.predict(testing_data)

# Fraction of held-out rows whose predicted category matches the true one.
knn_accuracy = metrics.accuracy_score(testing_label, y_pred)
print("Accuracy:", knn_accuracy)

('Accuracy:', 0.5885714285714285)


In [10]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class breakdown of the KNN predictions on the held-out rows.
knn_confusion = confusion_matrix(testing_label, y_pred)
knn_report = classification_report(testing_label, y_pred)
print(knn_confusion)
print(knn_report)

[[26  3  0  1 10  8  2]
 [ 4 35  3  1  4  1  2]
 [ 4  5 22  7  3  8  1]
 [ 1  1  1 45  0  1  1]
 [14  6  4  4 19  2  1]
 [ 4  3  4  2  3 29  5]
 [ 2  2  3  8  2  3 30]]
               precision    recall  f1-score   support

     business       0.47      0.52      0.50        50
        crime       0.64      0.70      0.67        50
entertainment       0.59      0.44      0.51        50
 food & drink       0.66      0.90      0.76        50
     politics       0.46      0.38      0.42        50
       sports       0.56      0.58      0.57        50
       travel       0.71      0.60      0.65        50

    micro avg       0.59      0.59      0.59       350
    macro avg       0.59      0.59      0.58       350
 weighted avg       0.59      0.59      0.58       350



### NBC

In [11]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the TF-IDF features.
clf = MultinomialNB()
clf.fit(training_data, training_label)

predicted2 = clf.predict(testing_data)

# Accuracy: share of held-out rows where the prediction equals the true label.
multinomial_hits = (predicted2 == testing_label)
np.mean(multinomial_hits)

0.6142857142857143

In [12]:
# Per-class breakdown of the multinomial naive Bayes predictions on the held-out rows.
nbc_confusion = confusion_matrix(testing_label, predicted2)
nbc_report = classification_report(testing_label, predicted2)
print(nbc_confusion)
print(nbc_report)

[[30  3  1  0  9  5  2]
 [ 2 31  0  1 10  4  2]
 [ 8  3 22  6  3  5  3]
 [ 0  1  0 45  0  0  4]
 [14  2  0  1 26  3  4]
 [ 5  4  4  1  5 29  2]
 [ 5  1  1  6  3  2 32]]
               precision    recall  f1-score   support

     business       0.47      0.60      0.53        50
        crime       0.69      0.62      0.65        50
entertainment       0.79      0.44      0.56        50
 food & drink       0.75      0.90      0.82        50
     politics       0.46      0.52      0.49        50
       sports       0.60      0.58      0.59        50
       travel       0.65      0.64      0.65        50

    micro avg       0.61      0.61      0.61       350
    macro avg       0.63      0.61      0.61       350
 weighted avg       0.63      0.61      0.61       350



In [13]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the same features; fit() returns the estimator, so it can be chained.
clf2 = GaussianNB().fit(training_data, training_label)

predicted3 = clf2.predict(testing_data)

# Accuracy: share of held-out rows where the prediction equals the true label.
gaussian_hits = (predicted3 == testing_label)
np.mean(gaussian_hits)

0.48