In [5]:
with open('training.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [7]:
train_x = []
train_y = []

lines = [line.strip() for line in text.split('\n')]
for line in lines[1:]:
    w = line.split('\t')
    if len(w) == 2:
        train_x.append(w[0].strip().lower())
        train_y.append(w[1].strip().lower())    


In [8]:
train_x[:10]

['calvin klein in2u eau de toilette  -  150 ml (for men)',
 'for the love of physics (paperback)',
 'nike fission deodorant spray  -  200 ml (for men)',
 'spoken english (with cd) 2nd edition (paperback)',
 'the c++ programming language 3 edition (paperback)',
 'sony cybershot dsc-w610 point & shoot (black)',
 'ibps bank probationary officers management trainees common written exam. 1st edition (paperback)',
 'tommy hilfiger analog watch  - for women (silver)',
 "dr. seuss's beginner book collection (boxed set)",
 'panasonic sdr-s15 camcorder (silver)']

In [9]:
train_y[:10]

['calvin klein',
 'physics',
 'nike-deodrant',
 'spoken english',
 'c programming',
 'sony cybershot',
 'written english',
 'tommy watch',
 'best-seller books',
 'camcorder']

In [10]:
split = 0.8
total = len(train_x)
train_split = int(total*split)
print('training data size : ', train_split)
print('testing data size : ', (total-train_split))

v_train_x = train_x[:train_split]
v_train_y = train_y[:train_split]

v_test_x = train_x[train_split:]
v_test_y = train_y[train_split:]


training data size :  88
testing data size :  23


# Model Selection & Performance

## K-Fold Cross validation


<b>Code Sample :</b>

<code>
from sklearn.model_selection import cross_val_score
print(len(set(v_train_y)))
accuracies = cross_val_score(estimator = nb_classifier, X=train_x, y=v_train_y, cv=4)
print('All accuracies ', accuracies)
print('Mean accuracies ', accuracies.mean())
print('Std accuracies ', accuracies.std())
</code>

## Grid Search

<b>Code Sample :</b>


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(v_train_x)
nb_classifier = MultinomialNB()
nb_classifier.fit(train_x, v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x)

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_test_y[i], '-----------------', y_predict[i])

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(v_test_y, y_predict)
#print(cm)

from sklearn.model_selection import cross_val_score
print(len(set(v_train_y)))
accuracies = cross_val_score(estimator = nb_classifier, X=train_x, y=v_train_y, cv=8)
print('All accuracies ', accuracies)
print('Mean accuracies ', accuracies.mean())
print('Std accuracies ', accuracies.std())


from sklearn.model_selection import GridSearchCV
parameters = [
    { 'alpha':[1.0, 0.0], 'fit_prior':[True, False] },
    { 'alpha':[3.0, 2.0, 1.0, 0.0], 'fit_prior':[True, False] }
]

grid_search = GridSearchCV(estimator=nb_classifier, param_grid=parameters, scoring='accuracy', cv=8, n_jobs=-1)
grid_search_result = grid_search.fit(train_x, v_train_y)

print('Best accuracy ', grid_search_result.best_score_)
print('Best params ', grid_search_result.best_params_)


Accuracy  0.304347826087
Actual ----------------- Predicted
sony cybershot ----------------- sony cybershot
chemistry ----------------- data structures algorithms
sony cybershot ----------------- sony cybershot
physics ----------------- written english
spoken english ----------------- written english
written english ----------------- written english
camcorder ----------------- dslr canon
c programming ----------------- written english
camcorder ----------------- dslr canon
camera ----------------- camera
physics ----------------- best-seller books
timex watch ----------------- tommy watch
chemistry ----------------- data structures algorithms
c programming ----------------- written english
mathematics ----------------- written english
camera ----------------- sony cybershot
sony cybershot ----------------- sony cybershot
titan watch ----------------- tommy watch
c programming ----------------- written english
spoken english ----------------- written english
best-seller books ----------



Best accuracy  0.795454545455
Best params  {'alpha': 1.0, 'fit_prior': False}


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

vectorizer = CountVectorizer()
train_x = vectorizer.fit_transform(v_train_x)

nb_classifier = MultinomialNB()
nb_classifier.fit(train_x, v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x)

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_test_y[i], '-----------------', y_predict[i])

Accuracy  0.391304347826
Actual ----------------- Predicted
sony cybershot ----------------- sony cybershot
chemistry ----------------- data structures algorithms
sony cybershot ----------------- sony cybershot
physics ----------------- written english
spoken english ----------------- written english
written english ----------------- written english
camcorder ----------------- sony cybershot
c programming ----------------- written english
camcorder ----------------- sony cybershot
camera ----------------- nike-deodrant
physics ----------------- physics
timex watch ----------------- tommy watch
chemistry ----------------- data structures algorithms
c programming ----------------- data structures algorithms
mathematics ----------------- best-seller books
camera ----------------- sony cybershot
sony cybershot ----------------- sony cybershot
titan watch ----------------- titan watch
c programming ----------------- data structures algorithms
spoken english ----------------- spoken english


# Naive Bayes Classifier


### Deriving Bayes Theorem from Conditional Probablity

P(A|B) = P(A &#8745; B)|P(B) ==> P(A &#8745; B) = P(A|B) * P(B)
P(B|A) = P(B &#8745; A)|P(A) ==> P(B &#8745; A) = P(B|A) * P(A)

P(A &#8745; B) = P(B &#8745; A)

So, P(A|B) * P(B) = P(B|A) * P(A)

P(B|A) = P(A|B) * P(B) / P(A)

<img src='./images/bayes_theorem.png'>
<img src='./images/bayes_theorem_1.png'>

https://en.wikipedia.org/wiki/Naive_Bayes_classifier

### Multinomial Naive Bayes

With a multinomial event model, samples (feature vectors) represent the frequencies with which certain events have been generated by a multinomial 
( p 1 , … , p n ) where pi is the probability that event i occurs (or K such multinomials in the multiclass case). A feature vector x = ( x 1 , … , x n ) is then a histogram, with xi counting the number of times event i was observed in a particular instance. <b>This is the event model typically used for document classification</b>.

the multinomial distribution is a generalization of the binomial distribution.
For example, it models the probability of counts for rolling a k-sided die n times. For n independent trials each of which leads to a success for exactly one of k categories, with each category having a given fixed success probability, the multinomial distribution gives the probability of any particular combination of numbers of successes for the various categories.

When k is 2 and n is 1, the multinomial distribution is the Bernoulli distribution. When k is 2 and n is bigger than 1, it is the binomial distribution. When n is 1, it is the categorical distribution.

### Gaussian Naive Bayes

When k is 2 and n is 1, the multinomial distribution is the Bernoulli distribution. When k is 2 and n is bigger than 1, it is the binomial distribution. When n is 1, it is the categorical distribution.


### Bernoullis Naive Bayes

In the multivariate Bernoulli event model, features are independent booleans (binary variables) describing inputs. Like the multinomial model, this model is popular for document classification tasks,[9] where binary term occurrence features are used rather than term frequencies


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(v_train_x)
nb_classifier = MultinomialNB()
nb_classifier.fit(train_x, v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x)

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_test_y[i], '-----------------', y_predict[i])

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(v_test_y, y_predict)
print(cm)

Accuracy  0.304347826087
Actual ----------------- Predicted
sony cybershot ----------------- sony cybershot
chemistry ----------------- data structures algorithms
sony cybershot ----------------- sony cybershot
physics ----------------- written english
spoken english ----------------- written english
written english ----------------- written english
camcorder ----------------- dslr canon
c programming ----------------- written english
camcorder ----------------- dslr canon
camera ----------------- camera
physics ----------------- best-seller books
timex watch ----------------- tommy watch
chemistry ----------------- data structures algorithms
c programming ----------------- written english
mathematics ----------------- written english
camera ----------------- sony cybershot
sony cybershot ----------------- sony cybershot
titan watch ----------------- tommy watch
c programming ----------------- written english
spoken english ----------------- written english
best-seller books ----------

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(v_train_x)
nb_classifier = GaussianNB()
nb_classifier.fit(train_x.toarray(), v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x.toarray())

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('input', '----------', 'Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_train_x[i], '----------', v_test_y[i], '-----------------', y_predict[i])

Accuracy  0.521739130435
input ---------- Actual ----------------- Predicted
calvin klein in2u eau de toilette  -  150 ml (for men) ---------- sony cybershot ----------------- camera
for the love of physics (paperback) ---------- chemistry ----------------- mathematics
nike fission deodorant spray  -  200 ml (for men) ---------- sony cybershot ----------------- camera
spoken english (with cd) 2nd edition (paperback) ---------- physics ----------------- chemistry
the c++ programming language 3 edition (paperback) ---------- spoken english ----------------- spoken english
sony cybershot dsc-w610 point & shoot (black) ---------- written english ----------------- written english
ibps bank probationary officers management trainees common written exam. 1st edition (paperback) ---------- camcorder ----------------- camcorder
tommy hilfiger analog watch  - for women (silver) ---------- c programming ----------------- c programming
dr. seuss's beginner book collection (boxed set) ---------- cam

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(v_train_x)
nb_classifier = BernoulliNB()
nb_classifier.fit(train_x.toarray(), v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x.toarray())

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_test_y[i], '-----------------', y_predict[i])

Accuracy  0.217391304348
Actual ----------------- Predicted
sony cybershot ----------------- camera
chemistry ----------------- written english
sony cybershot ----------------- camera
physics ----------------- written english
spoken english ----------------- written english
written english ----------------- written english
camcorder ----------------- camera
c programming ----------------- written english
camcorder ----------------- camera
camera ----------------- camera
physics ----------------- written english
timex watch ----------------- tommy watch
chemistry ----------------- written english
c programming ----------------- written english
mathematics ----------------- written english
camera ----------------- camera
sony cybershot ----------------- camera
titan watch ----------------- tommy watch
c programming ----------------- written english
spoken english ----------------- written english
best-seller books ----------------- written english
written english ----------------- writte

# SVM Classifier



In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(v_train_x)
nb_classifier = LinearSVC(random_state=0)
nb_classifier.fit(train_x, v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x)

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_test_y[i], '-----------------', y_predict[i])

Accuracy  0.608695652174
Actual ----------------- Predicted
sony cybershot ----------------- camera
chemistry ----------------- mathematics
sony cybershot ----------------- camera
physics ----------------- chemistry
spoken english ----------------- spoken english
written english ----------------- written english
camcorder ----------------- camcorder
c programming ----------------- c programming
camcorder ----------------- camcorder
camera ----------------- camera
physics ----------------- physics
timex watch ----------------- timex watch
chemistry ----------------- data structures algorithms
c programming ----------------- data structures algorithms
mathematics ----------------- best-seller books
camera ----------------- sony cybershot
sony cybershot ----------------- sony cybershot
titan watch ----------------- titan watch
c programming ----------------- c programming
spoken english ----------------- spoken english
best-seller books ----------------- physics
written english ----------

# Decision Tree Classifier

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(v_train_x)
nb_classifier = DecisionTreeClassifier(random_state=0)
nb_classifier.fit(train_x, v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x)

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_test_y[i], '-----------------', y_predict[i])

Accuracy  0.739130434783
Actual ----------------- Predicted
sony cybershot ----------------- camera
chemistry ----------------- written english
sony cybershot ----------------- camera
physics ----------------- chemistry
spoken english ----------------- spoken english
written english ----------------- written english
camcorder ----------------- camcorder
c programming ----------------- c programming
camcorder ----------------- camcorder
camera ----------------- axe deo
physics ----------------- physics
timex watch ----------------- timex watch
chemistry ----------------- chemistry
c programming ----------------- c programming
mathematics ----------------- mathematics
camera ----------------- camera
sony cybershot ----------------- sony cybershot
titan watch ----------------- titan watch
c programming ----------------- c programming
spoken english ----------------- spoken english
best-seller books ----------------- written english
written english ----------------- written english
nike-de

# Random Forest Classifier

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(v_train_x)
nb_classifier = RandomForestClassifier(random_state=0)
nb_classifier.fit(train_x, v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x)

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_test_y[i], '-----------------', y_predict[i])

Accuracy  0.608695652174
Actual ----------------- Predicted
sony cybershot ----------------- camera
chemistry ----------------- data structures algorithms
sony cybershot ----------------- sony cybershot
physics ----------------- written english
spoken english ----------------- spoken english
written english ----------------- written english
camcorder ----------------- camera
c programming ----------------- c programming
camcorder ----------------- camera
camera ----------------- camera
physics ----------------- physics
timex watch ----------------- timex watch
chemistry ----------------- data structures algorithms
c programming ----------------- best-seller books
mathematics ----------------- mathematics
camera ----------------- sony cybershot
sony cybershot ----------------- sony cybershot
titan watch ----------------- titan watch
c programming ----------------- c programming
spoken english ----------------- spoken english
best-seller books ----------------- c programming
written engl

# Logistic Regression Classifier

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(v_train_x)
nb_classifier = LogisticRegression(random_state=0)
nb_classifier.fit(train_x, v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x)

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_test_y[i], '-----------------', y_predict[i])

Accuracy  0.565217391304
Actual ----------------- Predicted
sony cybershot ----------------- sony cybershot
chemistry ----------------- data structures algorithms
sony cybershot ----------------- sony cybershot
physics ----------------- written english
spoken english ----------------- spoken english
written english ----------------- written english
camcorder ----------------- camcorder
c programming ----------------- written english
camcorder ----------------- camcorder
camera ----------------- camera
physics ----------------- best-seller books
timex watch ----------------- timex watch
chemistry ----------------- data structures algorithms
c programming ----------------- written english
mathematics ----------------- best-seller books
camera ----------------- sony cybershot
sony cybershot ----------------- sony cybershot
titan watch ----------------- titan watch
c programming ----------------- data structures algorithms
spoken english ----------------- spoken english
best-seller books -

# K-NN Classifier

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(v_train_x)
nb_classifier = KNeighborsClassifier()
nb_classifier.fit(train_x, v_train_y)

test_x = vectorizer.transform(v_test_x)
y_predict = nb_classifier.predict(test_x)

score = metrics.accuracy_score(v_test_y, y_predict)
print('Accuracy ',score)

print('Actual', '-----------------', 'Predicted')
for i in range(len(v_test_y)):
    print(v_test_y[i], '-----------------', y_predict[i])

Accuracy  0.652173913043
Actual ----------------- Predicted
sony cybershot ----------------- sony cybershot
chemistry ----------------- data structures algorithms
sony cybershot ----------------- sony cybershot
physics ----------------- best-seller books
spoken english ----------------- spoken english
written english ----------------- written english
camcorder ----------------- camcorder
c programming ----------------- c programming
camcorder ----------------- camcorder
camera ----------------- camera
physics ----------------- best-seller books
timex watch ----------------- timex watch
chemistry ----------------- chemistry
c programming ----------------- best-seller books
mathematics ----------------- best-seller books
camera ----------------- sony cybershot
sony cybershot ----------------- sony cybershot
titan watch ----------------- titan watch
c programming ----------------- c programming
spoken english ----------------- spoken english
best-seller books ----------------- written eng

# Neural Net Classifier