In [2]:
# import newsGroups data
import sys
import os
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='train',
                          categories=('rec.autos',
                             'rec.sport.hockey'),
                          remove=('headers', 'footers', 'quotes'))

#generate term frequency matrix
from sklearn.feature_extraction.text import CountVectorizer
tf_vec = CountVectorizer (max_df=500, 
                      min_df=0,
                      max_features =10000, 
                      ngram_range =(1,1),
                     stop_words='english')
tf_matrix=tf_vec.fit_transform(news.data[:500])  #sparse matrix
print ("the data has %d rows and %d columns " % (tf_matrix.shape[0], tf_matrix.shape[1]))

import pandas as pd             #conver to full matrix
full_matrix = pd.DataFrame(tf_matrix.todense(),columns=tf_vec.get_feature_names())
#print (full[1:10]) 



#classify data
t=np.asarray(news.target[:500])   # true labels
from sklearn.model_selection import train_test_split
import numpy as np
xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.as_matrix(),t,random_state=50) 

from sklearn.naive_bayes import GaussianNB as NB
clf = NB()
#from sklearn import tree
#clf = tree.DecisionTreeClassifier()
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of %d points: %d" % (xtest.shape[0],error ))



the data has 500 rows and 9241 columns 
number of mislabels out of 125 points: 14


# Evaluation metrics 

In [3]:
import sys
import numpy as np
from sklearn import metrics


print("accuracy  is %2.2f " % (metrics.accuracy_score(ytest, y_pred)))
print("precision is %2.2f " % (metrics.precision_score(ytest, y_pred)))
print("recall    is %2.2f " % (metrics.recall_score(ytest,y_pred)))
print("F1-score  is %2.2f " % (metrics.f1_score(ytest,y_pred)))
     

accuracy  is 0.89 
precision is 0.85 
recall    is 0.91 
F1-score  is 0.88 


# Build a Robust Model

*   Split data into training and test sets using the following techniques:
-
    - hold-out method 
    - cross validation
    - bootstrap

In [6]:
# Hold-out method

from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics


E=[]
no_iter=10
for i in range (0,no_iter): 
   xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.as_matrix(),t,random_state=20) 
   clf = NB()
   y = clf.fit(xtrain, ytrain).predict(xtest)
   acc=metrics.accuracy_score(ytest, y)
   E.append(acc)
   print (acc)
print ("Average Accuracy is %3.3f" % (sum(E)/no_iter) ) 

    

0.92
0.92
0.92
0.92
0.92
0.92
0.92
0.92
0.92
0.92
Average Accuracy is 0.920


In [7]:
# K-fold Cross Validation 

from sklearn.model_selection import KFold
import numpy as np
from sklearn import metrics
t=np.asarray(news.target[:500])   # true labels
kf = KFold(n_splits =10)
i=0
for train, test in kf.split(tf_matrix): 
    xtrain,xtest = tf_matrix[train],  tf_matrix[test]
    ytrain, ytest = t[train], t[test]
    clf = NB()
    y = clf.fit(xtrain.toarray(), ytrain).predict(xtest.toarray())
    acc=metrics.accuracy_score(ytest, y)
    i=i+1

    print (" Accuracy of fold (%d) is %3.3f" % (i,acc ))  
    


 Accuracy of fold (1) is 0.920
 Accuracy of fold (2) is 0.940
 Accuracy of fold (3) is 0.980
 Accuracy of fold (4) is 0.840
 Accuracy of fold (5) is 0.940
 Accuracy of fold (6) is 0.940
 Accuracy of fold (7) is 0.960
 Accuracy of fold (8) is 0.880
 Accuracy of fold (9) is 0.860
 Accuracy of fold (10) is 0.880


# Model Enhancement
- Bagging
- Boosting
- Ensemble
    * Random Forest
    * AdaBost

In [49]:
# -------------  Naive Bayes Classifier  --------------#

# Bagging
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier 
from sklearn.naive_bayes import GaussianNB as NB
from sklearn import tree 

xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.as_matrix(),t,random_state=50)

bagging = BaggingClassifier (NB(), max_samples=.5, max_features=.5)
y_pred = bagging.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("With Bagging - number of mislabels out of %d points: %d" % (xtest.shape[0],error ))


# NO bagging
clf = NB()
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("Witout Bagging - number of mislabels out of %d points: %d" % (xtest.shape[0],error ))


# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(NB(),
                         algorithm="SAMME",
                         n_estimators=200)

y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("With AdaBoost - number of mislabels out of %d points: %d" % (xtest.shape[0],error ))

With Bagging - number of mislabels out of 125 points: 9
Witout Bagging - number of mislabels out of 125 points: 14
With AdaBoost - number of mislabels out of 125 points: 14


In [73]:
# ------ DECISION TREES --------

from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree 


xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.as_matrix(),t,random_state=50)


# NO bagging
clf = tree.DecisionTreeClassifier()
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("Witout Bagging - number of mislabels out of %d points: %d" % (xtest.shape[0],error ))


#Bagging
bagging = BaggingClassifier (tree.DecisionTreeClassifier(), )
y_pred = bagging.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("With Bagging - number of mislabels out of %d points: %d" % (xtest.shape[0],error ))

#random Forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=None,random_state=10, max_features='auto')
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("With Random Forest- number of mislabels out of %d points: %d" % (xtest.shape[0],error ))


# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2),
                         algorithm="SAMME",
                         n_estimators=200)

y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("With AdaBoost - number of mislabels out of %d points: %d" % (xtest.shape[0],error ))

Witout Bagging - number of mislabels out of 125 points: 24
With Bagging - number of mislabels out of 125 points: 22
With Random Forest- number of mislabels out of 125 points: 13
With AdaBoost - number of mislabels out of 125 points: 17


# Multi-Class Classification
- One-vs-All
- One-vs-One

In [17]:
# Mutli-calss classification

# import newsGroups data
import sys
import os
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='train',
                          categories=('rec.autos',
                             'rec.sport.hockey',
                             'sci.med',
                             'sci.space'
                                     ),
                          remove=('headers', 'footers', 'quotes'))

#generate term frequency matrix
from sklearn.feature_extraction.text import CountVectorizer
tf_vec = CountVectorizer (max_df=500, 
                      min_df=0,
                      max_features =30000, 
                      ngram_range =(1,1),
                     stop_words='english')

tf_matrix=tf_vec.fit_transform(news.data)  #sparse matrix
print ("the data has %d rows and %d columns " % (tf_matrix.shape[0], tf_matrix.shape[1]))

import pandas as pd             #conver to full matrix
full_matrix = pd.DataFrame(tf_matrix.todense(),columns=tf_vec.get_feature_names())


#One-vs-All (one-vs-Rest)
import numpy as np
t=np.asarray(news.target)   # true labels
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.as_matrix(),t,random_state=50) 

from sklearn.naive_bayes import GaussianNB as NB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

clf= LinearSVC(random_state=10)
#clf = NB()

y_pred = OneVsRestClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("One-vs-All --> number of mislabels out of %d points in the test test: %d" % (xtest.shape[0],error ))


# One-vs-One (All-vs-All)

from sklearn.multiclass import OneVsOneClassifier
clf= LinearSVC(random_state=10)
#clf = NB()

y_pred = OneVsOneClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("One-vs-One --> number of mislabels out of %d points in the test test: %d" % (xtest.shape[0],error ))

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0).fit(xtrain)
error = (kmeans.labels_ != ytrain).sum()
print ("Kmeans --> number of mislabels out of %d points in the test test: %d" % (xtrain.shape[0],error ))


the data has 2381 rows and 29250 columns 
One-vs-All --> number of mislabels out of 596 points in the test test: 94
One-vs-One --> number of mislabels out of 596 points in the test test: 106
Kmeans --> number of mislabels out of 1785 points in the test test: 1346


# Unsupervised Learning (Clustering)


In [22]:

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0).fit(xtrain)
error = (kmeans.labels_ != ytrain).sum()
print ("Kmeans --> number of mislabels out of %d points in the data: %d" % (xtrain.shape[0],error ))



Kmeans --> number of mislabels out of 1785 points in the data: 1346
