In [7]:
## Section 1 - Parts a and b: Classification using NB

In [8]:
import sys
import os
import numpy as np
import pandas as pd  
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB as NB
from sklearn import tree

# Gaussian Naive Bayes on two 20-newsgroups subsets.
# `news` ("Easy") and `news2` ("Hard") stay bound in the notebook namespace
# because later cells read them again.

# "Easy" Data
cat = ['comp.sys.ibm.pc.hardware',
       'comp.sys.mac.hardware', 'rec.sport.baseball',
       'rec.sport.hockey']
news = fetch_20newsgroups(subset='train',
                          categories=cat,
                          remove=('headers', 'footers', 'quotes'))

# "Hard" Data
cat = ['sci.space', 'rec.motorcycles',
       'rec.autos']
news2 = fetch_20newsgroups(subset='train',
                           categories=cat,
                           remove=('headers', 'footers', 'quotes'))

# Run the same vectorize -> split -> fit -> score pipeline on each corpus.
# The "Hard" corpus goes last so that tf_matrix, full_matrix and t are left
# holding the "Hard" data, exactly as before.
for label, corpus in (("Easy", news), ("Hard", news2)):
    # min_df=1 keeps every term, like the former min_df=0, but is accepted by
    # current scikit-learn (an integer min_df must be >= 1).
    term_freq = CountVectorizer(max_df=500,
                                min_df=1,
                                max_features=10000,
                                ngram_range=(1, 1),
                                stop_words='english')
    # Only the first 500 documents of each corpus are used.
    tf_matrix = term_freq.fit_transform(corpus.data[:500])
    print("Dim: %d rows and %d columns " % (tf_matrix.shape[0], tf_matrix.shape[1]))

    # get_feature_names_out() replaces the removed get_feature_names().
    full_matrix = pd.DataFrame(tf_matrix.toarray(),
                               columns=term_freq.get_feature_names_out())

    t = np.asarray(corpus.target[:500])

    # to_numpy() replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.to_numpy(), t, random_state=50)

    clf = NB()
    y_pred = clf.fit(xtrain, ytrain).predict(xtest)
    error = (y_pred != ytest).sum()
    print("\"%s\" Data: Mislabeled Points: %d out of %d" % (label, error, xtest.shape[0]))

 

Dim: 500 rows and 8021 columns 
"Easy" Data: Mislabeled Points: 36 out of 125
Dim: 500 rows and 10000 columns 
"Hard" Data: Mislabeled Points: 41 out of 125


In [9]:
## Section 1 - Part c: Classification using the Decision Tree Classifier

In [10]:
# Decision Tree on the "Easy" (news) and "Hard" (news2) corpora fetched by an
# earlier cell. The "Hard" corpus goes last so that tf_matrix, full_matrix and
# t are left holding the "Hard" data, which the following cells rely on.
for label, corpus in (("Easy", news), ("Hard", news2)):
    # min_df=1 keeps every term, like the former min_df=0, but is accepted by
    # current scikit-learn (an integer min_df must be >= 1).
    term_freq = CountVectorizer(max_df=500,
                                min_df=1,
                                max_features=10000,
                                ngram_range=(1, 1),
                                stop_words='english')
    # Only the first 500 documents of each corpus are used.
    tf_matrix = term_freq.fit_transform(corpus.data[:500])
    print("Dim: %d rows and %d columns " % (tf_matrix.shape[0], tf_matrix.shape[1]))

    # get_feature_names_out() replaces the removed get_feature_names().
    full_matrix = pd.DataFrame(tf_matrix.toarray(),
                               columns=term_freq.get_feature_names_out())

    t = np.asarray(corpus.target[:500])

    # to_numpy() replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.to_numpy(), t, random_state=50)

    # Seed the tree so that a re-run reproduces the same error count.
    clf = tree.DecisionTreeClassifier(random_state=10)
    y_pred = clf.fit(xtrain, ytrain).predict(xtest)
    error = (y_pred != ytest).sum()
    print("\"%s\" Data: Mislabeled Points: %d out of %d" % (label, error, xtest.shape[0]))

Dim: 500 rows and 8021 columns 
"Easy" Data: Mislabeled Points: 43 out of 125
Dim: 500 rows and 10000 columns 
"Hard" Data: Mislabeled Points: 54 out of 125


In [11]:
## Section 1 - Part d - Discussion: The NB classifier performed better than the Decision Tree classifier
## on both the "easy" and the "hard" data

In [12]:
## Section 2 - Parts a,b, and c: Classification using Ensemble methods

In [13]:
# Using "Hard" data from above
# (full_matrix and t are whatever the previous cell left in the namespace).

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import metrics

# to_numpy() replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.to_numpy(), t, random_state=50)

# Bagging draws random sample/feature subsets; seed it so the count is reproducible.
bagging = BaggingClassifier(NB(), max_samples=.5, max_features=.5, random_state=10)
y_pred = bagging.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print("With Bagging - Mislabeled Points: %d out of %d" % (error, xtest.shape[0]))

clf = AdaBoostClassifier(NB(),
                         algorithm="SAMME",
                         n_estimators=300)
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print("With AdaBoost - Mislabeled Points: %d out of %d" % (error, xtest.shape[0]))

# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' has been
# removed from current scikit-learn.
clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=10, max_features='sqrt')
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print("With Random Forest- Mislabeled Points: %d out of %d" % (error, xtest.shape[0]))

With Bagging - Mislabeled Points: 36 out of 125
With AdaBoost - Mislabeled Points: 41 out of 125
With Random Forest- Mislabeled Points: 39 out of 125


In [14]:
# k-folds with bagging
# Per-fold accuracies for the 10-, 20- and 30-fold runs respectively.
E1, E2, E3 = [], [], []

# One loop replaces three copy-pasted stanzas; tf_matrix (sparse) and t come
# from the notebook namespace as left by the earlier cells.
for n_splits, fold_scores in zip((10, 20, 30), (E1, E2, E3)):
    kf = KFold(n_splits)

    for train, test in kf.split(tf_matrix):
        xtrain, xtest = tf_matrix[train], tf_matrix[test]
        ytrain, ytest = t[train], t[test]
        # Seeded so that the reported averages can be reproduced on re-run.
        clf = BaggingClassifier(NB(), max_samples=.5, max_features=.5, random_state=10)
        # The sparse slices are densified before being handed to the estimator.
        y = clf.fit(xtrain.toarray(), ytrain).predict(xtest.toarray())
        acc = metrics.accuracy_score(ytest, y)
        fold_scores.append(acc)

    avg_accuracy = sum(fold_scores)/n_splits
    print(" Bagging: Average Accuracy with %d folds is %3.3f" % (n_splits, avg_accuracy))



 Bagging: Average Accuracy with 10 folds is 0.760
 Bagging: Average Accuracy with 20 folds is 0.772
 Bagging: Average Accuracy with 30 folds is 0.739


In [15]:
 # k-folds with Adaboost
# Per-fold accuracies for the 10-, 20- and 30-fold runs respectively.
E4, E5, E6 = [], [], []

# Same evaluation for every fold count; tf_matrix (sparse) and t come from
# the notebook namespace as left by the earlier cells.
for n_splits, fold_scores in zip((10, 20, 30), (E4, E5, E6)):
    kf = KFold(n_splits)

    for train, test in kf.split(tf_matrix):
        xtrain, xtest = tf_matrix[train], tf_matrix[test]
        ytrain, ytest = t[train], t[test]
        clf = AdaBoostClassifier(NB(),
                                 algorithm="SAMME",
                                 n_estimators=300)
        # The sparse slices are densified before being handed to the estimator.
        y = clf.fit(xtrain.toarray(), ytrain).predict(xtest.toarray())
        acc = metrics.accuracy_score(ytest, y)
        fold_scores.append(acc)

    avg_accuracy = sum(fold_scores)/n_splits
    print(" Adaboost: Average Accuracy with %d folds is %3.3f" % (n_splits, avg_accuracy))


 Adaboost: Average Accuracy with 10 folds is 0.790
 Adaboost: Average Accuracy with 20 folds is 0.792
 Adaboost: Average Accuracy with 30 folds is 0.786


In [16]:
# k-folds with Randomforests
# Per-fold accuracies for the 10-, 20- and 30-fold runs respectively.
E7, E8, E9 = [], [], []

# One loop replaces three copy-pasted stanzas; tf_matrix (sparse) and t come
# from the notebook namespace as left by the earlier cells.
for n_splits, fold_scores in zip((10, 20, 30), (E7, E8, E9)):
    kf = KFold(n_splits)

    for train, test in kf.split(tf_matrix):
        xtrain, xtest = tf_matrix[train], tf_matrix[test]
        ytrain, ytest = t[train], t[test]
        # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' has
        # been removed from current scikit-learn.
        clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=10, max_features='sqrt')
        # The sparse slices are densified before being handed to the estimator.
        y = clf.fit(xtrain.toarray(), ytrain).predict(xtest.toarray())
        acc = metrics.accuracy_score(ytest, y)
        fold_scores.append(acc)

    avg_accuracy = sum(fold_scores)/n_splits
    print(" Randomforest: Average Accuracy with %d folds is %3.3f" % (n_splits, avg_accuracy))


 Randomforest: Average Accuracy with 10 folds is 0.746
 Randomforest: Average Accuracy with 20 folds is 0.764
 Randomforest: Average Accuracy with 30 folds is 0.766


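Discussion: AdaBoost gave the best performance (based on accuracy) at every fold count tested.  Changing the number of folds resulted in relatively stable performance.  In the run recorded above, the largest gain in accuracy from 10 to 30 folds was achieved by Random Forest (0.746 to 0.766); Bagging improved between 10 and 20 folds (0.760 to 0.772) but dropped at 30 folds (0.739), whereas the accuracy of AdaBoost decreased slightly when going from 10 to 30 folds (0.790 to 0.786), with a slight increase between 10 and 20 (0.792).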
    

In [17]:
# Section 3: Multi-label Classification   

In [18]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer

# One-vs-all classification

# NOTE: this rebinds `news`, `tf_matrix`, `full_matrix` and `t`, which earlier
# cells used for the "Easy"/"Hard" data.
news = fetch_20newsgroups(subset='train',
                          categories=('comp.graphics', 'rec.autos', 'talk.politics.guns', 'soc.religion.christian'),
                          remove=('headers', 'footers', 'quotes'))

# Generate the term frequency matrix.
# min_df=1 keeps every term, like the former min_df=0, but is accepted by
# current scikit-learn (an integer min_df must be >= 1).
tf_vec = CountVectorizer(max_df=500,
                         min_df=1,
                         max_features=30000,
                         ngram_range=(1, 1),
                         stop_words='english')

tf_matrix = tf_vec.fit_transform(news.data)  # sparse matrix
print("Dim: %d rows and %d columns  " % (tf_matrix.shape[0], tf_matrix.shape[1]))

# get_feature_names_out() replaces the removed get_feature_names().
full_matrix = pd.DataFrame(tf_matrix.toarray(), columns=tf_vec.get_feature_names_out())

# One-vs-All (One-vs-Rest)

t = np.asarray(news.target)   # true labels

# to_numpy() replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.to_numpy(), t, random_state=50)

# Same wrap/fit/predict/report stanza for each base classifier. The decision
# tree is seeded so that its error count is reproducible.
base_classifiers = (
    ("SVC", LinearSVC(random_state=10)),
    ("NB", NB()),
    ("DecisionTree", tree.DecisionTreeClassifier(random_state=10)),
)
for name, clf in base_classifiers:
    y_pred = OneVsRestClassifier(clf).fit(xtrain, ytrain).predict(xtest)
    error = (y_pred != ytest).sum()
    print("One-vs-All: %s --> Mislabeled Points: %d out of %d" % (name, error, xtest.shape[0]))

 





Dim: 2323 rows and 29847 columns  
One-vs-All: SVC --> Mislabeled Points: 92 out of 581
One-vs-All: NB --> Mislabeled Points: 125 out of 581
One-vs-All: DecisionTree --> Mislabeled Points: 116 out of 581


In [19]:
# One-vs-One (All-vs-All)
# Reuses the xtrain/xtest/ytrain/ytest split left in the namespace by the
# previous cell.

from sklearn.multiclass import OneVsOneClassifier

# Same wrap/fit/predict/report stanza for each base classifier. The decision
# tree is seeded so that its error count is reproducible.
base_classifiers = (
    ("SVC", LinearSVC(random_state=10)),
    ("NB", NB()),
    ("DecisionTree", tree.DecisionTreeClassifier(random_state=10)),
)
for name, clf in base_classifiers:
    y_pred = OneVsOneClassifier(clf).fit(xtrain, ytrain).predict(xtest)
    error = (y_pred != ytest).sum()
    print("One-vs-One: %s --> Mislabeled Points: %d out of %d" % (name, error, xtest.shape[0]))

One-vs-One: SVC --> Mislabeled Points: 100 out of 581
One-vs-One: NB --> Mislabeled Points: 71 out of 581
One-vs-One: DecisionTree --> Mislabeled Points: 117 out of 581


Discussion:  SVC performs best with one-vs-all classification and NB performs best with one-vs-one. 

How does the boosting classifier (AdaBoost) process the weights of the data points to focus on misclassified data points?  
For each training instance, the algorithm calculates the error the weak classifier made when predicting the output variable.  This error is 0 if the label predicted by the weak learner equals the true label from the training set, and 1 otherwise.  These errors are then used in a weight adjustment formula: the weights of misclassified instances are increased, so the next weak learner concentrates on the points the previous one got wrong.