In [1]:
from sklearn.datasets import fetch_mldata
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
%matplotlib inline

# MNIST

In [2]:
def get_mnist_with_pca(ncomp,
                       data_home="/Users/sasankauppu/Desktop/Data Mining CS6220/",
                       random_state=None):
    """Load MNIST and project it onto its leading principal components.

    The first 60000 rows are used as the training split and the last 10000
    rows as the test split. PCA is fitted on the training rows only and the
    same fitted projection is then applied to the test rows.

    Parameters
    ----------
    ncomp : int
        Number of principal components to keep.
    data_home : str, optional
        Directory where ``fetch_mldata`` caches/looks up the dataset.
        Defaults to the path originally hard-coded in this notebook so
        existing calls keep working; pass your own directory on another
        machine.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``PCA``. With this many rows and few components,
        ``svd_solver='auto'`` is expected to pick the randomized solver, so
        a fixed seed is needed for repeatable projections. ``None`` (the
        default) keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. Note that this order differs
        from ``get_spam_with_pca``, which returns both feature matrices
        first.
    """
    # NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and later
    # removed; fetch_openml is the documented replacement when upgrading.
    mnist = fetch_mldata('MNIST original', data_home=data_home)
    mnist_X = mnist.data

    raw_train = mnist_X[:60000]
    raw_test = mnist_X[-10000:]

    # Fit on the training rows only so nothing from the test split
    # influences the learned projection.
    pca = PCA(n_components=ncomp, random_state=random_state)
    pca.fit(raw_train)

    mn_X_train = pca.transform(raw_train)
    mn_y_train = mnist.target[:60000]

    mn_X_test = pca.transform(raw_test)
    mn_y_test = mnist.target[-10000:]

    return (mn_X_train, mn_y_train, mn_X_test, mn_y_test)

In [3]:
# Build the MNIST train/test split reduced to 5 principal components.
# The helper returns (X_train, y_train, X_test, y_test) in that order.
(mn_X_train,mn_y_train,mn_X_test,mn_y_test) = get_mnist_with_pca(5)

In [4]:
# L2-penalised logistic regression (L-BFGS solver, at most 100 iterations)
# fitted on the current PCA-reduced MNIST training split.
logmodel = LogisticRegression(penalty='l2',verbose=1,solver = 'lbfgs',max_iter=100)
logmodel.fit(mn_X_train, mn_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",logmodel.score(mn_X_train,mn_y_train)
print "Testing accuracy: ",logmodel.score(mn_X_test,mn_y_test)

Training accuracy:  0.64365
Testing accuracy:  0.6529


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.0s finished


In [5]:
# Decision tree limited to depth 32, requiring at least 20 samples to split a
# node and at least 10 samples in every leaf.
dtmodel = DecisionTreeClassifier(max_depth=32,min_samples_split=20,min_samples_leaf=10)
dtmodel.fit(mn_X_train,mn_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",dtmodel.score(mn_X_train,mn_y_train)
print "Testing accuracy: ",dtmodel.score(mn_X_test,mn_y_test)

Training accuracy:  0.8035666666666667
Testing accuracy:  0.7163


In [6]:
# Rebuild the MNIST split with 20 principal components; this rebinds the same
# variable names, replacing the 5-component arrays used above.
(mn_X_train,mn_y_train,mn_X_test,mn_y_test) = get_mnist_with_pca(20)

In [7]:
# Same logistic-regression configuration as before, refitted on whatever MNIST
# split is currently bound to mn_X_train / mn_y_train.
logmodel = LogisticRegression(penalty='l2',verbose=1,solver = 'lbfgs',max_iter=100)
logmodel.fit(mn_X_train, mn_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",logmodel.score(mn_X_train,mn_y_train)
print "Testing accuracy: ",logmodel.score(mn_X_test,mn_y_test)

Training accuracy:  0.8607166666666667
Testing accuracy:  0.8687


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.0s finished


In [8]:
# Same constrained decision tree as before (depth <= 32, >= 20 samples to
# split, >= 10 samples per leaf), refitted on the current MNIST split.
dtmodel = DecisionTreeClassifier(max_depth=32,min_samples_split=20,min_samples_leaf=10)
dtmodel.fit(mn_X_train,mn_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",dtmodel.score(mn_X_train,mn_y_train)
print "Testing accuracy: ",dtmodel.score(mn_X_test,mn_y_test)

Training accuracy:  0.9024333333333333
Testing accuracy:  0.8439


In [9]:
# Rebind all four MNIST split names to None, dropping this notebook's
# references to the arrays (presumably to free memory before the next section).
mn_X_test = mn_X_train = mn_y_test = mn_y_train = None

# SPAMBASE

In [10]:
def load_spam_data(path='/Users/sasankauppu/Desktop/Data Mining CS6220/DataMining/spambase/spambase.data'):
    """Read a comma-separated numeric data file into features and labels.

    Every non-blank row is parsed as floats; the last column is returned as
    the label vector and all preceding columns as the feature matrix.

    Parameters
    ----------
    path : str, optional
        Location of the data file. Defaults to the path originally
        hard-coded in this notebook so existing zero-argument calls keep
        working.

    Returns
    -------
    X : numpy.ndarray of float
        One row per record, all columns except the last.
    y : numpy.ndarray of float
        The last column of every record.
    """
    # `with` guarantees the handle is closed even if reading raises.
    with open(path) as f:
        # csv.reader yields an empty list for blank lines (e.g. a trailing
        # newline at the end of the file); skip those so the column slicing
        # and float conversion below do not fail.
        rows = [row for row in csv.reader(f) if row]

    # Builtin `float` is what the deprecated `np.float` alias pointed to; the
    # alias itself no longer exists in recent NumPy releases.
    X = np.array([row[:-1] for row in rows]).astype(float)
    y = np.array([row[-1] for row in rows]).astype(float)

    return X, y

In [11]:
def get_spam_with_pca(ncomp, random_state=None, train_frac=0.75):
    """Shuffle the spam data, split it, and project it with PCA.

    The rows returned by ``load_spam_data()`` are randomly permuted, the
    first ``train_frac`` share becomes the training split and the remainder
    the test split. PCA is fitted on the training rows only and the same
    fitted projection is then applied to the test rows.

    Parameters
    ----------
    ncomp : int
        Number of principal components to keep.
    random_state : int or None, optional
        Seed for the shuffle (also forwarded to ``PCA``). Pass the same
        value across calls to compare different ``ncomp`` settings on the
        same train/test split. ``None`` (the default) keeps the previous
        behaviour of drawing from NumPy's global random state.
    train_frac : float, optional
        Fraction of rows placed in the training split; must lie strictly
        between 0 and 1. Defaults to the original 0.75.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. Note that this order differs
        from ``get_mnist_with_pca``, which returns ``(X_train, y_train,
        X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``train_frac`` is not strictly between 0 and 1.
    """
    if not 0 < train_frac < 1:
        raise ValueError("train_frac must be strictly between 0 and 1")

    spam_X, spam_y = load_spam_data()

    # Without a seed, fall back to the module-level generator exactly as the
    # original code did; with a seed, use a private generator so the split is
    # repeatable and the global state is left untouched.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    order = rng.permutation(len(spam_X))
    spam_X, spam_y = spam_X[order], spam_y[order]

    split = int(len(spam_X) * train_frac)
    raw_train, raw_test = spam_X[:split], spam_X[split:]

    # Fit on the training rows only so nothing from the test split
    # influences the learned projection.
    pca = PCA(n_components=ncomp, random_state=random_state)
    pca.fit(raw_train)

    spam_X_train = pca.transform(raw_train)
    spam_X_test = pca.transform(raw_test)
    spam_y_train = spam_y[:split]
    spam_y_test = spam_y[split:]

    return (spam_X_train, spam_X_test, spam_y_train, spam_y_test)


In [12]:
# Build the spam train/test split reduced to 5 principal components.
# The helper returns (X_train, X_test, y_train, y_test) in that order.
(spam_X_train,spam_X_test,spam_y_train,spam_y_test) = get_spam_with_pca(5)

In [13]:
# L2-penalised logistic regression (L-BFGS solver, at most 100 iterations)
# fitted on the current PCA-reduced spam training split.
logmodel = LogisticRegression(penalty='l2',verbose=1,solver = 'lbfgs',max_iter=100)
logmodel.fit(spam_X_train, spam_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",logmodel.score(spam_X_train,spam_y_train)
print "Testing accuracy: ",logmodel.score(spam_X_test,spam_y_test)

Training accuracy:  0.7930434782608695
Testing accuracy:  0.792354474370113


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [14]:
# Decision tree with the library's default hyperparameters (no depth or
# leaf-size limits are set here, unlike the MNIST cells).
dtmodel = DecisionTreeClassifier()
dtmodel.fit(spam_X_train,spam_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",dtmodel.score(spam_X_train,spam_y_train)
print "Testing accuracy: ",dtmodel.score(spam_X_test,spam_y_test)

Training accuracy:  0.9991304347826087
Testing accuracy:  0.8635968722849696


In [32]:
# Rebuild the spam split with 20 principal components. The helper reshuffles
# the rows on every call, so unless NumPy's global seed is fixed this is a
# different train/test partition from the one used for 5 components.
(spam_X_train,spam_X_test,spam_y_train,spam_y_test) = get_spam_with_pca(20)

In [33]:
# Same logistic-regression configuration as before, refitted on whatever spam
# split is currently bound to spam_X_train / spam_y_train.
logmodel = LogisticRegression(penalty='l2',verbose=1,solver = 'lbfgs',max_iter=100)
logmodel.fit(spam_X_train, spam_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",logmodel.score(spam_X_train,spam_y_train)
print "Testing accuracy: ",logmodel.score(spam_X_test,spam_y_test)

Training accuracy:  0.8994202898550725
Testing accuracy:  0.895742832319722


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [34]:
# Default-hyperparameter decision tree, refitted on the current spam split.
dtmodel = DecisionTreeClassifier()
dtmodel.fit(spam_X_train,spam_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",dtmodel.score(spam_X_train,spam_y_train)
print "Testing accuracy: ",dtmodel.score(spam_X_test,spam_y_test)

Training accuracy:  0.9994202898550725
Testing accuracy:  0.8774978279756733


In [35]:
# Rebuild the spam split with 10 principal components. As with the earlier
# calls, the rows are reshuffled, so the partition differs from previous runs
# unless NumPy's global seed is fixed.
(spam_X_train,spam_X_test,spam_y_train,spam_y_test) = get_spam_with_pca(10)

In [36]:
# Same logistic-regression configuration as before, refitted on whatever spam
# split is currently bound to spam_X_train / spam_y_train.
logmodel = LogisticRegression(penalty='l2',verbose=1,solver = 'lbfgs',max_iter=100)
logmodel.fit(spam_X_train, spam_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",logmodel.score(spam_X_train,spam_y_train)
print "Testing accuracy: ",logmodel.score(spam_X_test,spam_y_test)

Training accuracy:  0.8539130434782609
Testing accuracy:  0.8496959165942659


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [37]:
# Default-hyperparameter decision tree, refitted on the current spam split.
dtmodel = DecisionTreeClassifier()
dtmodel.fit(spam_X_train,spam_y_train)

# Compare the score on the data the model was fitted on with the held-out split.
print "Training accuracy: ",dtmodel.score(spam_X_train,spam_y_train)
print "Testing accuracy: ",dtmodel.score(spam_X_test,spam_y_test)

Training accuracy:  0.9994202898550725
Testing accuracy:  0.8688097306689835


In [63]:
# Remove the four spam split names from the namespace (raises NameError if
# any of them is not currently defined, e.g. when this cell is run twice).
del spam_X_test, spam_X_train, spam_y_test, spam_y_train