In [1]:
from sklearn.datasets import fetch_mldata
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
from sklearn.utils import shuffle
%matplotlib inline

# 20NG

In [4]:
def get_ng_fs(chisq=True):
    
    ng = fetch_20newsgroups(subset='all')
    ng_X = ng.data
    ng_y = ng.target
    
    del(ng)
    
    count_vect = CountVectorizer(stop_words="english",min_df=3,max_df=0.5)
    ng_X = count_vect.fit_transform(ng_X)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    ng_X = tfidf_transformer.fit_transform(ng_X)
    
    
    ng_X_train,ng_X_test,ng_y_train,ng_y_test = train_test_split(ng_X,ng_y,test_size=0.25,random_state=42)
    

    if(chisq):
        mod = SelectKBest(chi2, k=200)
    else:
        mod = SelectKBest(mutual_info_classif, k=200)
    
    
    ng_X_train = mod.fit_transform(ng_X_train, ng_y_train)
    
    
    mgs = mod.get_support()
    at=[]
    for j in range(len(mgs)):
        if(mgs[j]):
            at.append(j)
    
    ng_X_test=ng_X_test.todense()
    ng_X_test=ng_X_test[:,at]

    return (ng_X_train,ng_y_train,ng_X_test,ng_y_test)

In [5]:
(ng_X_train,ng_y_train,ng_X_test,ng_y_test)=get_ng_fs(True)

In [8]:
logmodel = LogisticRegression(penalty='l2',verbose=1,solver = 'lbfgs',max_iter=100)
logmodel.fit(ng_X_train, ng_y_train)

print "Training accuracy: ",logmodel.score(ng_X_train,ng_y_train)
print "Testing accuracy: ",logmodel.score(ng_X_test,ng_y_test)

Training accuracy:  0.6709353332389981
Testing accuracy:  0.6419779286926995


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.9s finished


In [9]:
dtmodel = DecisionTreeClassifier()
dtmodel.fit(ng_X_train, ng_y_train)

print "Training accuracy: ",dtmodel.score(ng_X_train,ng_y_train)
print "Testing accuracy: ",dtmodel.score(ng_X_test,ng_y_test)

Training accuracy:  0.8896985991226829
Testing accuracy:  0.6025042444821732


In [10]:
del(ng_X_train)
del(ng_y_train)
del(ng_X_test)
del(ng_y_test)

In [11]:
(ng_X_train,ng_y_train,ng_X_test,ng_y_test)=get_ng_fs(False)

In [12]:
logmodel = LogisticRegression(penalty='l2',verbose=1,solver = 'lbfgs',max_iter=100)
logmodel.fit(ng_X_train, ng_y_train)

print "Training accuracy: ",logmodel.score(ng_X_train,ng_y_train)
print "Testing accuracy: ",logmodel.score(ng_X_test,ng_y_test)

Training accuracy:  0.40611291920192444
Testing accuracy:  0.38115449915110355


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.9s finished


In [13]:
dtmodel = DecisionTreeClassifier()
dtmodel.fit(ng_X_train, ng_y_train)

print "Training accuracy: ",dtmodel.score(ng_X_train,ng_y_train)
print "Testing accuracy: ",dtmodel.score(ng_X_test,ng_y_test)

Training accuracy:  0.9997877458610442
Testing accuracy:  0.2807724957555178


In [14]:
del(ng_X_train)
del(ng_y_train)
del(ng_X_test)
del(ng_y_test)