In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [2]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)




In [3]:
num_test = len(newsgroups_test.target)
test_data, test_labels = newsgroups_test.data[num_test/2:], newsgroups_test.target[num_test/2:]
dev_data, dev_labels = newsgroups_test.data[:num_test/2], newsgroups_test.target[:num_test/2]
train_data, train_labels = newsgroups_train.data, newsgroups_train.target

print 'training label shape:', train_labels.shape
print 'test label shape:', test_labels.shape
print 'dev label shape:', dev_labels.shape
print 'labels names:', newsgroups_train.target_names

training label shape: (2034,)
test label shape: (677,)
dev label shape: (676,)
labels names: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']


In [4]:
def P1(num_examples = 5):
    for i in range(num_examples):
        print "Training Example:\n", i
        print "\nLabel of Message:\n", newsgroups_train.target_names[train_labels[i]]
        print "\nText of Message:\n", train_data[i]
        print "\n- - - - -\n"
P1(5)

Training Example:
0

Label of Message:
comp.graphics

Text of Message:
Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych

- - - - -

Training Example:
1

Label of Message:
talk.religion.misc

Text of Message:


Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 

In [18]:
def P2():
    print "a)"
    cv = CountVectorizer()
    cv_matrix = cv.fit_transform(train_data)
    print "Size of vocab : ",len(cv.get_feature_names())
    print "Average number of non-zero features", float((cv_matrix.nnz)/len(train_data))
    print "Fracion of non zero entries in the matrix ",float(cv_matrix.nnz)/(cv_matrix.shape[0] * cv_matrix.shape[1])
    print "b)"
    print "0th and last feature strings are ",cv.get_feature_names()[0],"and ",cv.get_feature_names()[-1]
    print "c)"
    ncv = CountVectorizer()
    ncv.fit(["atheism", "graphics", "space", "religion"])
    ncv_new_matrix =  ncv.transform(train_data)
    print "Shape of the training vector with vocab of 4 words",ncv_new_matrix.shape
    print "Average number of non-zero features per example:", float(ncv_new_matrix.nnz) / len(train_data)
    print "d)"
    cv = CountVectorizer(analyzer='char',ngram_range=(2,3))
    cv.fit(train_data)
    print "Size of vocabulary using analyzer = char",len(cv.get_feature_names())
    # e
    print "\ne."
    cv = CountVectorizer(min_df=10)
    cv.fit(train_data)
    print "Size of vocabulary with min_df = 10:", len(cv.get_feature_names())
    print "f"
    cv= CountVectorizer()
    cv.fit(train_data)
    train_vocab = set(cv.get_feature_names())
    cv.fit(dev_data)
    dev_vocab = set(cv.get_feature_names())
    print "Fraction of words in dev set missing from vocabulary:", float(len(dev_vocab - train_vocab)) / len(dev_vocab)
    
    
    
P2()

a)
Size of vocab :  26879
Average number of non-zero features 96.0
Fracion of non zero entries in the matrix  0.00359782722696
b)
0th and last feature strings are  00 and  zyxel
c)
Shape of the training vector with vocab of 4 words (2034, 4)
Average number of non-zero features per example: 0.268436578171
d)
Size of vocabulary using analyzer = char 35478

e.
Size of vocabulary with min_df = 10: 3064
f
Fraction of words in dev set missing from vocabulary: 0.247876400345


In [23]:
def P3():
    cv = CountVectorizer()
    cv.fit(train_data)
    
    train_data_matrix = cv.transform(train_data)
    dev_data_matrix = cv.transform(dev_data)
    
    
    ##KNearest Neighbours
    num = range(1,len(dev_data)+1)
    num_neighbours = {'n_neighbors':num}
    knn_clf =  KNeighborsClassifier()
    clf = GridSearchCV(knn_clf, num_neighbours,n_jobs=3)
    clf.fit(train_data_matrix,train_labels)
    print "Optimal k for K-Nearest Neighbours : ", clf.best_params_
    
    knn_clf = KNeighborsClassifier(n_neighbors=clf.best_params_['n_neighbors'])
    knn_clf.fit(train_data_matrix,train_labels)
    knn_clf_preds = knn_clf.predict(dev_data_matrix)
    print 'KNN f1 score for k = {0} is {1}'.format(clf.best_params_['n_neighbors'], metrics.f1_score(y_true = dev_labels, y_pred = knn_clf_preds, average="macro"))
    
    
    ##Multinomial NB
    ##Find optimal alpha 
    alphas = {'alpha':[0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
    mnb_clf = MultinomialNB()
    clf =  GridSearchCV(mnb_clf,alphas,n_jobs=3)
    clf.fit(train_data_matrix,train_labels)
    print "Optimal Value for MultinomialNB =", clf.best_params_
    
    #Calculate F1 score 
    mnb_clf = MultinomialNB(alpha=clf.best_params_)
    mnb_clf.fit(train_data_matrix,train_labels)
    mnb_clf_preds =  mnb_clf.predict(dev_data_matrix)
    print 'MultinomialNB f1 score for alpha = {0} is {1}'.format(clf.best_params_['alpha'], metrics.f1_score(y_true = dev_labels, y_pred = mnb_clf_preds, average="macro"))
    
    
    ### Logistic Regression
    # Logistic Regression
    listofCs = [0.0001, 0.001, 0.01, 0.1, 1, 2, 5, 10]
    Cs = {'C': listofCs}

    # Calculating optimal C for Logistic Regression
    logit_clf = LogisticRegression(penalty = 'l2')
    clf = GridSearchCV(logit_clf, Cs,n_jobs=3)
    clf.fit(train_data_matrix, train_labels)
    print '\nOptimal C for Logistic Regression =', clf.best_params_

    # Calculating F1 score for optimal C
    logit_clf = LogisticRegression(penalty = 'l2', C = clf.best_params_['C'])
    logit_clf.fit(train_data_matrix, train_labels)
    logit_clf_preds = logit_clf.predict(dev_data_matrix)
    print 'Logistic Regression f1 Score for C = {0} is {1}'.format(clf.best_params_['C'], metrics.f1_score(y_true = dev_labels, y_pred = logit_clf_preds, average="macro"))

    
    # Printing sum of squared weights for each class
    print " "
    for c in listofCs:
        logit_clf = LogisticRegression(penalty = 'l2', C = c)
        logit_clf.fit(train_data_matrix, train_labels)
        logit_clf_preds = logit_clf.predict(dev_data_matrix)
     
        print "C setting:", c, ";\tSum of Squared weights:\t", np.sum(logit_clf.coef_**2, axis=1)
P3()

Optimal k for K-Nearest Neighbours :  {'n_neighbors': 151}
KNN f1 score for k = 151 is 0.402814934441
Optimal Value for MultinomialNB = {'alpha': 0.01}


TypeError: unsupported operand type(s) for +: 'float' and 'dict'

In [30]:
def P4(cv,title):
    print title
    #cv= CountVectorizer()
    cv.fit(train_data)
    train_data_matrix =  cv.transform(train_data)
    dev_data_matrix =  cv.transform(dev_data)
    num_of_features =5
    feature_names =  np.array(cv.get_feature_names())
    
    logit_clf = LogisticRegression(penalty='l2',C=0.1)
    logit_clf.fit(train_data_matrix,train_labels)
    
    sortedIndices = np.argsort(np.absolute(logit_clf.coef_),axis=-1)
    print "> Top 5 features per class"
    for i in range(logit_clf.coef_.shape[0]):
        print newsgroups_train.target_names[i]
        for j in sortedIndices[i,-num_of_features:logit_clf.coef_.shape[1]]:
                print "\t ",feature_names[j]
    table = newsgroups_train.target_names
    table = np.hstack(('ROW/COL', table))
    
    print "> Table: 20 rows X 4 columns"
    for i in range(logit_clf.coef_.shape[0]):
        for j in sortedIndices[i, -num_of_features:logit_clf.coef_.shape[1]]:
            rounded_weights = np.round(logit_clf.coef_[:, j], 5)
            row = np.hstack((cv.get_feature_names()[j], rounded_weights))
            table = np.vstack((table, row))
    np.set_printoptions(linewidth=100, formatter={'all':lambda x: '{0}'.format(str(x).rjust(18))})
    print table
    print " "

                
# With default CountVectorizer()
cv = CountVectorizer()
P4(cv, ">> With default CountVectorizer")

# With bigram features
cv = CountVectorizer(ngram_range=(2, 2))
P4(cv, ">> With bigram features")

>> With default CountVectorizer
> Top 5 features per class
alt.atheism
	  atheists
	  bobby
	  religion
	  atheism
	  space
comp.graphics
	  computer
	  file
	  image
	  space
	  graphics
sci.space
	  god
	  nasa
	  orbit
	  graphics
	  space
talk.religion.misc
	  order
	  blood
	  christians
	  christian
	  space
> Table: 20 rows X 4 columns
[[           ROW/COL        alt.atheism      comp.graphics          sci.space talk.religion.misc]
 [          atheists            0.46157           -0.07942           -0.15839           -0.29528]
 [             bobby            0.47808            -0.1204           -0.16787           -0.22783]
 [          religion            0.49395           -0.29882           -0.39323            0.00391]
 [           atheism            0.49557           -0.20723           -0.19996           -0.26777]
 [             space           -0.65519           -0.71401            1.25869           -0.59025]
 [          computer           -0.03972            0.55892         

In [39]:
def empty_preprocessor(s):
    return s
def better_preprocessor(s):
    max_word_length = 6
    s = s.lower()
    s = re.sub('r\d{3,}'," numbers ",s)
    s= re.sub('r\W'," ",s)
    s= re.sub(r'_'," ",s)
    
    wrds = s.split()
    new_s = " "
    for wrd in wrds:
        if(len(wrd) >  max_word_length):
            wrd = wrd[:max_word_length]
        new_s = new_s + " "+wrd
    return new_s
def P5(title,t_data,t_labels,d_data,d_labels,pre_processor,stop_words):
    print "\n ", title
    cv = CountVectorizer(preprocessor=pre_processor,stop_words=stop_words)
    cv.fit(t_data)
    
    t_data_matrix =  cv.transform(t_data)
    d_data_matrix =  cv.transform(d_data)
    
    logit_clf =  LogisticRegression()
    logit_clf.fit(t_data_matrix,t_labels)
    
    dict_size =  len(cv.get_feature_names())
    print "\t Dictionary size :" , dict_size
    
    logit_clf_preds =  logit_clf.predict(d_data_matrix)
    f1= metrics.f1_score(y_true=d_labels,y_pred=logit_clf_preds,average =  'weighted')
    print '\tf1 Score', f1
    
    return (dict_size,f1)

### STUDENT END ###
title = ">> Without preprocessing"
stop_words = None
no_preprocess = P5(title, train_data, train_labels, dev_data, dev_labels, empty_preprocessor, stop_words)

title = ">> With preprocessing and without stop words"
stop_words = None
preprocess_no_stopwords = P5(title, train_data, train_labels, dev_data, dev_labels, better_preprocessor, stop_words)
print "\tReduction in dictionary size (from without preprocessing):", no_preprocess[0] - preprocess_no_stopwords[0]
print "\tImprovement in f1-score", preprocess_no_stopwords[1] - no_preprocess[1]

title = ">> With preprocessing and stop words"
stop_words = 'english'
preprocess_stopwords = P5(title, train_data, train_labels, dev_data, dev_labels, better_preprocessor, stop_words)
print "\tReduction in dictionary size (from without preprocessing):", no_preprocess[0] - preprocess_stopwords[0]
print "\tImprovement in f1-score", preprocess_stopwords[1] - no_preprocess[1]
    


  >> Without preprocessing
	 Dictionary size : 33291
	f1 Score 0.702334008756

  >> With preprocessing and without stop words
	 Dictionary size : 19781
	f1 Score 0.710583246469
	Reduction in dictionary size (from without preprocessing): 13510
	Improvement in f1-score 0.00824923771394

  >> With preprocessing and stop words
	 Dictionary size : 19554
	f1 Score 0.724932287752
	Reduction in dictionary size (from without preprocessing): 13737
	Improvement in f1-score 0.0225982789962
