In [2]:
# Keeps print() usable as a function when the notebook runs under Python 2.
from __future__ import print_function

import numpy as np

from sklearn import __version__ as sklearn_version
# Record the scikit-learn version the outputs below were produced with.
print('Sklearn version:', sklearn_version)

Sklearn version: 0.19.1


# The data

The 20 newsgroups dataset comprises around 18000 newsgroup posts on 20 topics, split into two subsets: one for training (or development) and the other for testing (or performance evaluation). The split between the train and test sets is based on whether a message was posted before or after a specific date.


In [3]:
from sklearn.datasets import fetch_20newsgroups

# Work with four of the newsgroups only.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training subset; headers, footers and quoted replies are stripped from each
# post, and the shuffle is seeded so the document order is reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

# Display the label names (position in this list == integer target value).
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [4]:
# Sample data: show one training document together with ITS OWN label.
# data and target are parallel sequences, so both must be read at the same
# index (the document was previously shown with the label of another sample).
sample_idx = 1
print(twenty_train.data[sample_idx])
print('---------------')
print('Target: ', twenty_train.target[sample_idx])


Hi,

	I have a problem, I hope some of the 'gurus' can help me solve.

	Background of the problem:
	I have a rectangular mesh in the uv domain, i.e  the mesh is a 
	mapping of a 3d Bezier patch into 2d. The area in this domain
	which is inside a trimming loop had to be rendered. The trimming
	loop is a set of 2d Bezier curve segments.
	For the sake of notation: the mesh is made up of cells.

	My problem is this :
	The trimming area has to be split up into individual smaller
	cells bounded by the trimming curve segments. If a cell
	is wholly inside the area...then it is output as a whole ,
	else it is trivially rejected. 

	Does any body know how thiss can be done, or is there any algo. 
	somewhere for doing this.

	Any help would be appreciated.

	Thanks, 
	Ani.
---------------
Target:  1


In [5]:
# Turn the raw posts into a document-term count matrix:
# tokenize, drop English stop words and prune the vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,           # lower document-frequency cut-off
    max_df=0.95,        # upper document-frequency cut-off
    max_features=5000,  # cap on the vocabulary size
)

# Learn the vocabulary on the training posts and count terms in one pass.
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 5000)

In [6]:
# Non-zero entries of the first document (row 0), then of the first
# vocabulary term (column 0), one view after the other.
print(X_train_counts[0, :], X_train_counts[:, 0], sep='\n')


  (0, 2866)	1
  (0, 238)	1
  (0, 4522)	1
  (0, 2058)	1
  (0, 1123)	1
  (0, 3867)	1
  (0, 1543)	1
  (0, 3385)	1
  (0, 2197)	1
  (0, 1094)	1
  (0, 2643)	1
  (0, 1865)	1
  (0, 2237)	1
  (0, 1795)	2
  (0, 4520)	1
  (0, 2251)	1
  (0, 1090)	1
  (0, 4744)	1
  (0, 3276)	1
  (0, 357)	1
  (0, 3273)	1
  (0, 4299)	1
  (0, 4869)	1
  (0, 2014)	1
  (0, 2550)	1
  (0, 1445)	1
  (232, 0)	2
  (272, 0)	1
  (282, 0)	1
  (400, 0)	1
  (433, 0)	2
  (581, 0)	2
  (588, 0)	1
  (766, 0)	1
  (768, 0)	2
  (837, 0)	3
  (844, 0)	1
  (859, 0)	1
  (880, 0)	1
  (1030, 0)	1
  (1056, 0)	6
  (1057, 0)	2
  (1263, 0)	1
  (1475, 0)	1
  (1665, 0)	16
  (1795, 0)	1
  (1802, 0)	1
  (1833, 0)	1
  (1890, 0)	2
  (2069, 0)	1
  (2144, 0)	1


In [7]:
# From raw occurrence counts to tf-idf weights.
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the transformer on the training counts first ...
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)

# ... then apply it to the same matrix to get the training features.
X_train_tf = tfidf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 5000)

In [8]:
# Non-zero tf-idf weights of the first document (row 0), then of the first
# vocabulary term (column 0), one view after the other.
print(X_train_tf[0, :], X_train_tf[:, 0], sep='\n')


  (0, 1445)	0.09984961017366438
  (0, 2550)	0.09208756192014825
  (0, 2014)	0.10905059471954383
  (0, 4869)	0.11240915977454444
  (0, 4299)	0.17223237883101852
  (0, 3273)	0.1894979846175662
  (0, 357)	0.19614730458851817
  (0, 3276)	0.23935810161052842
  (0, 4744)	0.2426971720743241
  (0, 1090)	0.18536764690499014
  (0, 2251)	0.2815174602044267
  (0, 4520)	0.23935810161052842
  (0, 1795)	0.32667393651341364
  (0, 2237)	0.2178827886891891
  (0, 1865)	0.1823562906613318
  (0, 2643)	0.09443126584369482
  (0, 1094)	0.25039793047338804
  (0, 2197)	0.22599179670408007
  (0, 3385)	0.27295430367067186
  (0, 1543)	0.16378061599510793
  (0, 3867)	0.16560834723130236
  (0, 1123)	0.1576109272623592
  (0, 2058)	0.14480748228447815
  (0, 4522)	0.12653363760378736
  (0, 238)	0.17006982914503366
  (0, 2866)	0.1903802097234869
  (232, 0)	0.1626733015721039
  (272, 0)	0.03960458829975349
  (282, 0)	0.08301434712372262
  (400, 0)	0.005277364589626779
  (433, 0)	0.005964993735391251
  (581, 0)	0.15070465

## First basic model 

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Build a multinomial Naive Bayes classifier and train it on the tf-idf
# features; fit() hands back the fitted estimator, which is kept as `clf`.
clf = MultinomialNB().fit(
    X=X_train_tf,
    y=twenty_train.target,
)


In [10]:
# Evaluate the classifier on the test subset.
from sklearn.metrics import accuracy_score

# Test subset, loaded with the same categories, stripping options and seed
# that were used for the training subset.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

# Reuse the vectorizer and tf-idf transformer fitted on the training data:
# only transform() is called here, nothing is re-fitted on the test posts.
X_test_counts = tf_vectorizer.transform(twenty_test.data)
X_test_tf = tfidf_transformer.transform(X_test_counts)

# Predicted class index for every test document.
predicted = clf.predict(X_test_tf)

# Share of test documents whose predicted class matches the true one.
print('Accuracy test: ', accuracy_score(y_true=twenty_test.target, y_pred=predicted))


Accuracy test:  0.7989347536617842


In [13]:
# Two unseen example documents to classify.
docs_new = [
    'God loves GPU',
    'OpenGL on the GPU is fast',
]

def score_text(text_list):
    '''
    Predict the class index of each document in ``text_list``.

    The documents go through the same pipeline as the training data: the
    fitted ``tf_vectorizer`` turns them into counts, the fitted
    ``tfidf_transformer`` re-weights the counts, and ``clf`` predicts one
    class index per document.

    Parameters
    ----------
    text_list : iterable of str
        Raw text documents to classify.

    Returns
    -------
    The output of ``clf.predict``: one predicted class index per input
    document, in input order.
    '''
    # Use the caller's documents; the global `docs_new` was previously read
    # here, so the argument was silently ignored.
    X_new_counts = tf_vectorizer.transform(text_list)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    return clf.predict(X_new_tfidf)


# Pair each new document with its predicted class index and print the
# corresponding newsgroup name.
for doc, category in zip(docs_new, score_text(docs_new)):
    label = twenty_train.target_names[category]
    print('%r => %s' % (doc, label))

'God loves GPU' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [14]:
# Display the label names; a predicted class index is decoded by indexing
# into this list.
twenty_train.target_names


['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']