# Text Classification
## This notebook outlines the usage of NLP Feature extraction (CountVectorizer, TfidfVectorizer) in classification of text documents

### Import all the necessary libraries

In [48]:
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.metrics import accuracy_score

###  the entire 20 categories

In [11]:
#data = fetch_20newsgroups(subset='train')


In [13]:
# pprint(list(data.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [2]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [3]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


In [4]:
data=fetch_20newsgroups(subset='train')

In [5]:
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [44]:
data_y=fetch_20newsgroups(subset='test')

### Define a pipeline combining a text feature extractor with a simple classifier

In [50]:
pipeline=Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('clf',SGDClassifier())
])

### Specify parameter grid
- 'vect__max_df': (0.5, 0.75, 1.0)
- 'vect__max_features': (None, 5000, 10000, 50000)
- 'vect__ngram_range': ((1, 1), (1, 2))
- 'tfidf__use_idf': (True, False)
- 'tfidf__norm': ('l1', 'l2')
- 'clf__max_iter': (20,)
- 'clf__alpha': (0.00001, 0.000001)
- 'clf__penalty': ('l2', 'elasticnet')
- 'clf__max_iter': (10, 50, 80)

In [51]:
parameters={
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'clf__max_iter': (20,)
}

### Find the best parameters for both the feature extraction and the classifier

### Build a GridSearch with the pipeline and parameter grid

In [52]:
grid=GridSearchCV(pipeline,parameters,cv=5,n_jobs=-1,verbose=1)

### Start the grid search

In [53]:
grid.fit(data.data, data.target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf', SGDClassifier())]),
             n_jobs=-1,
             param_grid={'clf__max_iter': (20,),
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

### Best Score

In [54]:
grid.best_score_

0.9295563153533196

In [55]:
y_pred=grid.predict(data_y.data)

In [56]:
accuracy_score(data_y.target,y_pred)

0.8588688263409453

### Best Parameter

In [18]:
grid.best_params_

{'clf__max_iter': 20, 'vect__max_df': 1.0, 'vect__ngram_range': (1, 2)}

### Choose the best model

In [14]:
grid.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(max_iter=20))])

### 2) Logistic regression


In [27]:
pipeline=Pipeline([
    ('vect',CountVectorizer()),
    ('lr',LogisticRegression())])

In [28]:
parameters={
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'lr__max_iter': (50,)
                   }

In [29]:
grid=GridSearchCV(pipeline,parameters,cv=5,n_jobs=-1,verbose=1)

### Works like forever

In [30]:
grid.fit(data.data, data.target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


KeyboardInterrupt: 

In [31]:
pipeline_lr=Pipeline([
    ('tfidf',TfidfTransformer()),
    ('lr',LogisticRegression())])

In [35]:
parameters={
    'lr__solver': ('newton-cg','sag','saga', 'lbfgs' ),
    'lr__max_iter': (80,)
                   }

In [36]:
grid=GridSearchCV(pipeline_lr,parameters,cv=5,n_jobs=-1,verbose=1)

In [43]:
grid.fit(data.data, data.target.reshape(-1,1))

Fitting 5 folds for each of 4 candidates, totalling 20 fits




ValueError: Expected 2D array, got 1D array instead:
array=["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 11\nNNTP-Posting-Host: carson.u.washington.edu\n\nA fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.\n\nGuy Kuo <guykuo@u.washington.edu>\n"
 'From: twillis@ec.ecn.purdue.edu (Thomas E Willis)\nSubject: PB questions...\nOrganization: Purdue University Engineering Computer Network\nDistribution: usa\nLines: 36\n\nwell folks, my mac plus finally gave up the ghost this weekend after\nstarting life as a 512k way back in 1985.  sooo, i\'m in the market for a\nnew machine a bit sooner than i intended to be...\n\ni\'m looking into picking up a powerbook 160 or maybe 180 and have a bunch\nof questions that (hopefully) somebody can answer:\n\n* does anybody know any dirt on when the next round of powerbook\nintroductions are expected?  i\'d heard the 185c was supposed to make an\nappearence "this summer" but haven\'t heard anymore on it - and since i\ndon\'t have access to macleak, i was wondering if anybody out there had\nmore info...\n\n* has anybody heard rumors about price drops to the powerbook line like the\nones the duo\'s just went through recently?\n\n* what\'s the impression of the display on the 180?  i could probably swing\na 180 if i got the 80Mb disk rather than the 120, but i don\'t really have\na feel for how much "better" the display is (yea, it looks great in the\nstore, but is that all "wow" or is it really that good?).  could i solicit\nsome opinions of people who use the 160 and 180 day-to-day on if its worth\ntaking the disk size and money hit to get the active display?  (i realize\nthis is a real subjective question, but i\'ve only played around with the\nmachines in a computer store breifly and figured the opinions of somebody\nwho actually uses the machine daily might prove helpful).\n\n* how well does hellcats perform?  ;)\n\nthanks a bunch in advance for any info - if you could email, i\'ll post a\nsummary (news reading time is at a premium with finals just around the\ncorner... :( )\n--\nTom Willis  \\  twillis@ecn.purdue.edu    \\    Purdue Electrical Engineering\n---------------------------------------------------------------------------\n"Convictions are more dangerous enemies of truth than lies."  - F. W.\nNietzsche\n'
 ...
 'From: westes@netcom.com (Will Estes)\nSubject: Mounting CPU Cooler in vertical case\nOrganization: Mail Group\nX-Newsreader: TIN [version 1.1 PL8]\nLines: 13\n\nI just installed a DX2-66 CPU in a clone motherboard, and tried mounting a CPU \ncooler on the chip.  After about 1/2 hour, the weight of the cooler was enough \nto dislodge the CPU from its mount.  It ended up bending a few pins\non the CPU, but luckily the power was not on yet.  I ended up\npressing the CPU deeply into its socket and then putting the CPU\ncooler back on.  So far so good.\n\nHave others had this problem?  How do you ensure that the weight of\nthe CPU fan and heatsink do not eventually work the CPU out of its\nsocket when mounting the motherboard in a vertical case?\n\n-- \nWill Estes\t\tInternet: westes@netcom.com\n'
 "From: steve@hcrlgw (Steven Collins)\nSubject: Re: Sphere from 4 points?\nOrganization: Central Research Lab. Hitachi, Ltd.\nLines: 27\nNntp-Posting-Host: hcrlgw\n\nIn article <1qkgbuINNs9n@shelley.u.washington.edu> bolson@carson.u.washington.edu (Edward Bolson) writes:\n>Boy, this will be embarassing if it is trivial or an FAQ:\n>\n>Given 4 points (non coplanar), how does one find the sphere, that is,\n>center and radius, exactly fitting those points?  I know how to do it\n>for a circle (from 3 points), but do not immediately see a \n>straightforward way to do it in 3-D.  I have checked some\n>geometry books, Graphics Gems, and Farin, but am still at a loss?\n>Please have mercy on me and provide the solution?  \n\nWouldn't this require a hyper-sphere.  In 3-space, 4 points over specifies\na sphere as far as I can see.  Unless that is you can prove that a point\nexists in 3-space that is equi-distant from the 4 points, and this may not\nnecessarily happen.\n\nCorrect me if I'm wrong (which I quite possibly am!)\n\nsteve\n---\n\n\n\n-- \n+---------------------------------------+--------------------------------+\n| Steven Collins\t\t\t| email: steve@crl.hitachi.co.jp |\n| Visiting Computer Graphics Researcher\t| phone: (0423)-23-1111 \t |\n| Hitachi Central Research Lab. Tokyo.\t| fax:   (0423)-27-7742\t\t |\n"
 "From: gunning@cco.caltech.edu (Kevin J. Gunning)\nSubject: stolen CBR900RR\nOrganization: California Institute of Technology, Pasadena\nLines: 12\nDistribution: usa\nNNTP-Posting-Host: alumni.caltech.edu\nSummary: see above\n\nStolen from Pasadena between 4:30 and 6:30 pm on 4/15.\n\nBlue and white Honda CBR900RR california plate KG CBR.   Serial number\nJH2SC281XPM100187, engine number 2101240.\n\nNo turn signals or mirrors, lights taped over for track riders session\nat Willow Springs tomorrow.  Guess I'll miss it.  :-(((\n\nHelp me find my baby!!!\n\nkjg\n\n"].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [57]:
pipeline_lr=Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('lr',LogisticRegression())
])

In [58]:
parameters={
    'vect__ngram_range': ((1, 1), (1, 2)),
    'lr__solver': ('newton-cg','sag','saga'),
    'lr__max_iter': (80,)
                   }

In [59]:
grid=GridSearchCV(pipeline_lr,parameters,cv=5,n_jobs=-1,verbose=1)

In [60]:
grid.fit(data.data, data.target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('lr', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'lr__max_iter': (80,),
                         'lr__solver': ('newton-cg', 'sag', 'saga'),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [61]:
grid.best_score_

0.8953510378975509

In [62]:
y_pred=grid.predict(data_y.data)

In [63]:
accuracy_score(data_y.target,y_pred)

0.8207647371216145

In [53]:
grid.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('lr', LogisticRegression(max_iter=80, solver='saga'))])

### 3)Random forest Classifier

In [64]:
pipeline=Pipeline([
    ('vect',CountVectorizer()),
    ('rf',RandomForestClassifier())])

In [65]:
parameters={ 
    'rf__random_state': (42,),
    'rf__n_estimators' :(50, 70, 100)
                   }

In [66]:
grid=GridSearchCV(pipeline,parameters,cv=5,n_jobs=-1,verbose=1)

In [67]:
grid.fit(data.data, data.target)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'rf__n_estimators': (50, 70, 100),
                         'rf__random_state': (42,)},
             verbose=1)

In [68]:
grid.best_score_

0.841170398518746

In [69]:
y_pred=grid.predict(data_y.data)

In [70]:
accuracy_score(data_y.target,y_pred)

0.7663303239511418

In [59]:
pipeline=Pipeline([
    ('tfidf',TfidfTransformer()),
    ('rf',RandomForestClassifier())])


In [60]:
parameters={ 
    'rf__random_state': (42,),
    'rf__n_estimators' :(50, 70, 100)
                   }

In [61]:
grid=GridSearchCV(pipeline,parameters,cv=5,n_jobs=-1,verbose=1)

In [62]:
grid.fit(data.data, data.target)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




ValueError: Expected 2D array, got 1D array instead:
array=["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 11\nNNTP-Posting-Host: carson.u.washington.edu\n\nA fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.\n\nGuy Kuo <guykuo@u.washington.edu>\n"
 'From: twillis@ec.ecn.purdue.edu (Thomas E Willis)\nSubject: PB questions...\nOrganization: Purdue University Engineering Computer Network\nDistribution: usa\nLines: 36\n\nwell folks, my mac plus finally gave up the ghost this weekend after\nstarting life as a 512k way back in 1985.  sooo, i\'m in the market for a\nnew machine a bit sooner than i intended to be...\n\ni\'m looking into picking up a powerbook 160 or maybe 180 and have a bunch\nof questions that (hopefully) somebody can answer:\n\n* does anybody know any dirt on when the next round of powerbook\nintroductions are expected?  i\'d heard the 185c was supposed to make an\nappearence "this summer" but haven\'t heard anymore on it - and since i\ndon\'t have access to macleak, i was wondering if anybody out there had\nmore info...\n\n* has anybody heard rumors about price drops to the powerbook line like the\nones the duo\'s just went through recently?\n\n* what\'s the impression of the display on the 180?  i could probably swing\na 180 if i got the 80Mb disk rather than the 120, but i don\'t really have\na feel for how much "better" the display is (yea, it looks great in the\nstore, but is that all "wow" or is it really that good?).  could i solicit\nsome opinions of people who use the 160 and 180 day-to-day on if its worth\ntaking the disk size and money hit to get the active display?  (i realize\nthis is a real subjective question, but i\'ve only played around with the\nmachines in a computer store breifly and figured the opinions of somebody\nwho actually uses the machine daily might prove helpful).\n\n* how well does hellcats perform?  ;)\n\nthanks a bunch in advance for any info - if you could email, i\'ll post a\nsummary (news reading time is at a premium with finals just around the\ncorner... :( )\n--\nTom Willis  \\  twillis@ecn.purdue.edu    \\    Purdue Electrical Engineering\n---------------------------------------------------------------------------\n"Convictions are more dangerous enemies of truth than lies."  - F. W.\nNietzsche\n'
 ...
 'From: westes@netcom.com (Will Estes)\nSubject: Mounting CPU Cooler in vertical case\nOrganization: Mail Group\nX-Newsreader: TIN [version 1.1 PL8]\nLines: 13\n\nI just installed a DX2-66 CPU in a clone motherboard, and tried mounting a CPU \ncooler on the chip.  After about 1/2 hour, the weight of the cooler was enough \nto dislodge the CPU from its mount.  It ended up bending a few pins\non the CPU, but luckily the power was not on yet.  I ended up\npressing the CPU deeply into its socket and then putting the CPU\ncooler back on.  So far so good.\n\nHave others had this problem?  How do you ensure that the weight of\nthe CPU fan and heatsink do not eventually work the CPU out of its\nsocket when mounting the motherboard in a vertical case?\n\n-- \nWill Estes\t\tInternet: westes@netcom.com\n'
 "From: steve@hcrlgw (Steven Collins)\nSubject: Re: Sphere from 4 points?\nOrganization: Central Research Lab. Hitachi, Ltd.\nLines: 27\nNntp-Posting-Host: hcrlgw\n\nIn article <1qkgbuINNs9n@shelley.u.washington.edu> bolson@carson.u.washington.edu (Edward Bolson) writes:\n>Boy, this will be embarassing if it is trivial or an FAQ:\n>\n>Given 4 points (non coplanar), how does one find the sphere, that is,\n>center and radius, exactly fitting those points?  I know how to do it\n>for a circle (from 3 points), but do not immediately see a \n>straightforward way to do it in 3-D.  I have checked some\n>geometry books, Graphics Gems, and Farin, but am still at a loss?\n>Please have mercy on me and provide the solution?  \n\nWouldn't this require a hyper-sphere.  In 3-space, 4 points over specifies\na sphere as far as I can see.  Unless that is you can prove that a point\nexists in 3-space that is equi-distant from the 4 points, and this may not\nnecessarily happen.\n\nCorrect me if I'm wrong (which I quite possibly am!)\n\nsteve\n---\n\n\n\n-- \n+---------------------------------------+--------------------------------+\n| Steven Collins\t\t\t| email: steve@crl.hitachi.co.jp |\n| Visiting Computer Graphics Researcher\t| phone: (0423)-23-1111 \t |\n| Hitachi Central Research Lab. Tokyo.\t| fax:   (0423)-27-7742\t\t |\n"
 "From: gunning@cco.caltech.edu (Kevin J. Gunning)\nSubject: stolen CBR900RR\nOrganization: California Institute of Technology, Pasadena\nLines: 12\nDistribution: usa\nNNTP-Posting-Host: alumni.caltech.edu\nSummary: see above\n\nStolen from Pasadena between 4:30 and 6:30 pm on 4/15.\n\nBlue and white Honda CBR900RR california plate KG CBR.   Serial number\nJH2SC281XPM100187, engine number 2101240.\n\nNo turn signals or mirrors, lights taped over for track riders session\nat Willow Springs tomorrow.  Guess I'll miss it.  :-(((\n\nHelp me find my baby!!!\n\nkjg\n\n"].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [63]:
pipeline_lr=Pipeline([
    ('vect',CountVectorizer()),
    ('rf',RandomForestClassifier())
])

In [64]:
parameters={ 
    'rf__solver': ('auto', 'sqrt', 'log2')
    'rf__random_state': (42,),
    'rf__n_estimators' :(50, 70, 100)
                   }

In [65]:
grid=GridSearchCV(pipeline,parameters,cv=5,n_jobs=-1,verbose=1)

In [66]:
grid.fit(data.data, data.target)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




ValueError: Expected 2D array, got 1D array instead:
array=["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 11\nNNTP-Posting-Host: carson.u.washington.edu\n\nA fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.\n\nGuy Kuo <guykuo@u.washington.edu>\n"
 'From: twillis@ec.ecn.purdue.edu (Thomas E Willis)\nSubject: PB questions...\nOrganization: Purdue University Engineering Computer Network\nDistribution: usa\nLines: 36\n\nwell folks, my mac plus finally gave up the ghost this weekend after\nstarting life as a 512k way back in 1985.  sooo, i\'m in the market for a\nnew machine a bit sooner than i intended to be...\n\ni\'m looking into picking up a powerbook 160 or maybe 180 and have a bunch\nof questions that (hopefully) somebody can answer:\n\n* does anybody know any dirt on when the next round of powerbook\nintroductions are expected?  i\'d heard the 185c was supposed to make an\nappearence "this summer" but haven\'t heard anymore on it - and since i\ndon\'t have access to macleak, i was wondering if anybody out there had\nmore info...\n\n* has anybody heard rumors about price drops to the powerbook line like the\nones the duo\'s just went through recently?\n\n* what\'s the impression of the display on the 180?  i could probably swing\na 180 if i got the 80Mb disk rather than the 120, but i don\'t really have\na feel for how much "better" the display is (yea, it looks great in the\nstore, but is that all "wow" or is it really that good?).  could i solicit\nsome opinions of people who use the 160 and 180 day-to-day on if its worth\ntaking the disk size and money hit to get the active display?  (i realize\nthis is a real subjective question, but i\'ve only played around with the\nmachines in a computer store breifly and figured the opinions of somebody\nwho actually uses the machine daily might prove helpful).\n\n* how well does hellcats perform?  ;)\n\nthanks a bunch in advance for any info - if you could email, i\'ll post a\nsummary (news reading time is at a premium with finals just around the\ncorner... :( )\n--\nTom Willis  \\  twillis@ecn.purdue.edu    \\    Purdue Electrical Engineering\n---------------------------------------------------------------------------\n"Convictions are more dangerous enemies of truth than lies."  - F. W.\nNietzsche\n'
 ...
 'From: westes@netcom.com (Will Estes)\nSubject: Mounting CPU Cooler in vertical case\nOrganization: Mail Group\nX-Newsreader: TIN [version 1.1 PL8]\nLines: 13\n\nI just installed a DX2-66 CPU in a clone motherboard, and tried mounting a CPU \ncooler on the chip.  After about 1/2 hour, the weight of the cooler was enough \nto dislodge the CPU from its mount.  It ended up bending a few pins\non the CPU, but luckily the power was not on yet.  I ended up\npressing the CPU deeply into its socket and then putting the CPU\ncooler back on.  So far so good.\n\nHave others had this problem?  How do you ensure that the weight of\nthe CPU fan and heatsink do not eventually work the CPU out of its\nsocket when mounting the motherboard in a vertical case?\n\n-- \nWill Estes\t\tInternet: westes@netcom.com\n'
 "From: steve@hcrlgw (Steven Collins)\nSubject: Re: Sphere from 4 points?\nOrganization: Central Research Lab. Hitachi, Ltd.\nLines: 27\nNntp-Posting-Host: hcrlgw\n\nIn article <1qkgbuINNs9n@shelley.u.washington.edu> bolson@carson.u.washington.edu (Edward Bolson) writes:\n>Boy, this will be embarassing if it is trivial or an FAQ:\n>\n>Given 4 points (non coplanar), how does one find the sphere, that is,\n>center and radius, exactly fitting those points?  I know how to do it\n>for a circle (from 3 points), but do not immediately see a \n>straightforward way to do it in 3-D.  I have checked some\n>geometry books, Graphics Gems, and Farin, but am still at a loss?\n>Please have mercy on me and provide the solution?  \n\nWouldn't this require a hyper-sphere.  In 3-space, 4 points over specifies\na sphere as far as I can see.  Unless that is you can prove that a point\nexists in 3-space that is equi-distant from the 4 points, and this may not\nnecessarily happen.\n\nCorrect me if I'm wrong (which I quite possibly am!)\n\nsteve\n---\n\n\n\n-- \n+---------------------------------------+--------------------------------+\n| Steven Collins\t\t\t| email: steve@crl.hitachi.co.jp |\n| Visiting Computer Graphics Researcher\t| phone: (0423)-23-1111 \t |\n| Hitachi Central Research Lab. Tokyo.\t| fax:   (0423)-27-7742\t\t |\n"
 "From: gunning@cco.caltech.edu (Kevin J. Gunning)\nSubject: stolen CBR900RR\nOrganization: California Institute of Technology, Pasadena\nLines: 12\nDistribution: usa\nNNTP-Posting-Host: alumni.caltech.edu\nSummary: see above\n\nStolen from Pasadena between 4:30 and 6:30 pm on 4/15.\n\nBlue and white Honda CBR900RR california plate KG CBR.   Serial number\nJH2SC281XPM100187, engine number 2101240.\n\nNo turn signals or mirrors, lights taped over for track riders session\nat Willow Springs tomorrow.  Guess I'll miss it.  :-(((\n\nHelp me find my baby!!!\n\nkjg\n\n"].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

### 4) Decision Tree Classifier

In [35]:
pipeline=Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('dt',tree.DecisionTreeClassifier())])

In [40]:
parameters={
    'dt__criterion' :('gini', 'entropy'),
    'dt__max_depth': (5,10),
    'dt__min_samples_leaf' :(1,3,5)
                   }

In [41]:
grid=GridSearchCV(pipeline,parameters,cv=3,n_jobs=-1,verbose=1)

In [42]:
grid.fit(data.data, data.target)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('dt', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'dt__criterion': ('gini', 'entropy'),
                         'dt__max_depth': (5, 10),
                         'dt__min_samples_leaf': (1, 3, 5)},
             verbose=1)

In [46]:
grid.best_score_

0.3210192358400357

In [45]:
y_pred=grid.predict(data_y.data)

In [49]:
accuracy_score(data_y.target,y_pred)

0.30018587360594795

### 5) SVM

In [67]:
pipeline=Pipeline([
    ('tfidf',TfidfTransformer()),
    ('svc',SVC())])

In [71]:
parameters={
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf']}

In [72]:
grid=GridSearchCV(pipeline,parameters,cv=5,n_jobs=-1,verbose=1)

In [80]:
train_data=data.target.reshape(1,-1)

In [82]:
grid.fit(data.data, data.target.reshape(1,-1))

ValueError: Found input variables with inconsistent numbers of samples: [11314, 1]

In [71]:
pipeline=Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('svc',SVC())])

In [84]:
parameters={
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf']}

In [85]:
grid=GridSearchCV(pipeline,parameters,cv=5,n_jobs=-1,verbose=1)

### Works forever

In [86]:
grid.fit(data.data, data.target)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


KeyboardInterrupt: 

In [88]:
parameters={
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1, 0.1, 0.01],
    'svc__kernel': ['rbf']}

In [89]:
grid=GridSearchCV(pipeline,parameters,cv=3,n_jobs=-1,verbose=1)

In [90]:
grid.fit(data.data, data.target)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


KeyboardInterrupt: 

In [72]:
parameters={
    'vect__max_df': (0.5, 0.75, 1.0),
    'svc__kernel': ('rbf',)}

In [73]:
grid=GridSearchCV(pipeline,parameters,cv=3,n_jobs=-1,verbose=1)

In [74]:
grid.fit(data.data, data.target)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'svc__kernel': ('rbf',),
                         'vect__max_df': (0.5, 0.75, 1.0)},
             verbose=1)

In [75]:
grid.best_score_

0.8909318374426177

In [76]:
y_pred=grid.predict(data_y.data)

In [77]:
accuracy_score(data_y.target,y_pred)

0.8224907063197026

In [12]:
grid.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(max_df=0.5)),
                ('tfidf', TfidfTransformer()), ('svc', SVC())])

### 5)Multinomial Naive Bayes

In [78]:
pipeline=Pipeline([
    ('vect',CountVectorizer()),
    ('nb',MultinomialNB())])

In [79]:
parameters={
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'nb__alpha': (1, 0.1, 0.01, 0.001) }

In [80]:
grid=GridSearchCV(pipeline,parameters,cv=3,n_jobs=-1,verbose=1)

In [81]:
grid.fit(data.data, data.target)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (1, 0.1, 0.01, 0.001),
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [82]:
grid.best_score_

0.8960580265067289

In [83]:
y_pred=grid.predict(data_y.data)

In [84]:
accuracy_score(data_y.target,y_pred)

0.8179766330323951

In [85]:
pipeline=Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('nb',MultinomialNB())])

In [86]:
parameters={
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'nb__alpha': (1, 0.1, 0.01, 0.001) }

In [87]:
grid=GridSearchCV(pipeline,parameters,cv=3,n_jobs=-1,verbose=1)

In [88]:
grid.fit(data.data, data.target)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (1, 0.1, 0.01, 0.001),
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [89]:
grid.best_score_

0.9071060198859054

In [90]:
grid.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(max_df=0.75, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('nb', MultinomialNB(alpha=0.01))])

In [91]:
y_pred=grid.predict(data_y.data)

In [92]:
accuracy_score(data_y.target,y_pred)

0.8349707912904939

In [97]:
def gridpipeline(pipeline,parameters):
    grid=GridSearchCV(pipeline,parameters,cv=3,n_jobs=-1,verbose=1)
    grid.fit(data.data, data.target)
    best_score=grid.best_score_
    y_pred=grid.predict(data_y.data)
    accuracy=accuracy_score(data_y.target,y_pred)
    return best_score,accuracy

In [98]:
best,accuracy=gridpipeline(pipeline,parameters)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [99]:
best,accuracy

(0.9071060198859054, 0.8349707912904939)