In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the labelled training sentences straight from the tutorial's GitHub repository.
# The preview in the next cell shows the columns: id, text and author.
horror_train_data = pd.read_csv('https://raw.githubusercontent.com/zekelabs/data-science-complete-tutorial/master/Data/horror-train.csv')

In [3]:
# Preview the first rows to check the columns and the author labels.
horror_train_data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# Load the test sentences from the same repository; this is the data the fitted models predict on later.
horror_test_data = pd.read_csv('https://raw.githubusercontent.com/zekelabs/data-science-complete-tutorial/master/Data/horror-test.csv')

In [5]:
# Show one complete test sentence (row position 1); head() truncates long text.
horror_test_data['text'].iloc[1]

'If a fire wanted fanning, it could readily be fanned with a newspaper, and as the government grew weaker, I have no doubt that leather and iron acquired durability in proportion, for, in a very short time, there was not a pair of bellows in all Rotterdam that ever stood in need of a stitch or required the assistance of a hammer.'

In [6]:
# Preview the test set; the output shows only id and text, with no author column.
horror_test_data.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [7]:
# Keep only the model input (text) and the target label (author); id is not used for training.
horror_train_data = horror_train_data.loc[:, ['text', 'author']]

In [8]:
# Confirm that only the text and author columns remain after the selection.
horror_train_data.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",EAP
1,It never once occurred to me that the fumbling...,HPL
2,"In his left hand was a gold snuff box, from wh...",EAP
3,How lovely is spring As we looked from Windsor...,MWS
4,"Finding nothing else, not even gold, the Super...",HPL


In [9]:
from sklearn.pipeline import make_pipeline

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [11]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer

In [12]:
# Candidate classifiers, all left at their default hyper-parameters.
estimators = [LogisticRegression(), DecisionTreeClassifier(), MultinomialNB(), LinearSVC()]

# Every classifier gets an identical text front end, so only the final step differs:
# token counts (English stop words removed) -> TF-IDF weighting -> classifier.
pipelines = []
for model in estimators:
    pipeline = make_pipeline(
        CountVectorizer(stop_words='english'),
        TfidfTransformer(),
        model,
    )
    pipelines += [pipeline]

In [13]:
# Inspect the final step of the second pipeline: a (name, estimator) pair.
# Index 1 is the DecisionTreeClassifier pipeline; step 2 follows the count and TF-IDF steps.
pipelines[1].steps[2]

('decisiontreeclassifier',
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'))

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out part of the labelled data for evaluation (train_test_split's default split size).
# A fixed random_state makes the shuffle, and therefore the scores reported below,
# repeatable across kernel restarts; without it every run scores a different split.
trainX, testX, trainY, testY = train_test_split(
    horror_train_data.text,
    horror_train_data.author,
    random_state=42,
)

In [16]:
# Fit every candidate pipeline on the same training split so their scores are comparable.
# NOTE(review): the loop variable `pipeline` stays bound in the notebook namespace after the loop.
for pipeline in pipelines:
    pipeline.fit(trainX, trainY)



In [17]:
# Report each pipeline's score on the held-out split, one value per line,
# in the same order as `pipelines`.
for pipeline in pipelines:
    accuracy = pipeline.score(testX, testY)
    print(accuracy)

0.7885597548518897
0.6126659856996935
0.8120531154239019
0.8096016343207354


In [18]:

# Predict an author for every test sentence with each fitted pipeline.
# results[i] holds the predictions of pipelines[i].
test_sentences = horror_test_data['text']
results = []
for pipeline in pipelines:
    result = pipeline.predict(test_sentences)
    results.append(result)

In [19]:
# Display the predicted author labels: one array per pipeline, in the order of `pipelines`.
results

[array(['MWS', 'EAP', 'EAP', ..., 'EAP', 'MWS', 'EAP'], dtype=object),
 array(['MWS', 'EAP', 'EAP', ..., 'EAP', 'MWS', 'EAP'], dtype=object),
 array(['MWS', 'EAP', 'EAP', ..., 'EAP', 'MWS', 'HPL'], dtype='<U3'),
 array(['MWS', 'EAP', 'EAP', ..., 'EAP', 'MWS', 'HPL'], dtype=object)]

In [20]:
# Spot-check a few test sentences with the last pipeline in the list (the LinearSVC one).
# Index the list explicitly instead of relying on the `pipeline` loop variable that an
# earlier cell happens to leave bound; that hidden state breaks if cells are run out of order.
pipelines[-1].predict(horror_test_data.text[2:5])


array(['EAP', 'EAP', 'EAP'], dtype=object)

In [22]:
# Show the three test sentences (rows 2-4) that were passed to predict() in the spot check.
horror_test_data.text[2:5]

2    And when they had broken down the frail door t...
3    While I was thinking how I should possibly man...
4    I am not sure to what limit his knowledge may ...
Name: text, dtype: object