In [37]:
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob, Word
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import f1_score

In [2]:
stemmer = SnowballStemmer('english')

In [5]:
#define functions here

def word_tokenize(text, how = 'lemma'):
    words = TextBlob(text).words
    if how == 'lemma':
        return [word.lemmatize() for word in words]
    elif how == 'stem':
        return [stemmer.stem(word) for word in words]
    
def spelling(text):
    words = TextBlob(text).words
    return [word.correct() for word in words]

In [4]:
#read in all csv files

bio = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/biology.csv')
cooking = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/cooking.csv')
crypto = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/crypto.csv')
diy = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/diy.csv')
robot = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/robotics.csv')
travel = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/travel.csv')

In [39]:
bio['content_decoded'] = bio['content'].str.decode('utf-8', errors = 'ignore').str.lower()
bio['content_decoded'] = bio['content_decoded'].str.replace('<p>', '')
bio.head()

Unnamed: 0,id,title,content,tags,content_decoded
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...,"in prokaryotic translation, how critical for e..."
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry,does anyone have any suggestions to prevent rn...
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology,tortora writes in <em>principles of anatomy an...
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture,various people in our lab will prepare a liter...
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons,are there any cases in which the splicing mach...


In [40]:
bio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13196 entries, 0 to 13195
Data columns (total 5 columns):
id                 13196 non-null int64
title              13196 non-null object
content            13196 non-null object
tags               13196 non-null object
content_decoded    13196 non-null object
dtypes: int64(1), object(4)
memory usage: 515.5+ KB


In [41]:
#null accuracy

null = bio.tags.value_counts() / bio.shape[0]
null.head()

evolution        0.012807
human-biology    0.012504
genetics         0.012428
biochemistry     0.008184
cell-biology     0.006896
Name: tags, dtype: float64

In [42]:
x_train, x_test, y_train, y_test = train_test_split(bio.content_decoded, bio.tags, random_state = 1)
x_train.head()

6474     suppose you haven't hit your nails so the whit...
6491     <ol>\n<li>a multivalent antibody molecule such...
272      what is known about the targets of golgi stain...
11469    antioxidants such as ascorbate and glutathione...
11950    i'm trying to establish if it's required to ad...
Name: content_decoded, dtype: object

In [43]:
vect = CountVectorizer(stop_words = 'english', analyzer = lambda x: word_tokenize(x, how = 'stem')) #troubleshoot stop_words
x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)

In [21]:
#examine the vocabulary and document-term matrix together

train_arr = x_train_dtm.toarray()
train_features = vect.get_feature_names()
pd.DataFrame(x_train_dtm.toarray(), columns = vect.get_feature_names()).head()

Unnamed: 0,'3,'5,'a,'b,'c,'d,'detect,'e,'g,'h,...,➕,➖,➜,➡,➡++gfe,ヒト,螳螂捕蝉，黄雀在后,说苑,？,￼￼￼￼￼
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
#verify shape of train and test

print x_train_dtm.shape
print x_test_dtm.shape

(9897, 37393)
(3299, 37393)


In [28]:
#create df with count and token

x_train_token_counts = pd.DataFrame({'Token': train_features, 'Count': np.sum(train_arr, axis = 0)})
x_train_token_counts.sort_values(by = 'Count', ascending = False).head()

Unnamed: 0,Count,Token
25300,57908,p
32626,47021,the
3858,30199,a
24628,24949,of
32974,23615,to


In [44]:
#instantiate and fit the model

nb = MultinomialNB()
nb.fit(x_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [45]:
#check accuracy of predictions

predictions = nb.predict(x_test_dtm)
print metrics.accuracy_score(y_test, predictions)

0.0233404061837


In [46]:
f1_score(y_test, predictions, average = 'weighted')

0.0026287138833499075