In [172]:
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob, Word
from BeautifulSoup import BeautifulSoup
import re
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

In [157]:
stemmer = SnowballStemmer('english')

In [158]:
#define functions here

def clean_html(raw_html):
    for index, row in raw_html.iterrows():
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', row['content_decoded'])
        return cleantext

def word_tokenize(text, how = 'lemma'):
    words = TextBlob(text).words
    if how == 'lemma':
        return [word.lemmatize() for word in words]
    elif how == 'stem':
        return [stemmer.stem(word) for word in words]
    
def spelling(text):
    words = TextBlob(text).words
    return [word.correct() for word in words]

In [147]:
#read in all csv files

bio = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/biology.csv')
cooking = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/cooking.csv')
crypto = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/crypto.csv')
diy = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/diy.csv')
robot = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/robotics.csv')
travel = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/travel.csv')

In [148]:
bio['content_decoded'] = (bio['title'] + ' ' + bio['content'])
bio['content_decoded'] = bio['content_decoded'].str.decode('utf-8', errors = 'ignore').str.lower()
bio.head()

Unnamed: 0,id,title,content,tags,content_decoded
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...,what is the criticality of the ribosome bindin...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry,how is rnase contamination in rna based experi...
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology,are lymphocyte sizes clustered in two groups? ...
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture,how long does antibiotic-dosed lb maintain goo...
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons,is exon order always preserved in splicing? <p...


In [159]:
bio['content_decoded'] = bio['content_decoded'].str.replace('<.*?>', ' ')
bio['content_decoded'] = bio['content_decoded'].str.replace('/n', ' ')
bio.head()

Unnamed: 0,id,title,content,tags,content_decoded,Content_decoded
0,1,What is the criticality of the ribosome bindin...,"In prokaryotic translation, how critical for e...",ribosome binding-sites translation synthetic-b...,what is the criticality of the ribosome bindin...,what is the criticality of the ribosome bindin...
1,2,How is RNAse contamination in RNA based experi...,Does anyone have any suggestions to prevent RN...,rna biochemistry,how is rnase contamination in rna based experi...,how is rnase contamination in rna based experi...
2,3,Are lymphocyte sizes clustered in two groups?,Tortora writes in Principles of Anatomy and Ph...,immunology cell-biology hematology,are lymphocyte sizes clustered in two groups? ...,are lymphocyte sizes clustered in two groups? ...
3,4,How long does antibiotic-dosed LB maintain goo...,Various people in our lab will prepare a liter...,cell-culture,how long does antibiotic-dosed lb maintain goo...,how long does antibiotic-dosed lb maintain goo...
4,5,Is exon order always preserved in splicing?,Are there any cases in which the splicing mach...,splicing mrna spliceosome introns exons,is exon order always preserved in splicing? ar...,is exon order always preserved in splicing? ar...


In [160]:
bio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13196 entries, 0 to 13195
Data columns (total 6 columns):
id                 13196 non-null int64
title              13196 non-null object
content            13196 non-null object
tags               13196 non-null object
content_decoded    13196 non-null object
Content_decoded    13196 non-null object
dtypes: int64(1), object(5)
memory usage: 618.6+ KB


In [161]:
#null accuracy

null = bio.tags.value_counts() / bio.shape[0]
null.head()

evolution        0.012807
human-biology    0.012504
genetics         0.012428
biochemistry     0.008184
cell-biology     0.006896
Name: tags, dtype: float64

In [162]:
x_train, x_test, y_train, y_test = train_test_split(bio.content_decoded, bio.tags, random_state = 1)
x_train.head()

6474     what is the evolutionary purpose of white spot...
6491     binding of multivalent antibody to mutiple epi...
272      how does golgi's neural histological stain wor...
11469    cellular demand for antioxidants antioxidants ...
11950    does cas9 require a nuclear localization signa...
Name: content_decoded, dtype: object

In [163]:
vect = CountVectorizer(stop_words = 'english', analyzer = lambda x: word_tokenize(x, how = 'stem')) #troubleshoot stop_words
x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)

In [164]:
#examine the vocabulary and document-term matrix together

train_arr = x_train_dtm.toarray()
train_features = vect.get_feature_names()
pd.DataFrame(x_train_dtm.toarray(), columns = vect.get_feature_names()).head()

Unnamed: 0,'3,'5,'a,'b,'c,'d,'detect,'e,'g,'h,...,➕,➖,➜,➡,➡++gfe,ヒト,螳螂捕蝉，黄雀在后,说苑,？,￼￼￼￼￼
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
#verify shape of train and test

print x_train_dtm.shape
print x_test_dtm.shape

(9897, 32590)
(3299, 32590)


In [166]:
#create df with count and token

x_train_token_counts = pd.DataFrame({'Token': train_features, 'Count': np.sum(train_arr, axis = 0)})
x_train_token_counts.sort_values(by = 'Count', ascending = False).head()

Unnamed: 0,Count,Token
29111,50276,the
21456,27812,of
29447,25368,to
3458,23380,a
15759,21923,i


In [167]:
#instantiate and fit the model

nb = MultinomialNB()
nb.fit(x_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [168]:
#check accuracy of predictions

predictions = nb.predict(x_test_dtm)
print metrics.accuracy_score(y_test, predictions)

0.0248560169748


In [169]:
f1_score(y_test, predictions, average = 'weighted')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.0041656583890502764

In [174]:
logreg = LogisticRegression()
logreg.fit(x_train_dtm, y_train)
predictions = logreg.predict(x_test_dtm)
metrics.accuracy_score(y_test, predictions)

0.057290087905425884

In [175]:
f1_score(y_test, predictions, average = 'weighted')

0.024478285824780181