In [1]:
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob, Word
from BeautifulSoup import BeautifulSoup
import re
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

In [2]:
stemmer = SnowballStemmer('english')

In [3]:
#define functions here

def clean_html(raw_html):
    for index, row in raw_html.iterrows():
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', row['content_decoded'])
        return cleantext

def word_tokenize(text, how = 'lemma'):
    words = TextBlob(text).words
    if how == 'lemma':
        return [word.lemmatize() for word in words]
    elif how == 'stem':
        return [stemmer.stem(word) for word in words]
    
def spelling(text):
    words = TextBlob(text).words
    return [word.correct() for word in words]

In [4]:
#read in all csv files

bio = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/biology.csv')
cooking = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/cooking.csv')
crypto = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/crypto.csv')
diy = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/diy.csv')
robot = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/robotics.csv')
travel = pd.read_csv('https://raw.githubusercontent.com/sglembocki/Machine-Learning/master/StackExchangeLearning/travel.csv')
test = pd.read_csv('C:\Users\Steven\Documents\Git\StackExchangeLearning\\test.csv')
id = test['id']

In [5]:
all = pd.concat([bio, diy, robot], ignore_index = True)

In [6]:
all['content_decoded'] = (all['title'] + ' ' + all['content'])
all['content_decoded'] = all['content_decoded'].str.decode('utf-8', errors = 'ignore').str.lower()
all.head()

Unnamed: 0,id,title,content,tags,content_decoded
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...,what is the criticality of the ribosome bindin...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry,how is rnase contamination in rna based experi...
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology,are lymphocyte sizes clustered in two groups? ...
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture,how long does antibiotic-dosed lb maintain goo...
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons,is exon order always preserved in splicing? <p...


In [37]:
#null accuracy

null = all.tags.value_counts() / all.shape[0]
null.head()

electrical           0.026191
plumbing             0.010720
electrical wiring    0.005945
hvac                 0.005061
wiring               0.004870
Name: tags, dtype: float64

In [7]:
all['content_decoded'] = all['content_decoded'].str.replace('<.*?>', ' ')
all['content_decoded'] = all['content_decoded'].str.replace('/n', ' ')
del all['title']
del all['content']
all.head()

Unnamed: 0,id,tags,content_decoded
0,1,ribosome binding-sites translation synthetic-b...,what is the criticality of the ribosome bindin...
1,2,rna biochemistry,how is rnase contamination in rna based experi...
2,3,immunology cell-biology hematology,are lymphocyte sizes clustered in two groups? ...
3,4,cell-culture,how long does antibiotic-dosed lb maintain goo...
4,5,splicing mrna spliceosome introns exons,is exon order always preserved in splicing? a...


In [39]:
all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41885 entries, 0 to 41884
Data columns (total 3 columns):
id                 41885 non-null int64
tags               41885 non-null object
content_decoded    41885 non-null object
dtypes: int64(1), object(2)
memory usage: 981.8+ KB


In [8]:
x_train, x_test, y_train, y_test = train_test_split(all.content_decoded, all.tags, random_state = 1)
x_train.head()

31181    shower drain issues  my house is 60 years old....
18129    how do i properly ventilate my pantry?  i live...
15981    is it okay to store laminate flooring in a dry...
34469    why did my cooktop trip the breaker when i cle...
26924    what tool will allow me to make a cutout in th...
Name: content_decoded, dtype: object

In [9]:
vect = CountVectorizer(stop_words = 'english', analyzer = lambda x: word_tokenize(x, how = 'stem')) #troubleshoot stop_words
x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)

In [12]:
#examine the vocabulary and document-term matrix together

train_arr = x_train_dtm.toarray()
train_features = vect.get_feature_names()
pd.DataFrame(x_train_dtm.toarray(), columns = vect.get_feature_names()).head()

MemoryError: 

In [13]:
#verify shape of train and test

print x_train_dtm.shape
print x_test_dtm.shape

(65250, 119108)
(21750, 119108)


In [40]:
#create df with count and token

x_train_token_counts = pd.DataFrame({'Token': train_features, 'Count': np.sum(train_arr, axis = 0)})
x_train_token_counts.sort_values(by = 'Count', ascending = False).head()

Unnamed: 0,Count,Token
29109,50275,the
21451,27812,of
29441,25368,to
3443,23402,a
15763,21943,i


In [10]:
#instantiate and fit the model

nb = MultinomialNB()
nb.fit(x_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
#check accuracy of predictions

predictions = nb.predict(x_test_dtm)
print metrics.accuracy_score(y_test, predictions)

0.0248560169748


In [43]:
f1_score(y_test, predictions, average = 'weighted')

0.0041570208805518892

In [18]:
logreg = LogisticRegression()
logreg.fit(x_train_dtm, y_train)
predictions = logreg.predict(x_test_dtm)
metrics.accuracy_score(y_test, predictions)

0.056986965747196118

In [19]:
f1_score(y_test, predictions, average = 'weighted')

0.024605002113270696

In [None]:
f1_score(y_test, predictions, average = 'weighted')

In [11]:
test['content_decoded'] = (test['title'] + ' ' + test['content'])
test['content_decoded'] = test['content_decoded'].str.decode('utf-8', errors = 'ignore').str.lower()
test['content_decoded'] = test['content_decoded'].str.replace('<.*?>', ' ')
test['content_decoded'] = test['content_decoded'].str.replace('/n', ' ')
test = test['content_decoded']
test.head()

0    what is spin as it relates to subatomic partic...
1    what is your simplest explanation of the strin...
2    lie theory, representations and particle physi...
3    will determinism be ever possible?  what are t...
4    hamilton's principle  hamilton's principle sta...
Name: content_decoded, dtype: object

In [14]:
test_dtm = vect.transform(test)

In [None]:
predictions = nb.predict(test_dtm)
result = pd.DataFrame('predictions')

In [52]:
result = pd.concat([id, pd.DataFrame(result)], axis = 1, join_axes = [id.index])
result.head()

Unnamed: 0,id,tags
0,1,evolution
1,2,evolution
2,3,evolution
3,7,genetics
4,9,genetics


In [53]:
result.to_csv('submission.csv', sep = ',', index = False)