In [65]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

## Load the data in, convert to bag-of-words count vectors

In [4]:
train = pd.read_table('../data/train.tsv',sep='\t')

In [5]:
test = pd.read_table('../data/test.tsv', sep='\t')

In [6]:
train_phrases = list(train['Phrase'].values)

In [7]:
test_phrases = list(test['Phrase'].values)

In [8]:
class StemTokenizer(object):
    """Callable tokenizer: split a document into word tokens and Porter-stem each one.

    Instances are passed as the ``tokenizer`` argument of the vectorizer below.
    """

    def __init__(self):
        # A single stemmer instance is reused for every document.
        self.portstem = PorterStemmer()

    def __call__(self, doc):
        """Return the list of stemmed tokens for the string ``doc``."""
        stem = self.portstem.stem
        return list(map(stem, word_tokenize(doc)))

In [9]:
vectorizer = CountVectorizer(stop_words='english', tokenizer = StemTokenizer())

In [10]:
train_features = vectorizer.fit_transform(train_phrases)

In [11]:
test_features = vectorizer.transform(test_phrases)

In [12]:
train_sentiments = list(train['Sentiment'].values)

## Split train matrix into a train/test set for model selection & optimization

In [20]:
#todo

## Model testing & selection

### Multinomial NB

In [16]:
# Multinomial Naive Bayes with default hyperparameters.
nb = MultinomialNB()

In [17]:
# Trained on the full training matrix; the hold-out split above is still a TODO.
nb.fit(train_features, train_sentiments)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [169]:
#post-stem_accuracy = 0.59844
#post-stem_position = 473/861

In [18]:
# Predict on the test set and write the predictions file in a single,
# idempotent step. The frame is built with named columns instead of renaming
# positional columns and reordering them in separate cells, so re-running any
# part of this can no longer swap the PhraseId / Sentiment labels.
predictions = nb.predict(test_features)
df = pd.DataFrame(
    {'PhraseId': test['PhraseId'], 'Sentiment': predictions},
    columns=['PhraseId', 'Sentiment'],
)
df.to_csv('../data/mnb_preds.csv', index=False)

### Stochastic Gradient Descent Classifier (SVM)

In [23]:
# Linear classifier trained with stochastic gradient descent, default settings.
# Per the repr printed below, the default loss is 'hinge' (a linear SVM) and
# random_state is None, so the fitted model can differ from run to run.
sgd = SGDClassifier()

In [24]:
# Trained on the full training matrix (no hold-out split yet).
sgd.fit(train_features, train_sentiments)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [41]:
#post_stem_accuracy = 0.58126
#post_stem_position = 541

In [26]:
# Predict on the test set and write the predictions file in a single,
# idempotent step. Named columns replace the old rename-then-reorder cells,
# which could swap the PhraseId / Sentiment labels if re-run out of order.
predictions = sgd.predict(test_features)
df = pd.DataFrame(
    {'PhraseId': test['PhraseId'], 'Sentiment': predictions},
    columns=['PhraseId', 'Sentiment'],
)
df.to_csv('../data/sgd_preds.csv', index=False)

### KNeighborsClassifier

*too slow to run on pc. Try on large (optimized) EC2 instance*

In [36]:
# k-nearest-neighbours classifier voting over the 3 closest training phrases.
neigh = KNeighborsClassifier(n_neighbors=3)

In [37]:
# NOTE(review): this fit cell was executed, while the predict call below is
# commented out - presumably prediction is the step that is too slow locally.
neigh.fit(train_features, train_sentiments)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [2]:
# KNN prediction is too slow to run locally, so the whole step is opt-in.
# Previously only the predict call was commented out while the cells after it
# still ran, which wrote whatever `predictions` was left over from the previous
# model into knn_preds.csv. Set the flag to True on a machine that can afford it.
RUN_KNN_PREDICTION = False

if RUN_KNN_PREDICTION:
    predictions = neigh.predict(test_features)
    # Build the frame with named columns so the step is idempotent.
    df = pd.DataFrame(
        {'PhraseId': test['PhraseId'], 'Sentiment': predictions},
        columns=['PhraseId', 'Sentiment'],
    )
    df.to_csv('../data/knn_preds.csv', index=False)

### Decision Tree Classifier

In [32]:
# Single decision tree with default settings. The repr printed below shows
# max_depth=None and random_state=None, so the tree is unpruned and results
# are not guaranteed to be reproducible across runs.
clf = DecisionTreeClassifier()

In [33]:
# Trained on the full training matrix (no hold-out split yet).
clf.fit(train_features, train_sentiments)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [34]:
#accuracy = 0.57806
#position = 550

In [35]:
# Predict on the test set and write the predictions file in a single,
# idempotent step. Named columns replace the old rename-then-reorder cells,
# which could swap the PhraseId / Sentiment labels if re-run out of order.
predictions = clf.predict(test_features)
df = pd.DataFrame(
    {'PhraseId': test['PhraseId'], 'Sentiment': predictions},
    columns=['PhraseId', 'Sentiment'],
)
df.to_csv('../data/dt_preds.csv', index=False)

### Random Forest Classifier

In [44]:
# Random forest with default settings. The repr printed below shows
# n_estimators=10 and random_state=None, so this is a small forest whose
# results can vary between runs.
rfc = RandomForestClassifier()

In [45]:
# Trained on the full training matrix (no hold-out split yet).
rfc.fit(train_features, train_sentiments)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [46]:
#accuracy = 0.58331
#position = 524

In [47]:
# Predict on the test set and write the predictions file in a single,
# idempotent step. Named columns replace the old rename-then-reorder cells,
# which could swap the PhraseId / Sentiment labels if re-run out of order.
predictions = rfc.predict(test_features)
df = pd.DataFrame(
    {'PhraseId': test['PhraseId'], 'Sentiment': predictions},
    columns=['PhraseId', 'Sentiment'],
)
df.to_csv('../data/rfc_preds.csv', index=False)

### Gradient Boosting Classifier

In [54]:
# Gradient-boosted trees with default settings. The repr printed below shows
# n_estimators=100, max_depth=3, learning_rate=0.1 and random_state=None.
gbc = GradientBoostingClassifier()

In [55]:
# Trained on the full (sparse) training matrix; no hold-out split yet.
gbc.fit(train_features, train_sentiments)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [64]:
#accuracy = 0.54581
#position = 646

In [58]:
# Predict on the test set and write the predictions file in a single,
# idempotent step. Named columns replace the old rename-then-reorder cells,
# which could swap the PhraseId / Sentiment labels if re-run out of order.
# The test matrix is densified before predicting, as in the original cell;
# NOTE(review): this allocates a full dense array - confirm it is still needed.
predictions = gbc.predict(test_features.toarray())
df = pd.DataFrame(
    {'PhraseId': test['PhraseId'], 'Sentiment': predictions},
    columns=['PhraseId', 'Sentiment'],
)
df.to_csv('../data/gbc_preds.csv', index=False)

### Neural Network Classifier (MLP)

In [66]:
# Multi-layer perceptron with default settings.
nnc = MLPClassifier()

In [None]:
# This cell has no execution count, i.e. the network has not been trained yet.
nnc.fit(train_features, train_sentiments)

In [64]:
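#accuracy = TODO - MLP has not been run yet (the 0.54581 previously shown here appears to be copied from the Gradient Boosting section)
#position = TODO - (the 646 previously shown here appears to be copied from the Gradient Boosting section)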

In [None]:
# Predict on the test set and write the predictions file in a single,
# idempotent step. Named columns replace the old rename-then-reorder cells,
# which could swap the PhraseId / Sentiment labels if re-run out of order.
# The test matrix is densified before predicting, as in the original cell;
# NOTE(review): this allocates a full dense array - confirm it is still needed.
predictions = nnc.predict(test_features.toarray())
df = pd.DataFrame(
    {'PhraseId': test['PhraseId'], 'Sentiment': predictions},
    columns=['PhraseId', 'Sentiment'],
)
df.to_csv('../data/nnc_preds.csv', index=False)