In [53]:
import nltk
import pandas
import re
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.externals import joblib

In [54]:
df = pandas.read_csv('../sentiment_training_set.csv', encoding='ISO-8859-1')

In [55]:
df.drop('id', axis=1, inplace=True)

In [56]:
stopwords = nltk.corpus.stopwords.words('english') + ['and']
emoji_pattern = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
    "+", flags=re.UNICODE)

def sanitize(tweet_text):
    # remove urls from tweet string
    tweet_text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet_text)
    # strip punctuation
    tweet_text = re.sub(r'[^a-zA-Z]', ' ', tweet_text)
    # strip emojiis
    tweet_text = emoji_pattern.sub(r'', tweet_text)
    # remove stop words from nltk corpus
    tweet_text = ' '.join(w for w in tweet_text.strip().lower().split() if not w in stopwords)
    return tweet_text

In [57]:
df['text'].count()

1048575

In [58]:
%%time
# sanitize the input data
df['text'] = df['text'].map(sanitize)

CPU times: user 38.5 s, sys: 52 ms, total: 38.6 s
Wall time: 38.6 s


In [61]:
%%time
vectorizer = TfidfVectorizer(min_df=1, max_df=0.8, sublinear_tf=True, use_idf=True, decode_error='ignore')
train_vectors = vectorizer.fit_transform(df['text'])
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

CPU times: user 20.6 s, sys: 148 ms, total: 20.7 s
Wall time: 20.6 s


In [63]:
joblib.dump(train_vectors, 'train_vectors.pkl')
train_vectors.shape

(1048575, 467548)

In [64]:
train_vectors = joblib.load('train_vectors.pkl')

In [68]:
def evaluate_clf(clf, X_train=train_vectors, y_train=df['sentiment']):
    xval_scores = cross_val_score(clf, X_train, y_train)
    print('Cross Val Scores:', xval_scores, '   Average:', xval_scores.mean())
    return xval_scores

In [69]:
%%time
evaluate_clf(SGDClassifier(loss='hinge'))  # linear SVM (hinge loss)

Cross Val Scores: [ 0.7069374   0.73405336  0.75191117]    Average: 0.730967307988
CPU times: user 4.96 s, sys: 28 ms, total: 4.99 s
Wall time: 4.99 s


array([ 0.7069374 ,  0.73405336,  0.75191117])

In [70]:
%%time
evaluate_clf(SGDClassifier(loss='squared_loss'))  # linear classifier

Cross Val Scores: [ 0.72942499  0.75156284  0.76239686]    Average: 0.747794895903
CPU times: user 4.83 s, sys: 48 ms, total: 4.88 s
Wall time: 4.88 s


array([ 0.72942499,  0.75156284,  0.76239686])

In [71]:
%%time
evaluate_clf(SGDRegressor(loss='squared_loss'))  # linear regression

Cross Val Scores: [ 0.1878434   0.2129654   0.22280161]    Average: 0.207870137382
CPU times: user 4.73 s, sys: 32 ms, total: 4.76 s
Wall time: 4.76 s


array([ 0.1878434 ,  0.2129654 ,  0.22280161])

In [72]:
clf = SGDClassifier(loss='squared_loss')
clf.fit(train_vectors, df['sentiment'])

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='squared_loss', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [73]:
reg = SGDRegressor(loss='squared_loss')
reg.fit(train_vectors, df['sentiment'])

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

In [74]:
x_test = vectorizer.transform(['I love my job'])
print(clf.predict(x_test))
print(reg.predict(x_test))

[1]
[ 0.92997265]


In [75]:
x_test = vectorizer.transform(['I hate this fucking place'])
print(clf.predict(x_test))
print(reg.predict(x_test))

[0]
[ 0.11978302]


In [77]:
x_test = vectorizer.transform(['I would like an orange juice please'])
print(clf.predict(x_test))
print(reg.predict(x_test))

[1]
[ 0.52676642]


In [78]:
joblib.dump(reg, 'reg.pkl')

['reg.pkl']

In [79]:
joblib.dump(clf, 'clf.pkl')

['clf.pkl']