In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## LOAD DATASET

In [2]:
train = pd.read_csv("./data/train.tsv", sep = "\t", index_col = "PhraseId")
train["Phrase(original)"] = train["Phrase"]
print(train.shape)
train.head(10)

(156060, 4)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(original)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,A series of escapades demonstrating the adage ...,1,A series of escapades demonstrating the adage ...
2,1,A series of escapades demonstrating the adage ...,2,A series of escapades demonstrating the adage ...
3,1,A series,2,A series
4,1,A,2,A
5,1,series,2,series
6,1,of escapades demonstrating the adage that what...,2,of escapades demonstrating the adage that what...
7,1,of,2,of
8,1,escapades demonstrating the adage that what is...,2,escapades demonstrating the adage that what is...
9,1,escapades,2,escapades
10,1,demonstrating the adage that what is good for ...,2,demonstrating the adage that what is good for ...


In [3]:
test = pd.read_csv("./data/test.tsv", sep = "\t", index_col = "PhraseId")
test["Phrase(original)"] = train["Phrase"]
print(test.shape)
test.head(10)


(66292, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Phrase(original)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
156061,8545,An intermittently pleasing but mostly routine ...,
156062,8545,An intermittently pleasing but mostly routine ...,
156063,8545,An,
156064,8545,intermittently pleasing but mostly routine effort,
156065,8545,intermittently pleasing but mostly routine,
156066,8545,intermittently pleasing but,
156067,8545,intermittently pleasing,
156068,8545,intermittently,
156069,8545,pleasing,
156070,8545,but,


## PREPROCESSING

## Cleantext -- 1

In [4]:
import nltk
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [5]:
def stem_phrase(phrase):
    words = phrase.split(" ")
    stemmed_words = []

    for word in words:
        stemmed_word = stemmer.stem(word)
        stemmed_words.append(stemmed_word)

    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase

# Train
tqdm.pandas(desc="Stemming... (train)")

train["Phrase"] = train["Phrase"].progress_apply(stem_phrase)

print(train.shape)
train.head()

# Test
tqdm.pandas(desc = "Stemming... (test)")

test["Phrase"] = test["Phrase"].progress_apply(stem_phrase)

print(test.shape)
test.head()


Stemming... (train): 100%|██████████| 156060/156060 [00:16<00:00, 9681.99it/s] 
Stemming... (test):   2%|▏         | 1141/66292 [00:00<00:05, 11408.49it/s]

(156060, 4)


Stemming... (test): 100%|██████████| 66292/66292 [00:06<00:00, 10157.59it/s]

(66292, 3)





Unnamed: 0_level_0,SentenceId,Phrase,Phrase(original)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
156061,8545,an intermitt pleas but most routin effort .,
156062,8545,an intermitt pleas but most routin effort,
156063,8545,an,
156064,8545,intermitt pleas but most routin effort,
156065,8545,intermitt pleas but most routin,


# Cleantext -- 2

In [6]:
def clean_text(phrase):
    phrase = phrase.replace("n't", "not")
    phrase = phrase.replace("hopeless", "bad")
    phrase = phrase.replace("good", "best")
    phrase = phrase.replace("excellent", "best")
    phrase = phrase.replace("funni", "fun")
    phrase = phrase.replace("funny", "fun")
    phrase = phrase.replace("littl", "little")
    phrase = phrase.replace("the movi", "movie")
    phrase = phrase.replace("veri", "very")
    phrase = phrase.replace("onli", "only")
    phrase = phrase.replace("comedi", "comedy")
    phrase = phrase.replace("veri", "very")
    phrase = phrase.replace("stori", "story")
    phrase = phrase.replace("charact", "character")
    
    #Hypothesis
    phrase = phrase.replace("not", "no")
    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)
test["Phrase"] = test["Phrase"].apply(clean_text)


## one Hot Encode Phrases

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

char_vectorizer = TfidfVectorizer(analyzer = 'char', 
                                  max_features = 10000, 
                                  ngram_range = (1,9))

char_vectorizer.fit(train["Phrase"])
%time

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 10 µs


In [8]:
word_vectorizer = TfidfVectorizer(analyzer = 'word', 
                                  max_features = 30000, 
                                  ngram_range = (1,4))

word_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

### Fit and Transform 

In [9]:
X_train_word = word_vectorizer.transform(train["Phrase"])
print(X_train_word.shape)
X_train_char = char_vectorizer.transform(train["Phrase"])
print(X_train_char.shape)

from scipy.sparse import hstack #vstack, hstack (word, character to merge )
X_train = hstack([X_train_char, X_train_word])
print(X_train.shape)
X_train

(156060, 30000)
(156060, 10000)
(156060, 40000)


<156060x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 19680572 stored elements in COOrdinate format>

In [10]:
X_test_word = word_vectorizer.transform(test["Phrase"])
print(X_test_word.shape)
X_test_char = char_vectorizer.transform(test["Phrase"])
print(X_test_char.shape)

X_test = hstack([X_test_char, X_test_word])
print(X_test.shape)
X_test

(66292, 30000)
(66292, 10000)
(66292, 40000)


<66292x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 7612200 stored elements in COOrdinate format>

In [11]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

## Score 

In [12]:
from sklearn.linear_model import SGDClassifier

seed = 23

model = SGDClassifier(n_jobs=-1,
                      alpha = 0.00005,
                      random_state=seed)

model

SGDClassifier(alpha=5e-05, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=-1, penalty='l2', power_t=0.5, random_state=23, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [13]:
#cross validation

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
# http://scikit-learn.org/stable/modules/classes.html
predictions = cross_val_predict(model, X_train, y_train, cv = 5)

score = accuracy_score(y_train, predictions)

print("Score = {0:5f}".format(score))




Score = 0.592452


In [14]:
from sklearn.model_selection import cross_val_score, GroupKFold
# from sklearn.cross_validation import cross_val_score, GroupKFold

kfold = GroupKFold(n_splits=5)

score = cross_val_score(model, X_train, y_train, cv=kfold, groups=train["SentenceId"]).mean()

print("Score = {0:.5f}".format(score))



Score = 0.59852


## Predictions

In [15]:
import xgboost as xgb

In [16]:
dtrain = xgb.DMatrix(X_train, label = y_train)

In [17]:
params = {
    'booster': 'gblinear',
    'objective': 'multi:softmax',
    'lambda': 2.186753e-03,
    'alpha': 1.286904,
    'lambda_bias': 6.191707e+00,
    'num_class': 5,
    'nthread':2,
}

booster = xgb.train(params, dtrain, num_boost_round=98)
booster

<xgboost.core.Booster at 0x1a1887e630>

In [18]:
dtest = xgb.DMatrix(X_test.toarray())

In [19]:
predictions = booster.predict(dtest)
print(predictions.shape)
predictions[0:10]

(66292,)


array([ 3.,  3.,  2.,  3.,  3.,  3.,  3.,  2.,  3.,  2.], dtype=float32)

# Submit

In [20]:
submission = pd.read_csv("./data/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3.0
156062,3.0
156063,2.0
156064,3.0
156065,3.0


In [21]:
submission.to_csv("tfidf-xgboost.csv")