In [1]:
import pandas as pd

## Load Dataset

In [2]:
train = pd.read_csv("/Users/Joohyung/kaggle/sentiment/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head(10)

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2
6,1,of escapades demonstrating the adage that what...,2
7,1,of,2
8,1,escapades demonstrating the adage that what is...,2
9,1,escapades,2
10,1,demonstrating the adage that what is good for ...,2


In [3]:
test = pd.read_csv("/Users/Joohyung/kaggle/sentiment/test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [4]:
train["Phrase(origin)"] = train["Phrase"]

print(train.shape)
train.head()

(156060, 4)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,A series of escapades demonstrating the adage ...,1,A series of escapades demonstrating the adage ...
2,1,A series of escapades demonstrating the adage ...,2,A series of escapades demonstrating the adage ...
3,1,A series,2,A series
4,1,A,2,A
5,1,series,2,series


In [5]:
test["Phrase(origin)"] = test["Phrase"]

print(test.shape)
test.head()

(66292, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
156061,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,8545,An,An
156064,8545,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [6]:
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm

stemmer = SnowballStemmer('english')

def stem_phrase(phrase):
    words = phrase.split(" ")
    stemmed_words = []

    for word in words:
        stemmed_word = stemmer.stem(word)
        stemmed_words.append(stemmed_word)

    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase

tqdm.pandas(desc="Stemming... (train)")

train["Phrase"] = train["Phrase"].progress_apply(stem_phrase)

print(train.shape)
train.head()

Stemming... (train): 100%|██████████| 156060/156060 [00:19<00:00, 8118.22it/s]

(156060, 4)





Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,a seri of escapad demonstr the adag that what ...,1,A series of escapades demonstrating the adage ...
2,1,a seri of escapad demonstr the adag that what ...,2,A series of escapades demonstrating the adage ...
3,1,a seri,2,A series
4,1,a,2,A
5,1,seri,2,series


In [7]:
tqdm.pandas(desc="Stemming... (test)")

test["Phrase"] = test["Phrase"].progress_apply(stem_phrase)

print(test.shape)
test.head()

Stemming... (test): 100%|██████████| 66292/66292 [00:07<00:00, 8400.75it/s]


(66292, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
156061,8545,an intermitt pleas but most routin effort .,An intermittently pleasing but mostly routine ...
156062,8545,an intermitt pleas but most routin effort,An intermittently pleasing but mostly routine ...
156063,8545,an,An
156064,8545,intermitt pleas but most routin effort,intermittently pleasing but mostly routine effort
156065,8545,intermitt pleas but most routin,intermittently pleasing but mostly routine


In [8]:
def clean_text(phrase):
    # phrase = phrase.replace("disappointments", "disappointment")
    phrase = phrase.replace("n't", "not")
    phrase = phrase.replace("it's", "it is")
    
    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)

### One hot encode Phrase

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk


vectorizer = CountVectorizer(max_features=30000, min_df=1, 
                             ngram_range=(1, 2))

vectorizer.fit(train["Phrase"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
X_train = vectorizer.transform(train["Phrase"])

print(X_train.shape)
X_train

(156060, 30000)


<156060x30000 sparse matrix of type '<class 'numpy.int64'>'
	with 1269919 stored elements in Compressed Sparse Row format>

In [11]:
X_test = vectorizer.transform(test["Phrase"])

print(X_test.shape)
X_test

(66292, 30000)


<66292x30000 sparse matrix of type '<class 'numpy.int64'>'
	with 474152 stored elements in Compressed Sparse Row format>

In [12]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

In [13]:
train_vector = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names())

print(train_vector.shape)
train_vector.head()

(156060, 30000)


Unnamed: 0,10,10 minut,10 or,10 second,10 set,10 year,100,100 minut,100 year,10000,...,zing,zinger,zippi,zish,zone,zone and,zone arm,zone episod,zucker,zucker brothers
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Score

In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

lsvc=Pipeline([('cvec',CountVectorizer()),('tfidf',TfidfTransformer()),
                          ('clf',LinearSVC(dual=False))])

lsvc.set_params(clf__penalty='l1',clf__C=0.3,tfidf__use_idf=False,cvec__ngram_range=(1,2)
                           ).fit(train.Phrase,train.Sentiment)
predictions=lsvc.predict(test.Phrase)

predictions.shape

(66292,)

## Predict

## Submit

In [15]:
submission = pd.read_csv("/Users/Joohyung/kaggle/sentiment/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,3
156065,3


In [16]:
submission.to_csv("/Users/Joohyung/submit0909.csv")