In [1]:
%pylab inline
import pandas as pd
from sklearn.utils import resample
from wordcloud import WordCloud
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

Populating the interactive namespace from numpy and matplotlib


In [2]:
file_name='train.tsv'
seperator = '\t'
X_clm_name='Sentiment'
y_clm_name='Phrase'

In [3]:
max_features = 4000
ngram_range = (1,3)
stopwords = None
random_seed = 123

In [4]:
df = pd.read_csv(file_name, sep=seperator)
df = df[df[X_clm_name] != "none"]
df = df[df[X_clm_name] != ""]
df = df[df[X_clm_name].notna()]
df = df[df[y_clm_name] != "none"]
df = df[df[y_clm_name] != ""]
df = df[df[y_clm_name].notna()]
df.head()

  result = method(y)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
df = df[df[y_clm_name].str.len() <= 200]

In [6]:
df[X_clm_name] = df.apply(lambda row: 1 
                           if row[X_clm_name] >= 3 
                           else 0, axis=1
                          )

In [7]:
positives = df[df[X_clm_name] == 1]
positives.count()[0]

41783

In [8]:
negatives = df[df[X_clm_name] == 0]
negatives.count()[0]

113578

In [9]:
n_samples = (negatives.count()[0] 
             if negatives.count()[0] < positives.count()[0] 
             else positives.count()[0]
            )
n_samples

41783

In [10]:
positives = resample(positives, n_samples=n_samples, random_state=random_seed)
positives.count()[0]

41783

In [11]:
negatives = resample(negatives, n_samples=n_samples, random_state=random_seed)
negatives.count()[0]

41783

In [12]:
df = negatives.append(positives)
df.count()[0]

83566

In [13]:
kf = KFold(n_splits=10, shuffle=True, random_state=random_seed)
y = df[X_clm_name]
X_raw = df[y_clm_name]
def do_cross_val(model, X):
    acc_score = cross_val_score(
        model, 
        X, 
        y, 
        cv=kf, 
        scoring='accuracy'
    )
    print('Accuracies:')
    print(acc_score)
    print('Mean accuracy:')
    print(np.mean(acc_score))

In [14]:
tfidf = TfidfVectorizer(
    lowercase=True, 
    stop_words=stopwords, 
    ngram_range=ngram_range,
    max_features=max_features
)

start_time = time.time()

X_tfidf = tfidf.fit_transform(X_raw)

run_time = (time.time() - start_time)
print(f'time taken: {run_time} seconds')

time taken: 1.4952785968780518 seconds


In [15]:
cv = CountVectorizer(
    lowercase=True, 
    stop_words=stopwords, 
    ngram_range=ngram_range,
    max_features=max_features
)
start_time = time.time()

X_cv = cv.fit_transform(X_raw)

run_time = (time.time() - start_time)
print(f'time taken: {run_time} seconds')

time taken: 1.4493730068206787 seconds


In [16]:
def do_cross_val_with_each_X(model):
    start_time = time.time()
    print("tfidf")
    do_cross_val(model, X_tfidf)
    run_time = (time.time() - start_time)
    print(f'time taken: {run_time} seconds\n')
    start_time = time.time()
    print("cv")
    do_cross_val(model, X_cv)
    run_time = (time.time() - start_time)
    print(f'time taken: {run_time} seconds')

In [17]:
lr = make_pipeline(
    StandardScaler(with_mean=False), 
    LogisticRegression(random_state=random_seed, solver='liblinear', C=1))
do_cross_val_with_each_X(lr)

tfidf
Accuracies:
[0.78856049 0.79406486 0.79131267 0.79059471 0.79119301 0.78963743
 0.79056965 0.78554332 0.79212542 0.78314983]
Mean accuracy:
0.7896751392631809
time taken: 12.446320295333862 seconds

cv
Accuracies:
[0.79310757 0.80172311 0.79538112 0.79071437 0.7951418  0.79358622
 0.80026328 0.7898516  0.79499761 0.78745811]
Mean accuracy:
0.7942224790069157
time taken: 13.14556336402893 seconds


In [18]:
mnb = MultinomialNB()
do_cross_val_with_each_X(mnb)

tfidf
Accuracies:
[0.70336245 0.75972239 0.70479837 0.74452555 0.711619   0.74620079
 0.75454763 0.75023935 0.75430828 0.70978937]
Mean accuracy:
0.7339113184711474
time taken: 0.15871286392211914 seconds

cv
Accuracies:
[0.69498624 0.74883331 0.6945076  0.73890152 0.69785808 0.73854254
 0.74652944 0.73719483 0.7434179  0.70045476]
Mean accuracy:
0.7241226229141597
time taken: 0.1645650863647461 seconds


In [19]:
from sklearn.svm import LinearSVC
mnb = LinearSVC()
do_cross_val_with_each_X(mnb)

tfidf
Accuracies:
[0.79406486 0.79765466 0.7939452  0.79238961 0.79669738 0.79203063
 0.79212542 0.78602202 0.79487793 0.78482528]
Mean accuracy:
0.7924632984401848
time taken: 4.039878606796265 seconds

cv




Accuracies:
[0.79478282 0.80220175 0.79442384 0.79131267 0.79550078 0.79179131
 0.79942556 0.79188607 0.79475826 0.78709909]
Mean accuracy:
0.7943182142991548
time taken: 57.48005247116089 seconds
