In [3]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas as pd
import numpy as np 
import textblob
import string
import xgboost

In [4]:
from keras.preprocessing import sequence, text
from keras import layers, models, optimizers

In [10]:
data = pd.read_csv("train.tsv", sep= "\t")
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [11]:
data["Sentiment"].replace(0, value = "negative", inplace = True)
data["Sentiment"].replace(1, value = "negative", inplace = True)

data["Sentiment"].replace(3, value = "positive", inplace = True)
data["Sentiment"].replace(4, value = "positive", inplace = True)

data = data[(data.Sentiment == "negative") | (data.Sentiment == "positive")]

In [12]:
data.groupby("Sentiment").count()

Unnamed: 0_level_0,PhraseId,SentenceId,Phrase
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,34345,34345,34345
positive,42133,42133,42133


In [13]:
df = pd.DataFrame()
df["text"] = data["Phrase"]
df["label"] = data["Sentiment"]
df.head()

Unnamed: 0,text,label
0,A series of escapades demonstrating the adage ...,negative
21,good for the goose,positive
22,good,positive
33,"the gander , some of which occasionally amuses...",negative
46,amuses,positive


## Text Preprocessing

In [14]:
# Lowercase Uppercase
df["text"] = df["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))
# Punctuation
df["text"] = df["text"].str.replace("[^\w\s]", "")
# Numbers
df["text"] = df["text"].str.replace("\d", "")

# Stopwords
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

sw = stopwords.words("english")
df["text"] = df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

# Removing rare elements

rmv = pd.Series(" ".join(df["text"]).split()).value_counts()[-1000:]
df["text"] = df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in rmv))

# Lemmi
from textblob import Word
df["text"] = df["text"].apply(lambda x: " ".join([Word(i).lemmatize() for i in x.split()]))
df["text"].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seneralkan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0     series demonstrating adage good goose also goo...
21                                           good goose
22                                                 good
33    gander occasionally amuses none amount much story
46                                               amuses
Name: text, dtype: object

## Feature Engineering

- Count Vectors
- TF-IDF Vectors (words, characters, n-grams)
- Word Embedding

TF = Each t unit frequency in the document / total text number in the document

IDF = log_e(Total document number / containing t unit for each file )

In [15]:
df.head()

Unnamed: 0,text,label
0,series demonstrating adage good goose also goo...,negative
21,good goose,positive
22,good,positive
33,gander occasionally amuses none amount much story,negative
46,amuses,positive


## Test-Train

In [52]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["text"], df["label"], random_state=42)

In [53]:
train_x

146523    explode obnoxiously screen something bubba hot...
125256    take care cat brings beguiling freshness comin...
38418                                precious little either
130028               girl learns believing something matter
30125                                             also rock
                                ...                        
77749     take really long slow dreary time dope tuck ev...
14192     lrb film rrb work due mostly tongueincheek att...
113404                                 breathtaking mystery
2257                                        directed barely
34668                                                 brisk
Name: text, Length: 57358, dtype: object

In [54]:
## Encoding the label features
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

### Countvectors

In [55]:
vectorizer = CountVectorizer()
vectorizer.fit(train_x)

CountVectorizer()

In [56]:
# Transforming the text data to vector
x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x)

In [57]:
vectorizer.get_feature_names()[0:5]

['aaa', 'aaliyah', 'abagnale', 'abandon', 'abandoned']

In [58]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### TF-IDF

In [59]:
# Word Level
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit(train_x)

TfidfVectorizer()

In [60]:
# Transforming the text data to vector
x_train_tfidf = tf_idf_word_vectorizer.transform(train_x)
x_test_tfidf = tf_idf_word_vectorizer.transform(test_x)

In [61]:
tf_idf_word_vectorizer.get_feature_names()[0:5]

['aaa', 'aaliyah', 'abagnale', 'abandon', 'abandoned']

In [62]:
x_train_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [63]:
# N-gram level tf-idf

tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range= (2,3))
tf_idf_ngram_vectorizer.fit(train_x)

TfidfVectorizer(ngram_range=(2, 3))

In [64]:
# Transforming the text data to vector
x_train_tfidf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
x_test_tfidf_ngram = tf_idf_ngram_vectorizer.transform(test_x)

In [65]:
# Character level tf-idf
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer = "char",ngram_range= (2,3))
tf_idf_chars_vectorizer.fit(train_x)

TfidfVectorizer(analyzer='char', ngram_range=(2, 3))

In [66]:
# Transforming the text data to vector
x_train_tfidf_chars = tf_idf_chars_vectorizer.transform(train_x)
x_test_tfidf_chars = tf_idf_chars_vectorizer.transform(test_x)

# Sentiment Analysis with Machine Learning

## Logistic Regression

In [67]:
log = linear_model.LinearRegression()
log_model = log.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(
    log_model,
    x_test_count,
    test_y,
    cv=10
).mean()

print("Count Vectors Accuracy Score", accuracy)

Count Vectors Accuracy Score -0.07579224448497338


In [68]:
log = linear_model.LinearRegression()
log_model = log.fit(x_train_tfidf, train_y)
accuracy = model_selection.cross_val_score(
    log_model,
    x_test_tfidf,
    test_y,
    cv=10, verbose=2, n_jobs= -1
).mean()

print("Count Vectors Accuracy Score", accuracy)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Count Vectors Accuracy Score 0.1304933861305675


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    1.4s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.6s finished


In [69]:
log = linear_model.LinearRegression()
log_model = log.fit(x_train_tfidf_chars, train_y)
accuracy = model_selection.cross_val_score(
    log_model,
    x_test_tfidf_chars,
    test_y,
    cv=10, verbose=2, n_jobs= -1
).mean()

print("Count Vectors Accuracy Score", accuracy)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   37.4s remaining:  1.5min


Count Vectors Accuracy Score 0.08972328236836549


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   40.1s finished


In [70]:
log = linear_model.LinearRegression()
log_model = log.fit(x_train_tfidf_ngram, train_y)
accuracy = model_selection.cross_val_score(
    log_model,
    x_test_tfidf_ngram,
    test_y,
    cv=10, verbose=2, n_jobs= -1
).mean()

print("Count Vectors Accuracy Score", accuracy)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Count Vectors Accuracy Score 0.3863956404077975


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    2.7s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.8s finished


In [None]:
log_model.predict("yes i like this film")

In [72]:
new_comment = pd.Series("this film is very noce and good i like it")

v = TfidfVectorizer(ngram_range= (2,3))
v.fit(train_x)
new_comment = v.transform(new_comment)

log.predict(new_comment)


array([0.53677441])