In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
train = pd.read_table("data/train.tsv")
test = pd.read_table("data/test.tsv")
sample = pd.read_csv("data/sampleSubmission.csv")

In [3]:
train.shape

(156060, 4)

In [4]:
train.Sentiment.value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [5]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Dimensionality Reduction
from sklearn.decomposition import TruncatedSVD

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

# Viz
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
import string
import collections
 
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

In [9]:
def process_text(text, stem=True):
    text = text.translate(None, string.punctuation)
    tokens = word_tokenize(text)
    
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
        
    return tokens

In [10]:
def cluster_texts(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.5,
                                 min_df=0.1,
                                 lowercase=True)

In [19]:
trainlen = train.shape[0]
train["Phrase"] = train["Phrase"].astype(str) 
train["Phrase"] = train["Phrase"].astype(str).fillna('missing') # FILL NA
train["Phrase"] = train["Phrase"].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
train["Phrase" + '_num_words'] = train["Phrase"].apply(lambda comment: len(comment.split())) # Count number of Words
train["Phrase" + '_num_unique_words'] = train["Phrase"].apply(lambda comment: len(set(w for w in comment.split())))
train["Phrase" + '_words_vs_unique'] = train["Phrase"+'_num_unique_words'] / train["Phrase"+'_num_words'] * 100 # Count Unique Words

In [20]:
train.sample(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_num_words,Phrase_num_unique_words,Phrase_words_vs_unique
123944,123945,6654,realistic ',3,2,2,100.0
106825,106826,5642,"is unsettling , from the preposterous hairpiec...",0,18,17,94.444444
140782,140783,7637,rarely receive it,2,3,3,100.0
111730,111731,5930,"swooping down on a string of exotic locales ,",3,9,9,100.0
22462,22463,1014,for character and viewer,2,4,4,100.0


In [None]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    dtype = np.float32,
    norm='l2',
    min_df=0,
    smooth_idf=False,
    max_features=15000)
# Fit and Transform
word_vectorizer.fit(train.iloc[0:trainlen,:]["Phrase"])
train_word_features = word_vectorizer.transform(train.iloc[0:trainlen,:]["Phrase"])
test_word_features = word_vectorizer.transform(train.iloc[trainlen:,:]["Phrase"])


In [11]:
# Fill Missing Values with 0
df.fillna(0,inplace=True)
print("Missing Values:\n", df.isnull().sum())


Missing Values:
 SentenceId                 0
Phrase                     0
Phrase_num_words           0
Phrase_num_unique_words    0
Phrase_words_vs_unique     0
dtype: int64


In [12]:
dense_vars = [x for x in df.columns if x not in ["PhraseId","SentenceId","Phrase"]]
X = hstack([csr_matrix(df.iloc[0:trainlen,:][dense_vars].values),train_word_features])
test_df = hstack([csr_matrix(df.iloc[trainlen:,:][dense_vars].values),test_word_features])

In [13]:
print("Train Shape: {} Rows and {} Cols".format(*X.shape))
print("Test Shape: {} Rows and {} Cols".format(*test_df.shape))


Train Shape: 156060 Rows and 14991 Cols
Test Shape: 66292 Rows and 14991 Cols


In [14]:
from sklearn.naive_bayes import MultinomialNB

In [15]:
clf = MultinomialNB()
clf.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
submission = clf.predict(test_df)
submission_df = pd.Series(submission).rename("Sentiment")
submission_df.index = testdex
submission_df.to_csv("MNB.csv",index=True,header=True)
submission_df.head()

PhraseId
156061    3
156062    3
156063    2
156064    3
156065    2
Name: Sentiment, dtype: int64

In [17]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
svd.fit(X) 

# Transform
X = svd.transform(X)
test_df = svd.transform(test_df)

In [18]:
clf.fit(X,y)

# Submit
submission = clf.predict(test_df)
submission_df = pd.Series(submission).rename("Sentiment")
submission_df.index = testdex
submission_df.to_csv("TSVD_n_MNB.csv",index=True,header=True)
submission_df.head()

ValueError: Input X must be non-negative

In [19]:
X

array([[ 7.71099325e+01,  3.81280897e+01,  4.83367161e+00,
        -3.17664584e-02, -1.46640077e-02],
       [ 9.42921697e+01,  9.82009357e+00, -4.54761241e-02,
        -2.31502237e-02, -1.63010640e-02],
       [ 9.97887078e+01, -7.08462260e+00,  1.49518002e-01,
        -1.80322722e-02, -6.65336568e-03],
       ...,
       [ 9.97886988e+01, -7.08467310e+00,  1.49391503e-01,
        -1.30846080e-02, -7.31904242e-03],
       [ 9.96487366e+01, -8.48985978e+00,  2.25871760e-01,
        -1.14857029e-02, -5.56379686e-03],
       [ 9.96487366e+01, -8.48985978e+00,  2.25871760e-01,
        -1.14857029e-02, -5.56379686e-03]])