In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
train = pd.read_table("data/train.tsv")
test = pd.read_table("data/test.tsv")
sample = pd.read_csv("data/sampleSubmission.csv")

In [3]:
train.shape

(156060, 4)

In [4]:
train.Sentiment.value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [5]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Dimensionality Reduction
from sklearn.decomposition import TruncatedSVD

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

# Viz
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
df = pd.read_csv("data/train.tsv", sep="\t", index_col = ["PhraseId"])
trainlen = df.shape[0]
test_df = pd.read_csv("data/test.tsv", sep="\t", index_col = ["PhraseId"])
testdex = test_df.index
print("Train Shape: ",df.shape)
print("Test Shape: ",test_df.shape)

y = df.Sentiment.copy()
df = pd.concat([df.drop("Sentiment",axis=1),test_df], axis=0)
print("All Data Shape: {} Rows, {} Columns".format(*df.shape))
del test_df

Train Shape:  (156060, 3)
Test Shape:  (66292, 2)
All Data Shape: 222352 Rows, 2 Columns


In [7]:
print("Dataset Glimpse")
display(df.sample(5))
print("Percent Representation by Sentiment Level")
y.value_counts(normalize=True)*100

Dataset Glimpse


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
115045,6128,of his narrative
38152,1814,Saddled with an unwieldy cast of characters an...
209521,11142,"that has to do with Yvan 's rambunctious , Jew..."
158651,8635,Guard
136219,7364,easy to be bored by as your ABC 's


Percent Representation by Sentiment Level


2    50.994489
3    21.098936
1    17.475971
4     5.899013
0     4.531590
Name: Sentiment, dtype: float64

In [8]:
df["Phrase"] = df["Phrase"].astype(str) 
df["Phrase"] = df["Phrase"].astype(str).fillna('missing') # FILL NA
df["Phrase"] = df["Phrase"].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
df["Phrase" + '_num_words'] = df["Phrase"].apply(lambda comment: len(comment.split())) # Count number of Words
df["Phrase" + '_num_unique_words'] = df["Phrase"].apply(lambda comment: len(set(w for w in comment.split())))
df["Phrase" + '_words_vs_unique'] = df["Phrase"+'_num_unique_words'] / df["Phrase"+'_num_words'] * 100 # Count Unique Words

In [9]:
df.sample(5)

Unnamed: 0_level_0,SentenceId,Phrase,Phrase_num_words,Phrase_num_unique_words,Phrase_words_vs_unique
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
202743,10779,even those viewers who have little patience fo...,10,10,100.0
213493,11370,the get-go,2,2,100.0
54538,2711,leading a double life in an american film only...,15,15,100.0
28724,1331,a semi-autobiographical film,3,3,100.0
164680,8890,dainty psychological terror,3,3,100.0


In [10]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    dtype = np.float32,
    norm='l2',
    min_df=0,
    smooth_idf=False,
    max_features=15000)
# Fit and Transform
word_vectorizer.fit(df.iloc[0:trainlen,:]["Phrase"])
train_word_features = word_vectorizer.transform(df.iloc[0:trainlen,:]["Phrase"])
test_word_features = word_vectorizer.transform(df.iloc[trainlen:,:]["Phrase"])


In [11]:
# Fill Missing Values with 0
df.fillna(0,inplace=True)
print("Missing Values:\n", df.isnull().sum())


Missing Values:
 SentenceId                 0
Phrase                     0
Phrase_num_words           0
Phrase_num_unique_words    0
Phrase_words_vs_unique     0
dtype: int64


In [12]:
dense_vars = [x for x in df.columns if x not in ["PhraseId","SentenceId","Phrase"]]
X = hstack([csr_matrix(df.iloc[0:trainlen,:][dense_vars].values),train_word_features])
test_df = hstack([csr_matrix(df.iloc[trainlen:,:][dense_vars].values),test_word_features])

In [13]:
print("Train Shape: {} Rows and {} Cols".format(*X.shape))
print("Test Shape: {} Rows and {} Cols".format(*test_df.shape))


Train Shape: 156060 Rows and 14991 Cols
Test Shape: 66292 Rows and 14991 Cols


In [14]:
from sklearn.naive_bayes import MultinomialNB

In [15]:
clf = MultinomialNB()
clf.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
submission = clf.predict(test_df)
submission_df = pd.Series(submission).rename("Sentiment")
submission_df.index = testdex
submission_df.to_csv("MNB.csv",index=True,header=True)
submission_df.head()

PhraseId
156061    3
156062    3
156063    2
156064    3
156065    2
Name: Sentiment, dtype: int64

In [17]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
svd.fit(X) 

# Transform
X = svd.transform(X)
test_df = svd.transform(test_df)

In [18]:
clf.fit(X,y)

# Submit
submission = clf.predict(test_df)
submission_df = pd.Series(submission).rename("Sentiment")
submission_df.index = testdex
submission_df.to_csv("TSVD_n_MNB.csv",index=True,header=True)
submission_df.head()

ValueError: Input X must be non-negative

In [19]:
X

array([[ 7.71099325e+01,  3.81280897e+01,  4.83367161e+00,
        -3.17664584e-02, -1.46640077e-02],
       [ 9.42921697e+01,  9.82009357e+00, -4.54761241e-02,
        -2.31502237e-02, -1.63010640e-02],
       [ 9.97887078e+01, -7.08462260e+00,  1.49518002e-01,
        -1.80322722e-02, -6.65336568e-03],
       ...,
       [ 9.97886988e+01, -7.08467310e+00,  1.49391503e-01,
        -1.30846080e-02, -7.31904242e-03],
       [ 9.96487366e+01, -8.48985978e+00,  2.25871760e-01,
        -1.14857029e-02, -5.56379686e-03],
       [ 9.96487366e+01, -8.48985978e+00,  2.25871760e-01,
        -1.14857029e-02, -5.56379686e-03]])