In [9]:
#Hello T-Pain, welcome to Jupyter! This is a fun and interactive way to execute Python code
#it's a great candidate for exploratory steps with data science projects
#click 'Run' at the top to move through the code blocks
print("Hello World, I'm a little code block!")

Hello World, I'm a little code block!


In [5]:
import pandas as pd

DEFAULT_DATA_PATH='./movie-review-sentiment-analysis-kernels-only/'

#let's make a function that loads in our movie review data for us into a Pandas dataframe
#we'll give it two params; the path of the data, and data set (train/test)
def load_review_data(data_path=DEFAULT_DATA_PATH, data_set='train.tsv'):
    
    data_type = data_set.split('.')[1]
    
    #need to figure out what separator is being used so Pandas can open up the data
    if data_type == 'tsv':
        separator = '\t'
    elif data_type =='csv':
        separator = ','
        
    full_data_path = data_path + data_set
    
    #read in the data and return a Pandas DF
    return pd.read_csv(full_data_path, sep=separator)
   

In [4]:
#let's use the handy-dandy function we just made
train_data = load_review_data(data_set='train.tsv')
test_data = load_review_data(data_set='test.tsv')
submission_sample_data = load_review_data(data_set='sampleSubmission.csv')

In [6]:
#awesome, now that we have some data frames, it's gonna be a lot easier to take a peek into our data!
train_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [7]:
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [8]:
submission_sample_data.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


In [11]:
#Okay, even just doing that,now we have a better sense of what we're working with
#The training data has a bunch of phrases, presumably coming from sentences
#Every phrase is labeled with a phrase ID, a sentence ID, and a sentiment
#The test data is the same but lacks the sentiment label (that's our job!)
#It looks like sentiment is some sort of integer value, but let's try to see what values we're working with
train_data['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [14]:
#Okay, so values range from 0-4..but what which direction is more positive?
#Probably four is, but it's simple enough to confirm

filter_zero = train_data['Sentiment'] == 0
filter_four = train_data['Sentiment'] == 4

zero_reviews = train_data.where(filter_zero).dropna(thresh=4)
four_reviews = train_data.where(filter_four).dropna(thresh=4)

zero_reviews.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
101,102.0,3.0,would have a hard time sitting through this one,0.0
103,104.0,3.0,have a hard time sitting through this one,0.0
157,158.0,5.0,Aggressive self-glorification and a manipulati...,0.0
159,160.0,5.0,self-glorification and a manipulative whitewash,0.0
201,202.0,7.0,Trouble Every Day is a plodding mess .,0.0


In [15]:
four_reviews.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
63,64.0,2.0,"This quiet , introspective and entertaining in...",4.0
66,67.0,2.0,"quiet , introspective and entertaining indepen...",4.0
74,75.0,2.0,entertaining,4.0
77,78.0,2.0,is worth seeking,4.0
117,118.0,4.0,A positively thrilling combination of ethnogra...,4.0


In [17]:
#'plodding mess', 'manipulative whitewash'? 0 is def most negative, and 4 is def most positive
#now that we have a sense of what we're working with, let's set up some baseline components
import numpy as np

#a function that expands the data frame with a sentiment column
#to start, let's just label randomly, but make the probability of selection equal to the percent of scores in the training set
def label_data(data_set,mode='train'):
    
    labeled_data_set = data_set.copy()
    if 'Sentiment' in labeled_data_set.columns:
        labeled_data_set.drop('Sentiment',axis=1)    
    
    for index, review in data_set.iterrows():        
        review['Sentiment'] = select_label(review,mode=mode)
        
    return data_set

#this function chooses a label for a given row
def select_label(review,mode='train'):
    
    sentiment_options = np.asarray([0,1,2,3,4])
    
    p = np.asarray([0.2,0.2,0.2,0.2,0.2])
    
    if mode=='train':
        max_p_index = p.argmax()
        if np.random.rand()<0.95:
            return max_p_index
        else:
            explore_sentiment_options = np.delete(sentiment_options,max_p_index)
            return np.random.choice(explore_sentiment_options)
        
    elif mode=='test':
        return p.argmax()
    


In [18]:
labeled_data = label_data(train_data,mode='train')