In [None]:
import pandas as pd
from pandas import DataFrame
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Load the training set, keeping only the raw text and its sentiment label.
train = pd.read_csv(
    "train1.csv",
    header=0,
    usecols=["text", "airline_sentiment"],
)

# Keep the first raw text around as a sample for interactive inspection.
example1 = train["text"][0]

# Matches every character that is not an ASCII letter; compiled once at
# definition time instead of being looked up on every call.
_NON_LETTERS_RE = re.compile(r"[^a-zA-Z]")

# Lazily-filled cache of the NLTK English stop words (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Reading the corpus and building the set is the expensive part, so
        # it is done on first use only and reused by every later call.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review):
    """Convert one raw text into a space-separated string of meaningful words.

    Processing steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop the words found in NLTK's English stop-word list.
      4. Join the remaining words with single spaces.

    No HTML stripping is performed.

    Args:
        raw_review: The raw text to clean. ``None`` and NaN (what pandas
            yields for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned words joined by single spaces; ``""`` when nothing is
        left (missing input, no letters, or only stop words).
    """
    # NaN is the only value that is not equal to itself, so this catches both
    # missing-value markers without needing a type check.
    if raw_review is None or raw_review != raw_review:
        return ""

    words = _NON_LETTERS_RE.sub(" ", raw_review).lower().split()
    if not words:
        # Nothing to filter, so there is no need to touch the stop-word corpus.
        return ""

    stops = _english_stopwords()
    return " ".join(word for word in words if word not in stops)

##################################################################
# Sanity check: clean the first training text and show the result.
clean_review = review_to_words(train["text"][0])
print(clean_review)

# Number of training texts, taken from the size of the "text" column.
num_reviews = train["text"].size

# Clean every training text in a single pass (one call to review_to_words
# per row), printing a progress message after every 1000th row.
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Cleaning and parsing the training set movie reviews...\n")
clean_train_reviews = []
for i, raw_text in enumerate(train["text"]):
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i + 1, num_reviews))
    clean_train_reviews.append(review_to_words(raw_text))
    
    
######################################################################

print("Creating the bag of words...\n")

# scikit-learn's bag-of-words tool. The texts were already tokenised and
# stripped of stop words by review_to_words, so no tokenizer, preprocessor
# or stop-word list is configured here; max_features limits the vocabulary
# size to 5000 terms.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() learns the vocabulary from the list of cleaned strings and
# turns each string into a vector of word counts; toarray() then converts
# that result into a NumPy array, which is easier to work with.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()


#################################################################################

# Show the words that made it into the vocabulary.
vocab = vectorizer.get_feature_names()
print(vocab)

# Column-wise sum of the count matrix: one total per vocabulary word.
dist = np.sum(train_data_features, axis=0)

# Print each vocabulary word preceded by the number of times it appears
# in the training set.
for tag, count in zip(vocab, dist):
    print("%s %s" % (count, tag))
    
    
################################################################################
# Build a 100-tree random forest and fit it with the bag-of-words counts as
# features and the "airline_sentiment" column as the response variable.
# fit() returns the fitted estimator, so the two steps can be chained.
#
# This may take a few minutes to run.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_data_features,
    train["airline_sentiment"],
)
# Load the test set (all columns are kept).
test = pd.read_csv("test1.csv", header=0)

# Show the (rows, columns) shape of the test set as a quick sanity check.
print(test.shape)

# Clean the test texts one by one, printing a progress message after every
# 1000th row.
num_reviews = test["text"].size
clean_test_reviews = []

print("Cleaning and parsing the test set movie reviews...\n")
for i, raw_text in enumerate(test["text"]):
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i + 1, num_reviews))
    clean_review = review_to_words(raw_text)
    clean_test_reviews.append(clean_review)

# Turn the cleaned test texts into count vectors with the already-fitted
# vectorizer (transform, not fit_transform), then convert to a NumPy array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Predict a sentiment label for every test row with the fitted forest.
result = forest.predict(test_data_features)

# Pair each test row's "_unit_id" with its predicted "airline_sentiment".
output = DataFrame(
    data={
        "_unit_id": test["_unit_id"],
        "airline_sentiment": result,
    }
)

# Write the predictions as CSV without the index column; quoting=3 is the
# numeric value of csv.QUOTE_NONE, i.e. fields are never quoted.
output.to_csv("result.csv", index=False, quoting=3)
print("done final")

virginamerica dhepburn said
Cleaning and parsing the training set movie reviews...

Review 1000 of 11698

Review 2000 of 11698

Review 3000 of 11698

Review 4000 of 11698

Review 5000 of 11698

Review 6000 of 11698

Review 7000 of 11698

Review 8000 of 11698

Review 9000 of 11698

Review 10000 of 11698

Review 11000 of 11698

Creating the bag of words...

44 aa
4 aadvantage
3 aarp
9 abc
3 abcnetwork
4 ability
89 able
4 aboard
6 abq
2 abroad
18 absolute
22 absolutely
11 absurd
5 abt
2 abused
5 abysmal
5 ac
11 accept
16 acceptable
7 accepted
6 accepting
34 access
3 accident
4 accidentally
13 accommodate
2 accommodated
2 accommodating
2 accommodation
5 accommodations
2 accomplished
9 according
3 accordingly
70 account
4 accountability
7 accounts
8 acct
7 accurate
4 acknowledge
11 across
13 act
4 action
6 actions
4 active
3 activities
17 actual
70 actually
4 ad
2 adam
67 add
20 added
17 adding
4 addition
11 additional
33 address
4 addressed
2 addresses
2 addressing
6 adds
2 addtl
6 admiral