In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from os.path import join
from bs4 import BeautifulSoup

In [3]:
# Both files are tab-separated with a header row.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as literal text.
dfTrain = pd.read_csv(
    'labeledTrainData.tsv',
    sep="\t",
    header=0,
    quoting=3,
)

dfTest = pd.read_csv(
    'testData.tsv',
    sep="\t",
    header=0,
    quoting=3,
)

In [4]:
# Preview the first rows of the labelled training frame.
dfTrain.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
# Preview the first rows of the test frame.
dfTest.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [6]:
# Show one raw review (row label 11) to see what needs cleaning.
dfTrain.loc[11, 'review']

'"Although I generally do not like remakes believing that remakes are waste of time; this film is an exception. I didn\'t actually know so far until reading the previous comment that this was a remake, so my opinion is purely about the actual film and not a comparison.<br /><br />The story and the way it is written is no question: it is Capote. There is no need for more words.<br /><br />The play of Anthony Edwards and Eric Roberts is superb. I have seen some movies with them, each in one or the other. I was certain that they are good actors and in case of Eric I always wondered why his sister is the number 1 famous star and not her brother. This time this certainty is raised to fact, no question. His play, just as well as the play of Mr. Edwards is clearly the top of all their profession.<br /><br />I recommend this film to be on your top 50 films to see and keep on your DVD shelves."'

In [7]:
# Training labels, taken from the 'sentiment' column.
target=dfTrain['sentiment']

In [8]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_wordlist(review, remove_stopwords=False, split=False):
    """
    Clean one raw review and return its lower-cased words.

    Steps, in order:
      1. parse the review with BeautifulSoup (lxml parser) and keep only the text,
      2. replace every character outside a-z / A-Z with a space
         (digits and apostrophes are dropped, so "didn't" becomes "didn t"),
      3. lower-case and split on whitespace,
      4. optionally drop NLTK English stopwords.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.
    remove_stopwords : bool, default False
        If True, remove words found in NLTK's English stopword list.
    split : bool, default False
        If True, return the list of words; otherwise return them joined
        by single spaces.

    Returns
    -------
    list of str or str
        The cleaned words, as a list when ``split`` is True, else as one string.
    """
    review_text = BeautifulSoup(review, 'lxml').get_text()

    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    words = review_text.lower().split()

    if remove_stopwords:
        # The stopword set is cached: this function is mapped over every review,
        # and rebuilding the set from the corpus per call is wasted work.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    if split:
        return words
    return ' '.join(words)

In [9]:
# Run the cleaner on the same sample review (row label 11) to check its output.
review_to_wordlist(dfTrain.loc[11, 'review'])

'although i generally do not like remakes believing that remakes are waste of time this film is an exception i didn t actually know so far until reading the previous comment that this was a remake so my opinion is purely about the actual film and not a comparison the story and the way it is written is no question it is capote there is no need for more words the play of anthony edwards and eric roberts is superb i have seen some movies with them each in one or the other i was certain that they are good actors and in case of eric i always wondered why his sister is the number famous star and not her brother this time this certainty is raised to fact no question his play just as well as the play of mr edwards is clearly the top of all their profession i recommend this film to be on your top films to see and keep on your dvd shelves'

In [10]:
# Clean the 'review' column of both frames with the same function,
# overwriting the raw text in place.
for frame in (dfTrain, dfTest):
    frame['review'] = frame['review'].map(review_to_wordlist)

# Number of training rows; used later to split the stacked feature matrix.
train_len = len(dfTrain)

In [11]:
# Display the number of training rows.
train_len

25000

In [12]:
# Stack the reviews as one list: training reviews first, then test reviews.
corpus = dfTrain['review'].tolist() + dfTest['review'].tolist()

In [13]:
# Unigram + bigram TF-IDF with sublinear term frequency and English stopwords
# removed; a term must appear in at least 3 documents to enter the vocabulary.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# Learn vocabulary and IDF weights from every review in `corpus`.
tfv.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
# Vectorize the whole corpus with the fitted vectorizer; one row per review.
X_all = tfv.transform(corpus)


In [15]:
# The first train_len rows of X_all are the training reviews; the rest are test.
train, test = X_all[:train_len], X_all[train_len:]

In [17]:
# Compare inverse-regularisation strengths C by mean 5-fold cross-validated ROC AUC.
Cs = [1, 3, 10, 30, 100, 300]
for c in Cs:
    clf = LogisticRegression(
        penalty='l2',
        dual=True,
        # The dual formulation is only implemented by liblinear. Name the solver
        # explicitly: scikit-learn releases whose default solver is lbfgs
        # reject dual=True.
        solver='liblinear',
        tol=0.0001,
        C=c,
        fit_intercept=True,
        intercept_scaling=1.0,
        class_weight=None,
        # liblinear shuffles the data internally; a fixed seed makes the
        # scores repeatable across runs.
        random_state=0,
    )

    fold_scores = cross_val_score(clf, train, target, cv=5, scoring='roc_auc')
    print("c:", c, "   score:", np.mean(fold_scores))

c: 1    score: 0.956979296
c: 3    score: 0.961366144
c: 10    score: 0.96299312
c: 30    score: 0.963238848
c: 100    score: 0.963157504
c: 300    score: 0.963005408


In [18]:
# Final model: L2-regularised logistic regression with C=30, fitted on all
# training rows.
clf = LogisticRegression(
    penalty='l2',
    dual=True,
    # The dual formulation is only implemented by liblinear. Name the solver
    # explicitly: scikit-learn releases whose default solver is lbfgs
    # reject dual=True.
    solver='liblinear',
    tol=0.0001,
    C=30,
    fit_intercept=True,
    intercept_scaling=1.0,
    class_weight=None,
    # liblinear shuffles the data internally; a fixed seed makes the fit repeatable.
    random_state=0,
)

clf.fit(train, target)

LogisticRegression(C=30, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1.0, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
# predict_proba returns one column per class; column 1 belongs to clf.classes_[1]
# (presumably sentiment label 1, given the 0/1 labels seen in the training preview).
proba = clf.predict_proba(test)
preds = proba[:, 1]

# Pair each test id with its predicted probability.
dfOut = pd.DataFrame({"id": dfTest["id"], "sentiment": preds})

In [20]:
# Display the predicted probabilities (NumPy abbreviates long arrays).
preds

array([ 0.99538918,  0.00872768,  0.47967866, ...,  0.38155696,
        0.97988388,  0.70754227])

In [21]:
# Preview the first rows of the output frame (id, predicted probability).
dfOut.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",0.995389
1,"""8348_2""",0.008728
2,"""5828_4""",0.479679
3,"""7186_2""",0.766476
4,"""12128_7""",0.964428
