In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import spacy
from nltk.corpus import movie_reviews, stopwords
from collections import Counter

In [161]:
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
movie_reviews.fileids('neg')[0:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [3]:
movie_reviews.fileids('pos')[0:5]

['pos/cv000_29590.txt',
 'pos/cv001_18431.txt',
 'pos/cv002_15918.txt',
 'pos/cv003_11664.txt',
 'pos/cv004_11636.txt']

In [8]:
# pick 2 files, one from positive review and one from negative review
neg_rev_list = []
for i in movie_reviews.fileids('neg')[0:5]:
    #print(i)
    neg_rev_list.append(movie_reviews.raw(i))
all_neg_revs = ' '.join(neg_rev_list)

pos_rev_list = []
for i in movie_reviews.fileids('pos')[0:5]:
    #print(i)
    pos_rev_list.append(movie_reviews.raw(i))
all_pos_revs = ' '.join(pos_rev_list)

In [10]:
len(all_neg_revs)

15612

In [11]:
len(all_pos_revs)

20738

In [12]:
all_neg_revs[0:500]

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt'

In [13]:
all_pos_revs[0:500]

"films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . \nto say moore and campbell thoroughly researched the subject"

In [17]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    
    text = ' '.join(text.split())
    text = re.sub(r' . . . ','. ',text)
    return text

In [18]:
neg_rev_clean = text_cleaner(all_neg_revs)
neg_rev_all_clean = neg_rev_clean.replace("\\",'')

pos_rev_clean = text_cleaner(all_pos_revs)
pos_rev_all_clean = pos_rev_clean.replace("\\",'')

In [19]:
all_neg_revs[0:500]

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt'

In [20]:
print(neg_rev_all_clean[0:500])

plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what's the deal ? watch the movie and " sorta " find out. critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break t


In [21]:
all_pos_revs[0:500]

"films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . \nto say moore and campbell thoroughly researched the subject"

In [22]:
print(pos_rev_all_clean[0:500])

films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . to say moore and campbell thoroughly researched the subject o


In [23]:
# Parse using SpaCy
nlp = spacy.load('en')
neg_rev_doc = nlp(neg_rev_all_clean)
pos_rev_doc = nlp(pos_rev_all_clean)


In [24]:
#Group into sentences
neg_sents = [ [sent,'Negative'] for sent in neg_rev_doc.sents]
pos_sents = [[sent, 'Positive'] for sent in pos_rev_doc.sents]


sentences_df = pd.DataFrame(neg_sents + pos_sents)
sentences_df.head()

Unnamed: 0,0,1
0,"(plot, :, two, teen, couples, go, to, a, churc...",Negative
1,"(they, get, into, an, accident, .)",Negative
2,"(one, of, the, guys, dies, ,, but, his, girlfr...",Negative
3,"(what, 's, the, deal, ?)",Negative
4,"(watch, the, movie, and, "", sorta, "")",Negative


In [25]:
sentences_df.shape

(295, 2)

# Bag of Words Features

In [37]:
# Create bag of words function for each text
def bag_of_words(text, most_common_count):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    print('allwords count', len(allwords))
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(most_common_count)]



In [46]:
# Get bags 
neg_words = bag_of_words(neg_rev_doc, 500)

pos_words = bag_of_words(pos_rev_doc, 500)

# Combine bags to create common set of unique words
common_words = set(neg_words + pos_words)

allwords count 2716
allwords count 3511


In [39]:
len(neg_words)

500

In [40]:
len(pos_words)

500

In [198]:
# Create bag of words data frame using combined common words and sentences
def bow_features(sentences, common_words):
    
    # Build data frame
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentences in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentences
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
    
    return df

In [48]:
# Create bow features 
reviews = bow_features(sentences_df, common_words)
reviews.head()

Unnamed: 0,craziness,dig,really,20,reformed,against,dragon,either,chan,type,...,opening,student,base,disappearance,i,call,thing,adequate,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(plot, :, two, teen, couples, go, to, a, churc...",Negative
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(they, get, into, an, accident, .)",Negative
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(one, of, the, guys, dies, ,, but, his, girlfr...",Negative
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(what, 's, the, deal, ?)",Negative
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(watch, the, movie, and, "", sorta, "")",Negative


In [49]:
reviews.shape

(295, 827)

# TF-IDF Features

In [103]:
neg_sents_list = []
pos_sents_list = []
all_sents_list = []
for i in movie_reviews.fileids('neg')[0:5]:
    
    neg_rev_sents = movie_reviews.sents(i)
    neg_sents_list.append([ " ".join(sent) for sent in neg_rev_sents]    )

for i in movie_reviews.fileids('pos')[0:5]:
    
    pos_rev_sents = movie_reviews.sents(i)
    pos_sents_list.append([ " ".join(sent) for sent in pos_rev_sents]    )

neg_pos_sent = neg_sents_list + pos_sents_list

for sublist in neg_pos_sent:
    for item in sublist:
        all_sents_list.append(item)  




In [120]:
len(all_sents_list)

305

In [105]:
from sklearn.model_selection import train_test_split
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, 
                             min_df=2, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )

all_sents_tfidf = vectorizer.fit_transform(all_sents_list)

In [109]:
all_sents_tfidf.get_shape()

(305, 538)

In [116]:
all_sents_tfidf

<305x538 sparse matrix of type '<class 'numpy.float64'>'
	with 1722 stored elements in Compressed Sparse Row format>

In [119]:
neg_cnt = 0
for sublist in neg_sents_list:
    for item in sublist:
        neg_cnt += 1
print(neg_cnt)   
pos_cnt = 0
for sublist in pos_sents_list:
    for item in sublist:
        pos_cnt += 1
print(pos_cnt) 

144
161


In [121]:
from sklearn.model_selection import cross_val_score

# Specify model inputs for each feature set

# BoW
X_bow = reviews.drop(['text_sentence', 'text_source'], 1)
Y_bow = reviews['text_source']

# Tfidf
X_tfidf = all_sents_tfidf
Y_tfidf = ['Negative'] * neg_cnt + ['Positive'] * pos_cnt

In [122]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf,y_train_tfidf, y_test_tfidf= train_test_split(X_tfidf,Y_tfidf, test_size=0.2, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()
#number of sentences
n = X_train_tfidf_csr.shape[0]

In [123]:
X_train_bow, X_test_bow,y_train_bow, y_test_bow= train_test_split(X_bow,Y_bow, test_size=0.2, random_state=0)

In [124]:
print(X_train_tfidf.shape)

(244, 538)


In [125]:
print(X_test_tfidf.shape)

(61, 538)


# Supervised Learning Models

In [126]:
from sklearn.linear_model import LogisticRegression
print('Logistic Regression')
# BoW
lr = LogisticRegression()
lr_bow = lr.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(lr_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(lr_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(lr_bow, X_test_bow, y_test_bow, cv=5)))

# Tfidf
lr = LogisticRegression()
lr_tfidf = lr.fit(X_train_tfidf, y_train_tfidf)
print('\nTfidf : \n', cross_val_score(lr_tfidf, X_train_tfidf, y_train_tfidf, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(lr_tfidf, X_train_tfidf, y_train_tfidf, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(lr_tfidf, X_test_tfidf, y_test_tfidf, cv=5)))

Logistic Regression
BoW : 
  [ 0.6875      0.59574468  0.82978723  0.72340426  0.68085106]
Training Data Avg Score: 0.703457446809
Test Data Avg Score: 0.628904428904

Tfidf : 
 [ 0.82        0.68        0.70833333  0.60416667  0.6875    ]
Training Data Avg Score: 0.7
Test Data Avg Score: 0.618065268065


In [127]:
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()

print('Random Forest Classifier')
# BoW

rfc_bow = rfc.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(rfc_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(rfc_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(rfc_bow, X_test_bow, y_test_bow, cv=5)))

# Tfidf

rfc_tfidf = rfc.fit(X_train_tfidf, y_train_tfidf)
print('\nTfidf : \n', cross_val_score(rfc_tfidf, X_train_tfidf, y_train_tfidf, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(rfc_tfidf, X_train_tfidf, y_train_tfidf, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(rfc_tfidf, X_test_tfidf, y_test_tfidf, cv=5)))

Random Forest Classifier
BoW : 
  [ 0.54166667  0.68085106  0.57446809  0.59574468  0.68085106]
Training Data Avg Score: 0.703280141844
Test Data Avg Score: 0.669813519814

Tfidf : 
 [ 0.66        0.72        0.625       0.58333333  0.70833333]
Training Data Avg Score: 0.7005
Test Data Avg Score: 0.570396270396


In [128]:
gbc = ensemble.GradientBoostingClassifier()
print('Gradient Boosting Classifier')
# BoW

gbc_bow = gbc.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(gbc_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(gbc_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(gbc_bow, X_test_bow, y_test_bow, cv=5)))

# Tfidf

gbc_tfidf = gbc.fit(X_train_tfidf, y_train_tfidf)
print('\nTfidf : \n', cross_val_score(gbc_tfidf, X_train_tfidf, y_train_tfidf, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(gbc_tfidf, X_train_tfidf, y_train_tfidf, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(gbc_tfidf, X_test_tfidf, y_test_tfidf, cv=5)))

Gradient Boosting Classifier
BoW : 
  [ 0.70833333  0.68085106  0.80851064  0.65957447  0.68085106]
Training Data Avg Score: 0.711879432624
Test Data Avg Score: 0.595337995338

Tfidf : 
 [ 0.76        0.62        0.625       0.64583333  0.70833333]
Training Data Avg Score: 0.659333333333
Test Data Avg Score: 0.52296037296


# try to increase accuracy

In [199]:
# Get bags 
neg_words = bag_of_words(neg_rev_doc, 2500)

pos_words = bag_of_words(pos_rev_doc, 2500)

# Combine bags to create common set of unique words
common_words = set(neg_words + pos_words)

allwords count 2716
allwords count 3511


In [200]:
# Create bow features 
reviews = bow_features(sentences_df, common_words)
reviews.head()

Unnamed: 0,dig,against,indiglo,ark,ditzy,infamous,recycle,academy,bar,duddy,...,category,balki,center,000,spending,modern,call,thing,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(plot, :, two, teen, couples, go, to, a, churc...",Negative
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(they, get, into, an, accident, .)",Negative
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(one, of, the, guys, dies, ,, but, his, girlfr...",Negative
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(what, 's, the, deal, ?)",Negative
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(watch, the, movie, and, "", sorta, "")",Negative


In [201]:
def entityty_types(df):
    
    person_ent_type = []
    qty_ent_type = []
    ordinal_ent_type = []
    time_ent_type = []
    org_ent_type = []
    lang_ent_type = []
    date_ent_type = []
    card_ent_type = []
    gpe_ent_type = []
    fac_ent_type = []
    for i, sentence in enumerate(df['text_sentence']):
        person_count = 0
        qty_count= 0
        ordinal_count = 0
        time_count = 0
        org_count = 0
        lang_count = 0
        date_count= 0
        cardinal_count =0 
        gpe_count= 0
        fac_count = 0
    
        for token in sentence:
            if token.ent_type_ == 'PERSON':
                person_count += 1
        
            if token.ent_type_ == 'QUANTITY':
                qty_count += 1
            
            if token.ent_type_ == 'ORDINAL':
                ordinal_count += 1
            
            if token.ent_type_ == 'TIME':
                time_count += 1
            
            if token.ent_type_ == 'ORG':
                org_count += 1
            
            if token.ent_type_ == 'LANGUAGE':
                lang_count += 1
            if token.ent_type_ == 'DATE':
                date_count += 1            
        
            if token.ent_type_ == 'CARDINAL':
                cardinal_count += 1            
            if token.ent_type_ == 'GPE':
                gpe_count += 1            
            if token.ent_type_ == 'FAC':
                fac_count += 1            
            
        person_ent_type.append(person_count)
        qty_ent_type.append(qty_count)
        ordinal_ent_type.append(ordinal_count)
        time_ent_type.append(time_count)
        org_ent_type.append(org_count)
        lang_ent_type.append(lang_count)
        date_ent_type.append(date_count)
        card_ent_type.append(cardinal_count)
        gpe_ent_type.append(gpe_count)
        fac_ent_type.append(fac_count)

          
    df['person_ent'] = person_ent_type
    df['qty_ent'] = qty_ent_type
    df['ordinal_ent'] = ordinal_ent_type
    df['time_ent'] = time_ent_type
    df['org_ent'] = org_ent_type
    df['lang_ent'] = lang_ent_type
    df['date_ent'] = date_ent_type
    df['card_ent'] = card_ent_type
    df['gpe_ent'] = gpe_ent_type
    df['fac_ent'] = fac_ent_type
    return(df)

In [202]:
reviews = entityty_types(reviews)

In [208]:
reviews.head()

Unnamed: 0,dig,against,indiglo,ark,ditzy,infamous,recycle,academy,bar,duddy,...,person_ent,qty_ent,ordinal_ent,time_ent,org_ent,lang_ent,date_ent,card_ent,gpe_ent,fac_ent
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [204]:
X_bow = reviews.drop(['text_sentence', 'text_source'], 1)
Y_bow = reviews['text_source']
X_train_bow, X_test_bow,y_train_bow, y_test_bow= train_test_split(X_bow,Y_bow, test_size=0.2, random_state=0)

In [205]:
from sklearn.linear_model import LogisticRegression
print('Logistic Regression')
# BoW
lr = LogisticRegression()
lr_bow = lr.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(lr_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(lr_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(lr_bow, X_test_bow, y_test_bow, cv=5)))



Logistic Regression
BoW : 
  [ 0.66666667  0.61702128  0.85106383  0.70212766  0.70212766]
Training Data Avg Score: 0.70780141844
Test Data Avg Score: 0.697086247086


In [206]:
parameters =[ {'C': [0.01, 0.1, 1, 10, 100],'solver':['liblinear'],'penalty':['l1', 'l2'],'fit_intercept':[True]},
            {'C': [0.01, 0.1, 1, 10, 100],'solver':['lbfgs','newton-cg'],'fit_intercept':[True]}
            ]

gr_logr = GridSearchCV(lr,param_grid = parameters )
gr_logr.fit(X_train_bow, y_train_bow)
print('Best Parameter ', gr_logr.best_params_)

Best Parameter  {'C': 10, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'liblinear'}


In [207]:
lr = LogisticRegression(**gr_logr.best_params_, random_state = 10)
lr.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(lr_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(lr_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(lr_bow, X_test_bow, y_test_bow, cv=5)))

BoW : 
  [ 0.66666667  0.61702128  0.85106383  0.70212766  0.70212766]
Training Data Avg Score: 0.70780141844
Test Data Avg Score: 0.697086247086


# Though the training data score remained same, the test data score increased from 63 % to 70% .

In [215]:
print('Random Forest Classifier')
# BoW

rfc_bow = rfc.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(rfc_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(rfc_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(rfc_bow, X_test_bow, y_test_bow, cv=5)))


Random Forest Classifier
BoW : 
  [ 0.64583333  0.59574468  0.59574468  0.55319149  0.65957447]
Training Data Avg Score: 0.635372340426
Test Data Avg Score: 0.627389277389


In [229]:
rfc_params  = {
    'n_estimators':[100,200,500],
    'max_features':['auto', 'sqrt', 'log2'],
    'max_depth':[4, 6,7, 8, None],
    'min_samples_split':[2, 8]
}
rfc_grid = GridSearchCV(ensemble.RandomForestClassifier(random_state=10), param_grid=rfc_params)
rfc_grid.fit(X_train_bow, y_train_bow)

print(rfc_grid.best_score_)
print(rfc_grid.best_params_)

0.690677966102
{'max_depth': None, 'max_features': 'log2', 'min_samples_split': 8, 'n_estimators': 200}


In [233]:
print('Random Forest Classifier')
# BoW
rfc = ensemble.RandomForestClassifier(**rfc_grid.best_params_)
rfc_bow = rfc.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(rfc_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(rfc_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(rfc_bow, X_test_bow, y_test_bow, cv=5)))


Random Forest Classifier
BoW : 
  [ 0.60416667  0.68085106  0.68085106  0.74468085  0.63829787]
Training Data Avg Score: 0.712056737589
Test Data Avg Score: 0.697086247086


here we got 3% increase in the test data score.

In [241]:
print('Random Forest Classifier')
# BoW
rfc = ensemble.RandomForestClassifier(**rfc_grid.best_params_)
rfc_bow = rfc.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(rfc_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(rfc_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(rfc_bow, X_test_bow, y_test_bow, cv=5)))


Random Forest Classifier
BoW : 
  [ 0.64583333  0.65957447  0.70212766  0.72340426  0.72340426]
Training Data Avg Score: 0.678102836879
Test Data Avg Score: 0.647086247086


In [242]:
gbc = ensemble.GradientBoostingClassifier()
print('Gradient Boosting Classifier')
# BoW

gbc_bow = gbc.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(gbc_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(gbc_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(gbc_bow, X_test_bow, y_test_bow, cv=5)))


Gradient Boosting Classifier
BoW : 
  [ 0.66666667  0.70212766  0.80851064  0.70212766  0.59574468]
Training Data Avg Score: 0.690780141844
Test Data Avg Score: 0.612004662005


In [254]:

param_test1 = {'n_estimators':[100,200,300,500],
               'learning_rate': [0.03, 0.1, 0.3],
               'max_depth':[2,4,None],               
               'loss': ['deviance'],
               'subsample':[0.8,0.5, 1]              
              }
 

gsearch1 = GridSearchCV(
    estimator = ensemble.GradientBoostingClassifier(random_state=10), 
    param_grid = param_test1)
gsearch1.fit(X_train_bow, y_train_bow)

print('\nBest parameters {} '.format(gsearch1.best_params_))



Best parameters {'learning_rate': 0.3, 'loss': 'deviance', 'max_depth': 4, 'n_estimators': 300, 'subsample': 1} 


In [255]:
gbc = ensemble.GradientBoostingClassifier(**gsearch1.best_params_)
print('Gradient Boosting Classifier')
# BoW

gbc_bow = gbc.fit(X_train_bow, y_train_bow)
print('BoW : \n ', cross_val_score(gbc_bow, X_train_bow, y_train_bow, cv=5))
print('Training Data Avg Score:', np.mean(cross_val_score(gbc_bow, X_train_bow, y_train_bow, cv=5)))
print('Test Data Avg Score:', np.mean(cross_val_score(gbc_bow, X_test_bow, y_test_bow, cv=5)))

Gradient Boosting Classifier
BoW : 
  [ 0.64583333  0.68085106  0.80851064  0.68085106  0.65957447]
Training Data Avg Score: 0.703723404255
Test Data Avg Score: 0.626107226107


We see 4% improvement here.

# Conclusion

The bag of words model performed better than the tfidf model.
For bag of words model, when using Logistic Regression, the training data score remained same but the test data score increased from 63 % to 70% . So here we saw a 7%  increase in the accuracy, where as for Gradient Boosting Classifier, we saw 4% improvement and for Random Forest Classifier, we got 3% increase in the test data score.

