In [44]:
import nltk
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
fname_all_anecdotal_sents = "/home/shirish/BTECHSEM2/project/books/stories/new_complete_stories.txt"
fname_all_non_anecdotal_sents = "/home/shirish/BTECHSEM2/project/books/annotated_books/stories_without_anecdotes/new_all_stories.txt2"
fname_all_stories_sents = "/home/shirish/BTECHSEM2/project/books/combined_book/all_stories.txt"
fname_buffet_sents = "/home/shirish/BTECHSEM2/project/books/buffet_en.txt"
imp_cols = ["NNP","VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

In [46]:
def tokenize(fname):
    """
    @param fname = filename
    Returns = list of sentences, where each sentence is a list of POS tagged words
    """
    f = open(fname, "r")
    taggedSents = []
    s = f.read()
    s = s.lower()
    s = unicode(s, errors="ignore")
    sentences = nltk.sent_tokenize(s)
    for i in sentences:
        taggedSents.append(nltk.pos_tag(nltk.word_tokenize(i)))
    return taggedSents

In [47]:
def featureset_df(taggedSents, value, imp_cols):
    """
    This function returns a dataframe consisting of the imp_cols and target column after
    removing the NaN values
    @param taggedSents = list of sentences, where each sentence is a list of POS tagged words
    @param value = Value given to the target column in the dataframe
    Returns = dataframe with columns as imp_cols and target with value @param value
    """
    sents1 = []
    for i in taggedSents:
        l = {}
        for j in i:
            if j[1].isalpha():
                # If Noun
                if j[1][0] == "NNP":# or j[1] == "PRP":
                    l["NNP"] = 1
                else:
                    l[j[1]] = 1
        sents1.append(l)
    df = pd.DataFrame(sents1)
    df.fillna(0, inplace=True)
    features = df[imp_cols]
    features["target"] = value
    return features

In [48]:
def tagSentences():
    """
    This function returns two tagged sentences list
    ie List of non_anecdotal , list of anecdotal sentences
    """
    taggedSents1 = tokenize(
    "/home/shirish/BTECHSEM2/project/books/stories/new_complete_stories.txt")
    taggedSents0 = tokenize(
    "/home/shirish/BTECHSEM2/project/books/annotated_books/stories_without_anecdotes/new_all_stories.txt2")
    return taggedSents0, taggedSents1
    

In [49]:
def create_feature_set_df(taggedSents0, taggedSents1):
    """
    Call this function after calling tagSentences()
    This function returns the features dataframe
    """
    features0 = featureset_df(taggedSents0, 0, imp_cols)
    if(taggedSents1 is not None):
        features1 = featureset_df(taggedSents1, 1, imp_cols)
        features = pd.concat([features1, features0])
    else:
        features = features0
    return features
    

In [50]:
def model_train(model, X_train, y_train):
    model = MultinomialNB()
    model = model.fit(X_train, y_train)
    return model

In [51]:
TS0, TS1 = tagSentences()
df = create_feature_set_df(TS0, TS1)
df = shuffle(df)
X_train, X_test, y_train, y_test = train_test_split(df[imp_cols], df["target"], test_size=0.25)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [52]:

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [53]:
df.columns


Index([u'NNP', u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ', u'target'], dtype='object')

In [58]:
model = Sequential()

In [59]:
model.add(Dense(100, input_dim=7, init='uniform', activation='sigmoid'))
model.add(Dense(50, init='uniform', activation='sigmoid'))
model.add(Dense(40, init='uniform', activation='sigmoid'))
model.add(Dense(20, init='uniform', activation='sigmoid'))
model.add(Dense(8, init='uniform', activation='sigmoid'))
model.add(Dense(1, init='uniform', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  


In [60]:
model.fit(df[imp_cols], df["target"], nb_epoch=100, batch_size=16)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7ff2cb5f8ed0>

In [36]:
df.head(1)

Unnamed: 0,NNP,VB,VBD,VBG,VBN,VBP,VBZ,target
804,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0


In [43]:
df["target"].value_counts()

0    825
1    603
Name: target, dtype: int64