In [2]:
import re
import string

import nltk
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams

# NOTE(review): the cells below also use scikit-learn names (TransformerMixin,
# Pipeline, CountVectorizer, TfidfTransformer, MultinomialNB, preprocessing,
# train_test_split, classification_report) that are not imported in the
# visible cells - confirm they are imported somewhere before running.

In [None]:
# Placeholder document; replace with the text to analyse.
text = ""

# Search, findall, finditer

In [None]:
# Three ways to look for the same pattern. Each call rebinds `match`,
# so only the last result (the finditer iterator) is kept.
pattern = "someword"
match = re.search(pattern, text)    # first occurrence as a Match object, or None
match = re.findall(pattern, text)   # list with every matching substring
match = re.finditer(pattern, text)  # iterator over Match objects

# The following regex finds all words that start with a capital letter and continue with lowercase letters

In [None]:
# re.MULTILINE is not needed for multi-line text here: that flag only changes
# how ^ and $ behave, and this pattern has no anchors.
# The pattern has no word boundaries either, so it also matches inside longer
# words (e.g. "McDonald" gives "Mc" and "Donald").
capitalized = re.compile("[A-Z][a-z]+")
capitalized.findall(text)

# Tokenizers

In [None]:
# A tokenizer can be tailored to the text with a custom regular expression.
# Raw string: "\w" inside a normal string literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r'[A-Z]\w+')
# NLTK also ships predefined tokenizers:
# BlanklineTokenizer  - Tokenize a string using blank lines as delimiter.
# WordPunctTokenizer  - Tokenize a string into alphabetic and non-alphabetic characters.
# WhitespaceTokenizer - Tokenize a string using spaces, tabs and newlines as delimiters.
from nltk.tokenize import BlanklineTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import WhitespaceTokenizer
# The tokenizer must be instantiated: calling tokenize() on the class itself
# fails because no instance is bound to `self`.
tokenizer = BlanklineTokenizer()
# Then we use it to split our text
tokens = tokenizer.tokenize(text)


# Stemmer

In [None]:
# ignore_stopwords=True makes the stemmer consult NLTK's stop-word corpus,
# so the corpus has to be available locally.
nltk.download('stopwords')
stemmer = SnowballStemmer("english", ignore_stopwords=True)
# `words` is the tokenized text (it is reused by the n-gram cell); it was never
# defined before, so take it from the tokenizer output.
words = tokens
# One stem per token, as a flat list.
stems = [stemmer.stem(word) for word in words]

# Ngrams

In [None]:
# After tokenization we can build n-grams (`words` is the list of tokens/words).
# Only the last expression of a cell is displayed, i.e. the trigrams.
unigrams = list(ngrams(words, 1))
bigrams = list(ngrams(words, 2))
trigrams = list(ngrams(words, 3))
trigrams

# Read the data

In [None]:
# Settings for loading and splitting the dataset
DATA_PATH = './data/imdb_sentiment.csv'
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Keep the raw texts
docs = df['text']

# Hold out part of the rows for validation (fixed seed, so the split is repeatable)
train_df, validation_df = train_test_split(
    df,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

# Alternative: read a text file into a list, one stripped line per element
# def file_to_list(file_name):
#     with open(file_name, 'r', encoding='utf-8') as f:
#         return [line.strip() for line in f.readlines()]
#
# X_train_pre = file_to_list('data/tweets_train_preprocessed.txt')
# X_dev_pre = file_to_list('data/tweets_dev_preprocessed.txt')
# X_test_pre = file_to_list('data/tweets_test_preprocessed.txt')

# Cleaning the data

In [None]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    """Transformer that cleans raw sentences and returns them as strings.

    Each sentence goes through these steps, in order: regex substitutions,
    optional lowercasing, tokenization, optional punctuation removal,
    optional stop-word removal, optional stemming, and finally the remaining
    tokens are joined with single spaces.

    Args:
        tokenizer: object exposing ``tokenize(sentence)`` that returns tokens.
        stemmer: object exposing ``stem(word)``; a falsy value skips stemming.
        regex_list: iterable of ``(pattern, replacement)`` pairs applied with
            ``re.sub`` in the given order. ``None`` means no substitutions.
        lower: lowercase the sentence before tokenizing.
        remove_punct: drop tokens that consist only of punctuation characters.
        stopwords: optional collection of tokens to drop. Tokens are compared
            as they are after the lowercasing step and before stemming.
    """

    def __init__(self, tokenizer, stemmer, regex_list=None,
                 lower=True, remove_punct=True, stopwords=None):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        # Materialize once so the substitutions can be reused for every sentence.
        self.regex_list = list(regex_list) if regex_list is not None else []
        self.lower = lower
        self.remove_punct = remove_punct
        self.stopwords = frozenset(stopwords) if stopwords else frozenset()

    def fit(self, *_):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, *_):
        """Clean every sentence in ``X`` and return the results as a list of strings."""
        return [self._clean_sentence(sentence) for sentence in X]

    def clean_sentences(self, X):
        """Alias of :meth:`transform` for callers that use the cleaner directly."""
        return self.transform(X)

    @staticmethod
    def _is_punctuation(token):
        """Return True when every character of ``token`` is ASCII punctuation.

        Checking character by character also catches multi-character tokens
        such as "..." or "!!", which a substring test against
        ``string.punctuation`` lets through.
        """
        return all(char in string.punctuation for char in token)

    def _clean_sentence(self, sentence):
        """Apply the configured cleaning steps to one sentence."""
        # Replace given regexes
        for regex in self.regex_list:
            sentence = re.sub(regex[0], regex[1], sentence)

        # Lowercase
        if self.lower:
            sentence = sentence.lower()

        # Split sentence into list of words
        words = self.tokenizer.tokenize(sentence)

        # Remove punctuation-only tokens
        if self.remove_punct:
            words = [word for word in words if not self._is_punctuation(word)]

        # Remove stop words before stemming, while tokens are still unstemmed
        if self.stopwords:
            words = [word for word in words if word not in self.stopwords]

        # Stem words
        if self.stemmer:
            words = [self.stemmer.stem(word) for word in words]

        # Join list elements into string
        return " ".join(words)

In [62]:
# Clean the data using the class above

In [None]:
# Building blocks for the cleaner: a tokenizer, a stemmer and regex substitutions
tokenizer = WordPunctTokenizer()
stemmer = SnowballStemmer("english", ignore_stopwords=True)

# Each entry is a (pattern, replacement) pair; this one deletes <...> tags.
html_tag_pattern = "<[^>]*>"
regex_list = [(html_tag_pattern, "")]

# Example of using the cleaner on its own:
#cleaner = TextCleanerTransformer(tokenizer, stemmer, regex_list)
#docs = cleaner.transform(train_df.text.values)

# Encoding the sentiment column (the target) as 0 and 1

In [None]:
# Encode the labels: fit the encoder on the training labels only, then apply
# the same mapping to both splits.
le = preprocessing.LabelEncoder()
le.fit(train_df['sentiment'].values)

# assign() builds new frames instead of writing a column into the frames
# returned by train_test_split, which can trigger pandas' SettingWithCopyWarning.
train_df = train_df.assign(sentiment=le.transform(train_df['sentiment'].values))
validation_df = validation_df.assign(sentiment=le.transform(validation_df['sentiment'].values))

# Define the pipeline and predict

In [None]:
# Assemble the pipeline: cleaning -> 1/2-gram counts -> tf-idf -> Naive Bayes
pipeline_steps = [
    ('prep', TextCleanerTransformer(tokenizer, stemmer, regex_list)),
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

# Make sure every document is a string before it reaches the cleaner
train_texts = [str(doc) for doc in train_df['text'].values]
validation_texts = [str(doc) for doc in validation_df['text'].values]

# Train the classifier
text_clf.fit(train_texts, train_df['sentiment'].values)

# Predict on the validation split and show the accuracy
predicted = text_clf.predict(validation_texts)
np.mean(predicted == validation_df['sentiment'])

# Check classification result

In [None]:
# Check the results of the pipeline on the validation split.
# Uses the labels and predictions produced by the pipeline cell; y_dev and
# y_dev_pred are only created further down in the notebook.
print(classification_report(validation_df['sentiment'], predicted))

# Bag of Words (BOW) model using sklearn's CountVectorizer

In [None]:
# Clean the raw documents with the transformer defined earlier in the notebook.
# NOTE(review): X_train, X_dev, y_train and y_dev are not created in the
# visible cells - presumably the raw texts and labels of each split; confirm.
text_cleaner = TextCleanerTransformer(
    tokenizer=tokenizer,
    stemmer=stemmer,
    regex_list=regex_list,
    lower=True,
    remove_punct=True,
)

X_train_pre = text_cleaner.transform(X_train)
X_dev_pre = text_cleaner.transform(X_dev)

# English stop words from NLTK (the corpus is downloaded in the stemmer cell).
# The stemmer is created with ignore_stopwords=True, so stop words are left
# unstemmed and still match this list when the vectorizer filters them.
stopword_list = stopwords.words('english')

# Fit the CountVectorizer on the preprocessed train data, then apply it to dev
vec = CountVectorizer(stop_words=stopword_list)
X_train_vec = vec.fit_transform(X_train_pre)
X_dev_vec = vec.transform(X_dev_pre)

# Encode the labels y_train and y_dev; the encoder is fitted on train only
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_dev = le.transform(y_dev)

clf = MultinomialNB()
clf.fit(X_train_vec, y_train)
# Predictions use the encoded labels (0 and 1)
y_dev_pred = clf.predict(X_dev_vec)
print(classification_report(y_dev, y_dev_pred))

    
    

# Bag of Words (BOW) model using sklearn's CountVectorizer, wrapped in a train-and-validate function (the inputs are already preprocessed data!)

In [1]:
def train_and_validate(X_train, X_dev, y_train, y_dev, ngram_range=(1,1), max_features=None):
    """
    Train a CountVectorizer + MultinomialNB pipeline and evaluate it on the dev set.
    The documents are assumed to be preprocessed already.

    Args:
    X_train - preprocessed tweets in training data
    X_dev - preprocessed tweets in dev data
    y_train - labels of training data
    y_dev - labels of dev data
    ngram_range - ngram range to use in CountVectorizer (tuple)
    max_features - max number of features to use in CountVectorizer (int)

    Returns:
    A tuple (text_clf, y_dev_pred, acc):
    text_clf - the fitted pipeline
    y_dev_pred - list of 'positive'/'negative' strings, one per dev document
                 (a prediction equal to 1 becomes 'positive', anything else 'negative')
    acc - mean of (raw prediction == y_dev), computed before the predictions
          are converted to strings
    """
    # Bag-of-words features followed by a multinomial Naive Bayes classifier
    vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=max_features)
    text_clf = Pipeline([('vect', vectorizer),
                         ('clf', MultinomialNB())])
    text_clf.fit(X_train, y_train)

    # Accuracy is measured on the raw predictions
    raw_pred = text_clf.predict(X_dev)
    acc = np.mean(raw_pred == y_dev)

    # Translate the predictions into readable sentiment names
    y_dev_pred = ['positive' if label == 1 else 'negative' for label in raw_pred]

    return text_clf, y_dev_pred, acc