In [29]:
#!/usr/bin/env python
# coding: utf-8

# Importing libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; a silent stand-in for ``warnings.warn``."""
    return None
# Replace the standard warning hook with the no-op above
# so that sklearn and seaborn warnings are not shown.
warnings.warn = ignore_warn

import re, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
import nltk
# Fetch every NLTK resource used by the helpers in this notebook so that a
# fresh environment can run the cleaning path (clean=True) without LookupError:
#   - averaged_perceptron_tagger -> pos_tag (lemmatize_sentence)
#   - wordnet                    -> WordNetLemmatizer (lemmatize_sentence)
#   - stopwords                  -> stopwords.words('english') (clean_text)
#   - punkt                      -> word_tokenize (clean_text)
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') - confirm against the installed NLTK version.
for _nltk_resource in ('averaged_perceptron_tagger', 'wordnet', 'stopwords', 'punkt'):
    nltk.download(_nltk_resource)

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Load the labelled training set and the unlabelled test set
# (paths are relative to the notebook's working directory).
TRAIN_CSV = 'data/hm_train.csv'
TEST_CSV = 'data/hm_test.csv'
data = pd.read_csv(TRAIN_CSV)
submission = pd.read_csv(TEST_CSV)

# Defining a function to lemmatize tokens
def lemmatize_sentence(tokens: list) -> list:
    """
    Lemmatize the tokens of one sentence using their part-of-speech tags.

    Every token is tagged with ``pos_tag`` and the tag is collapsed to the
    part-of-speech code handed to ``WordNetLemmatizer.lemmatize``:
    tags starting with 'NN' map to 'n', tags starting with 'VB' map to 'v',
    and every other tag maps to 'a'.

    Parameters
    ----------
    tokens : list
        Word tokens of a single sentence, in order.

    Returns
    -------
    list
        The lemmas, one per (word, tag) pair produced by ``pos_tag``, in the
        same order. Note that this is a list, not a joined string: callers
        that need text must join the items with a separator themselves.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in pos_tag(tokens):
        # Collapse the tagger's tag to the coarse code used for lemmatizing.
        pos = 'n' if tag.startswith('NN') else 'v' if tag.startswith('VB') else 'a'
        lemmas.append(lemmatizer.lemmatize(word, pos))
    return lemmas

def stem_sentence(tokens: list) -> str:
    """
    Stem every token with a Porter stemmer and return them as one string.

    Each stem is followed by a single space, so a non-empty result always
    ends with a trailing space; an empty token list yields an empty string.
    """
    stemmer = PorterStemmer()
    # Emit "<stem> " for each token and concatenate the pieces.
    return ''.join(stemmer.stem(tok) + " " for tok in tokens)

# Let's define a function to encapsulate all cleaning activities
def clean_text(message: str, transform='stem') -> str:
    """
    Clean a raw text message and return it stemmed or lemmatized.

    Steps, in order:
      1. lowercase the message;
      2. delete every character found in ``string.punctuation``;
      3. drop whitespace-separated words that are English NLTK stop words;
      4. tokenize with ``word_tokenize`` and stem or lemmatize the tokens.

    Parameters
    ----------
    message : str
        The text to clean.
    transform : str
        'stem' to apply ``stem_sentence``; any other value (conventionally
        'lemma') applies ``lemmatize_sentence``.

    Returns
    -------
    str
        The cleaned words as a single space-separated string. For 'stem' the
        string is whatever ``stem_sentence`` produces; for the lemma path the
        lemmas are joined with single spaces.
    """
    # Lowercase, then strip punctuation in a single pass.
    lowered = message.lower()
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))

    # A set gives constant-time membership checks while filtering stop words.
    stop_words = set(stopwords.words('english'))
    filtered = ' '.join(word for word in without_punct.split() if word not in stop_words)

    tokens = word_tokenize(filtered)
    if transform == 'stem':
        # stem_sentence already returns one string; ''.join leaves a str unchanged.
        return ''.join(stem_sentence(tokens))
    # lemmatize_sentence returns a list of lemmas, so they must be joined with
    # a space - joining with '' would glue all the words into a single token.
    return ' '.join(lemmatize_sentence(tokens))


def build_model(clf, data, textcol: str, targetcol: str, clean=False, text_transform='stem',
                test_size=0.2, random_state=1):
    """
    Fit a bag-of-words -> TF-IDF -> classifier pipeline and report hold-out performance.

    The classification report for the held-out split is printed (not returned);
    the fitted pipeline is the return value.

    Parameters
    ----------
    clf
        Estimator placed as the final 'clf' step of the pipeline.
    data
        Table indexable by column name (e.g. a DataFrame) holding text and target.
    textcol : str
        Name of the column containing the text.
    targetcol : str
        Name of the column containing the labels.
    clean : bool
        When True, every text is passed through ``clean_text`` before the split.
        The cleaning is NOT part of the returned pipeline, so text given to
        ``predict`` later has to be cleaned the same way by the caller.
    text_transform : str
        'stem' or 'lemma'; forwarded to ``clean_text`` as ``transform``.
    test_size
        Forwarded to ``train_test_split`` (default 0.2, the previous fixed value).
    random_state
        Forwarded to ``train_test_split`` (default 1, the previous fixed value).

    Returns
    -------
    The fitted ``Pipeline``.
    """
    if clean:
        X = data[textcol].apply(clean_text, transform=text_transform)
    else:
        X = data[textcol]
    y = data[targetcol]

    # Hold out part of the data for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Token counts -> TF-IDF weighting -> classifier.
    pipe = Pipeline([('bow', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
    pipe.fit(X_train, y_train)

    # Report performance on the held-out split.
    print("Classification Report Test Data")
    print(metrics.classification_report(y_test, pipe.predict(X_test)))

    return pipe

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shobhit
[nltk_data]     Kulshreshtha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Shobhit
[nltk_data]     Kulshreshtha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
# Instantiate the two classifiers to compare
mnb, logreg = MultinomialNB(), LogisticRegression()

# Fit one pipeline per classifier on the raw (uncleaned) text column
text_column, target_column = 'cleaned_hm', 'predicted_category'
NB = build_model(mnb, data, text_column, target_column, clean=False)
LR = build_model(logreg, data, text_column, target_column, clean=False)

Classification Report Test Data
                  precision    recall  f1-score   support

     achievement       0.79      0.89      0.83      4028
       affection       0.62      0.96      0.75      4227
         bonding       0.98      0.30      0.46      1315
enjoy_the_moment       0.85      0.24      0.37      1287
        exercise       0.00      0.00      0.00       146
         leisure       0.95      0.25      0.40       823
          nature       0.00      0.00      0.00       239

        accuracy                           0.71     12065
       macro avg       0.60      0.38      0.40     12065
    weighted avg       0.74      0.71      0.66     12065

Classification Report Test Data
                  precision    recall  f1-score   support

     achievement       0.89      0.96      0.92      4028
       affection       0.95      0.97      0.96      4227
         bonding       0.97      0.92      0.94      1315
enjoy_the_moment       0.86      0.78      0.81      1287
    

In [33]:
# Predict categories for the test set with both fitted pipelines.
# The text is passed as-is because both models were trained with clean=False.
submission_text = submission['cleaned_hm']
output_nb = NB.predict(submission_text)
output_lr = LR.predict(submission_text)

In [34]:
# Creating output for submission
# Each submission file gets its own frame built from the untouched test frame.
# `submission` is deliberately NOT overwritten: it still holds 'cleaned_hm', so
# the prediction cell and this cell can be re-run in any order, and the
# Logistic Regression output no longer depends on the Naive Bayes frame.

# Naive Bayes
submission_nb = pd.DataFrame({'hmid': submission['hmid'],
                              'predicted_category': output_nb})
submission_nb.to_csv('submission_1.csv', index=False)

# Logistic Regression
submission_lr = pd.DataFrame({'hmid': submission['hmid'],
                              'predicted_category': output_lr})
submission_lr.to_csv('submission_2.csv', index=False)