In [None]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk

import json 
import pandas as pd 
from pandas import json_normalize
import matplotlib.pyplot as plt

### Load Data

In [None]:
def load_json(path):
    """Parse and return the contents of the JSON file at `path`.

    The file is opened explicitly as UTF-8 so decoding does not depend on
    the platform's default text encoding.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# Load training data
train_json = load_json('../data/random-acts-of-pizza/train.json')

# Load test data
test_json = load_json('../data/random-acts-of-pizza/test.json')


### Clean Input

In [None]:
def clean_title(titles):
    """Return lower-cased request titles with the 'request' marker removed.

    Both substitutions are literal (regex=False), so the brackets of the
    "[request]" tag need no escaping and the result does not depend on the
    pandas version's default for `regex`.
    """
    return (
        titles.str.lower()
        # Drop the bracketed tag first so no empty "[]" is left behind ...
        .str.replace('[request]', '', regex=False)
        # ... then any remaining bare occurrence of the word.
        .str.replace('request', '', regex=False)
    )


# Flatten the parsed JSON records into DataFrames.
train = json_normalize(train_json)
test = json_normalize(test_json)

train["request_title"] = clean_title(train["request_title"])
test["request_title"] = clean_title(test["request_title"])

# Only the training frame's request_text is normalised here.
train['request_text'] = train['request_text'].str.lower()

### Divide data into training and dev sets

In [None]:
# Row position at which the labelled data is cut: rows before it are used
# for training, the remaining rows form the dev set.
split_at = 2800

train_data, dev_data = train[:split_at], train[split_at:]
train_labels = train_data['requester_received_pizza']
dev_labels = dev_data['requester_received_pizza']
test_data = test[:]

# Title column of each split, used as the text input of the classifiers.
train_data_title, dev_data_title, test_data_title = (
    frame['request_title'] for frame in (train_data, dev_data, test_data)
)

### Inspect Data

In [None]:
# Report the shape of every split and of the title series derived from them.
shape_report = [
    ('training data shape:', train_data),
    ('training label shape:', train_labels),
    ('dev data shape:', dev_data),
    ('dev label shape:', dev_labels),
    ('training data shape title:', train_data_title),
    ('dev data shape title:', dev_data_title),
]
for caption, split in shape_report:
    print(caption, split.shape)

# Last expression of the cell: preview the first rows of the full training frame.
train.head()

Our task is to detect which posts result in pizza and which do not. 

In [None]:
# Summary statistics for the full training frame (before the train/dev split).
train.describe()

### Examine Data

 1. For the first 5 training examples, print the request title, the request text, and whether the requester received a pizza

In [None]:
def display_request(num_examples=5):
    """Print the title, text and outcome of the first training requests.

    Reads the module-level `train_data` frame.

    Parameters
    ----------
    num_examples : int, default 5
        Number of leading rows to print. Values larger than the number of
        rows print every row; zero or negative values print nothing.
    """
    # Slice once instead of calling iloc[i] per field: this tolerates
    # num_examples > len(train_data) and avoids rebuilding each row three times.
    sample = train_data.iloc[:max(num_examples, 0)]
    rows = zip(
        sample['request_title'],
        sample['request_text'],
        sample['requester_received_pizza'],
    )
    for title, text, received in rows:
        print(title)
        print(text)
        print('Received Pizza: ', received)
        print('\n')


display_request(5)

In [None]:
# Class balance of the training split: number of requests per outcome.
# The share of the larger class is the accuracy a constant classifier would get.
class_counts = train_data.groupby('requester_received_pizza').size()
class_counts.plot(kind="bar")


In [None]:
from nltk.tokenize import RegexpTokenizer

# Split each cleaned title into runs of word characters and store the
# resulting token list next to the title.
tokenizer = RegexpTokenizer(r'\w+')
title_tokens = train["request_title"].map(tokenizer.tokenize)
train["tokens"] = title_tokens

train.head()

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

token_lists = train["tokens"]

# Flatten the per-title token lists into one corpus-wide list of words.
all_words = []
for tokens in token_lists:
    all_words.extend(tokens)

# Number of tokens in each title, in row order.
sentence_lengths = list(map(len, token_lists))

# Distinct words in alphabetical order.
VOCAB = sorted(set(all_words))

print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

In [None]:


# Distribution of title lengths, measured in tokens.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_xlabel('Title length')
ax.set_ylabel('Number of sentences')
ax.hist(sentence_lengths)
plt.show()

### Classification

In [None]:
# Unigram bag-of-words fitted on the training titles.
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(train_data_title)
vocab_train = vectorizer.vocabulary_
feature_names = vectorizer.get_feature_names_out()

print('Size of vocabulary: ', vector.shape[1])
print("0th feature: ", feature_names[0])
print("Last feature: ", feature_names[-1])
# Total stored entries divided by the number of rows; equivalent to averaging
# the per-row counts without iterating over the sparse matrix row by row.
print("Average number of non-zero features per example: ", round(vector.nnz / vector.shape[0], 3))
sparsity = round((vector.nnz / (vector.shape[0] * vector.shape[1])),3)
print(f'Fraction of the non-zero entries in the matrix - Sparsity: {sparsity}')

# Fit a second vectorizer on the dev *titles* (not the whole dev DataFrame,
# whose iteration would yield column names) to obtain the dev vocabulary.
vectorizer_dev = CountVectorizer()
devvector = vectorizer_dev.fit_transform(dev_data_title)
vocab_dev = vectorizer_dev.vocabulary_

# Words that occur in the dev titles but were never seen in training.
dev_missing_words = set(vocab_dev) - set(vocab_train)
print('Dev vocab missing from the training vocab size: ', len(dev_missing_words))
print('Fraction of words in dev vocab missing from the training vocab: ', round(len(dev_missing_words)/len(vocab_dev),3))
        


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Returns the tuple (accuracy, precision, recall, f1). Precision, recall
    and f1 are computed with average='weighted'.
    """
    # Keyword arguments shared by the three averaged scores.
    weighted = {'pos_label': None, 'average': 'weighted'}

    # (true positives + true negatives) / total
    accuracy = accuracy_score(y_test, y_predicted)
    # true positives / (true positives + false positives)
    precision = precision_score(y_test, y_predicted, **weighted)
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, **weighted)
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, **weighted)

    return accuracy, precision, recall, f1

def text_preprocessor(text):
    """Lower-case `text` and blank out everything that is not a letter or digit.

    Every non-word character (punctuation, whitespace, newlines) and every
    underscore is replaced by a single space, one-for-one, so the length of
    the string is unchanged.
    """
    # Raw string: "\W" in a normal literal is an invalid escape sequence.
    # Newlines are matched by \W, so no separate newline pass is needed.
    return re.sub(r'[\W_]', ' ', text.lower())

# Bigram bag-of-words over the request titles. The vocabulary is learned on
# the training titles and then applied unchanged to the dev titles.
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    stop_words='english',
    preprocessor=text_preprocessor,
)
train_vector = vectorizer.fit_transform(train_data_title)
dev_vector = vectorizer.transform(dev_data_title)

print('\n***  Naive Bayes model ***')

# Multinomial Naive Bayes with smoothing alpha = 0.5, evaluated on the dev set.
mnb = MultinomialNB(alpha=0.5).fit(train_vector, train_labels)
pred_mnb = mnb.predict(dev_vector)

score = metrics.f1_score(dev_labels, pred_mnb, average="weighted")
print(f"a = 0.5, f1 score = {score}")

accuracy, precision, recall, f1 = get_metrics(dev_labels, pred_mnb)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

In [None]:


def get_most_important_features(vectorizer, model, n=5):
    """Return the n highest and n lowest weighted features per coefficient row.

    Parameters
    ----------
    vectorizer : object with a `vocabulary_` mapping of term -> column index.
    model : object with a 2-D `coef_` array (one row per class).
    n : int, default 5
        Number of features to keep at each end. Zero or negative values
        yield empty lists.

    Returns
    -------
    dict
        Maps each row index of `model.coef_` to
        ``{'tops': [...], 'bottom': [...]}`` where every entry is a
        ``(coefficient, term)`` pair. 'tops' holds the n largest
        coefficients in ascending order; 'bottom' holds the n smallest in
        descending order.
    """
    index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}
    n = max(n, 0)

    classes = {}
    for class_index, coefficients in enumerate(model.coef_):
        ranked = sorted(
            ((weight, index_to_word[i]) for i, weight in enumerate(coefficients)),
            key=lambda pair: pair[0],
            reverse=True,
        )
        classes[class_index] = {
            'tops': sorted(ranked[:n], key=lambda pair: pair[0]),
            # Explicit start index: ranked[-n:] would return the whole list for n == 0.
            'bottom': ranked[max(len(ranked) - n, 0):],
        }
    return classes

# Bigram bag-of-words over the request titles, fitted on the training split.
vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english', preprocessor=text_preprocessor)
train_vector = vectorizer.fit_transform(train_data_title)
# Transform the dev titles with the vectorizer fitted in this cell, so the
# cell does not depend on a `dev_vector` left over from an earlier cell.
dev_vector = vectorizer.transform(dev_data_title)

# `multi_class` is left at its default: the explicit argument is deprecated
# in recent scikit-learn releases and "auto" was the default behaviour anyway.
lr = LogisticRegression(C=.5, solver="liblinear")
lr.fit(train_vector, train_labels)
pred_lr = lr.predict(dev_vector)
score = metrics.f1_score(dev_labels, pred_lr, average="weighted")
print('\n***  Logistic Regression model ***')
# The tuned parameter here is the inverse regularisation strength C, not alpha.
print(f"C = 0.5, f1 score = {score}")

accuracy, precision, recall, f1 = get_metrics(dev_labels, pred_lr)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

# Ten highest- and lowest-weighted bigrams of the fitted model.
importance = get_most_important_features(vectorizer, lr, 10)

In [None]:
# Highest-weighted bigrams for coefficient row 0 of the logistic regression,
# as (coefficient, bigram) pairs in ascending order of coefficient.
importance[0]['tops']

In [None]:

def plot_important_words(top_scores, top_words, bottom_scores, bottom_words, name):
    """Draw two side-by-side horizontal bar charts of feature weights.

    The left panel ('No Pizza') shows `bottom_words` against `bottom_scores`,
    the right panel ('Pizza') shows `top_words` against `top_scores`.

    Parameters
    ----------
    top_scores, top_words : sequences of equal length
        Scores and labels for the right-hand panel.
    bottom_scores, bottom_words : sequences of equal length
        Scores and labels for the left-hand panel. May differ in length
        from the top sequences.
    name : str
        Figure-level title.
    """
    # Right panel is ordered by ascending score, left panel by descending score.
    top_pairs = sorted(zip(top_words, top_scores), key=lambda pair: pair[1])
    bottom_pairs = sorted(zip(bottom_words, bottom_scores), key=lambda pair: pair[1], reverse=True)

    fig, (ax_no_pizza, ax_pizza) = plt.subplots(1, 2, figsize=(10, 10))
    # A figure has a single suptitle, so it is set once with the caller's text.
    fig.suptitle(name, fontsize=16)

    panels = (
        (ax_no_pizza, 'No Pizza', bottom_pairs),
        (ax_pizza, 'Pizza', top_pairs),
    )
    for ax, title, pairs in panels:
        words = [word for word, _ in pairs]
        scores = [score for _, score in pairs]
        # Bar positions are computed per panel so the two lists may differ in length.
        y_pos = np.arange(len(words))
        ax.barh(y_pos, scores, align='center', alpha=0.5)
        ax.set_title(title, fontsize=20)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(words, fontsize=14)
        ax.set_xlabel('Importance', fontsize=20)

    fig.subplots_adjust(wspace=0.8)
    plt.show()

# Entries of `importance` are (coefficient, word) pairs; split them into the
# parallel lists expected by plot_important_words.
coef_row = importance[0]
top_scores = [weight for weight, _ in coef_row['tops']]
top_words = [word for _, word in coef_row['tops']]
bottom_scores = [weight for weight, _ in coef_row['bottom']]
bottom_words = [word for _, word in coef_row['bottom']]

plot_important_words(
    top_scores,
    top_words,
    bottom_scores,
    bottom_words,
    "Most important words for relevance",
)

In [None]:
 #Tfidf Vectorizer
tfidfvectorizer = TfidfVectorizer()
train_vector_tfIdf = tfidfvectorizer.fit_transform(train_data_title)

# Apply the vectorizer fitted on the training titles to the dev titles.
dev_vector_tfIdf = tfidfvectorizer.transform(dev_data_title)

# penalty defaults to "l2". `multi_class` is left at its default because the
# explicit argument is deprecated in recent scikit-learn releases.
lr_tfIdf = LogisticRegression(C=100, solver="liblinear")
lr_tfIdf.fit(train_vector_tfIdf, train_labels)

pred_tfIdf = lr_tfIdf.predict(dev_vector_tfIdf)

score = metrics.f1_score(dev_labels, pred_tfIdf, average="weighted")
print("\n******** Tfidf Vectorizer ********")
print(f"\nTfidf C = 100, f1 score = {score}, vocab size =", len(tfidfvectorizer.vocabulary_))


In [None]:
# Understand nature of the data .info() .describe()
# Histograms and boxplots 
# Value counts 
# Missing data 
# Correlation between the metrics 
# Explore interesting themes 
    # Wealthy survive? 
    # By location 
    # Age scatterplot with ticket price 
    # Young and wealthy Variable? 
    # Total spent? 
# Feature engineering 
# preprocess data together or use a transformer? 
    # use label for train and test   
# Scaling?

# Model Baseline 
# Model comparison with CV 

In [None]:
# NOTE(review): `pizza` and `no_pizza` are not defined in any earlier cell, so
# this cell failed on a fresh kernel. They are derived here from the full
# `train` frame by outcome — confirm this is the intended source frame.
received_pizza = train["requester_received_pizza"].astype(bool)
pizza = train[received_pizza]
no_pizza = train[~received_pizza]

# Numeric columns whose mean is compared between the two outcome groups.
comparison_columns = [
    "requester_account_age_in_days_at_request",
    "request_number_of_comments_at_retrieval",
    "number_of_upvotes_of_request_at_retrieval",
    "requester_number_of_comments_in_raop_at_request",
    "requester_number_of_posts_at_retrieval",
    "requester_number_of_subreddits_at_request",
]

# For every column: header, mean for requests without pizza, mean for
# requests with pizza (the first column now gets a header like the others).
for column in comparison_columns:
    print('\n' + column)
    print(no_pizza[column].mean())
    print(pizza[column].mean())

no_pizza["requester_user_flair"]

In [None]:
# Number of requests in the `pizza` subset (closing parenthesis restored).
print(len(pizza["request_title"]))