In [None]:
# Data and Data Structures
import json
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
from subprocess import check_output

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor

# Visualization
%matplotlib inline
import matplotlib as mpl
from matplotlib.colors import ListedColormap
from matplotlib.colors import LogNorm
import matplotlib.pyplot as plt
plt.style.use('bmh')
from wordcloud import WordCloud, STOPWORDS

# # Load dataset

In [None]:
# Load the train and test JSON dumps and flatten them into DataFrames.
def _load_json_frame(path):
    """Read the JSON file at `path`; return (parsed records, flattened DataFrame)."""
    with open(path) as fin:
        records = json.load(fin)
    # pd.json_normalize is the public API; the pd.io.json.json_normalize alias
    # has been deprecated since pandas 1.0.
    return records, pd.json_normalize(records)


# Load Train Data
trainjson, train = _load_json_frame('../input/random-acts-of-pizza/train.json')

# Load Test Data
testjson, test = _load_json_frame('../input/random-acts-of-pizza/test.json')

print("Train Shape:", train.shape)
print("Test Shape:", test.shape)

# Check the data: there are two types of features, numeric and textual.
# The first step is business understanding. This is a classification problem. The text features "request_text" and "request_text_edit_aware" appear to play the critical role in this task, which makes it an NLP feature-representation problem: how to convert the text into a meaningful vector that ML models such as XGBoost can handle. We could use popular NLP techniques such as word embeddings, bag-of-words, or BERT to model the text features.

In [None]:
# Show the first rows of the training frame.
train.head()

In [None]:
# Show the first rows of the test frame.
test.head()

# # **From above table, we can see null values, so check missing values**

In [None]:
# Count the missing values in each training column.
train.isna().sum()

# # Only one column, requester_user_flair, has null values (3046 of them). It does not appear in test.columns (see the comparison below), so we can drop requester_user_flair.

In [None]:
# Boolean mask over train's columns: True where the column also exists in test.
in_test = train.columns.isin(test.columns)

print("Common columns in train and test:")
print(train.columns[in_test])
print("----")
print("Columns in train but NOT test:")
print(train.columns[~in_test])

# Data limitations: this dataset contains some attributes that are available only in the train set and not in the test set, so they cannot be used for prediction. If such attributes were available in both train and test, it would be possible to check their impact on the success of a pizza request.

# # Label and features

In [None]:
# Labels: a one-column frame holding the outcome flag.
train_labels_master = train.loc[:, ['requester_received_pizza']]
# Features: only the train columns that are also present in test.
train_data_master = train.loc[:, test.columns]

In [None]:
# Print the shape of the full training frame next to that of its feature subset.
print(train.shape, train_data_master.shape)

In [None]:
# info() prints its summary itself (and returns None), so it can run first.
train_data_master.info()
# describe() only renders when it is the cell's last expression; placed before
# info() its table was computed and then silently discarded.
train_data_master.describe()

# Check the label balance: around 3000 False vs. around 1000 True, so this is an imbalanced classification problem. To avoid model bias, under-sampling or over-sampling could be considered.

In [None]:
# Bar chart of how many requests did / did not receive a pizza.
label_ax = sns.countplot(x=train_labels_master.requester_received_pizza)
# Title typo fixed ("nor" -> "not"); the trailing ';' suppresses the Text repr.
label_ax.set_title("# of received vs not received a pizza");

#  Due to time limits, we only deal with the numeric features here; however, the text features also deserve further analysis.

In [None]:
# Outcome flag for every training row, used to split the feature frame.
received_pizza = train_labels_master['requester_received_pizza']

# Feature rows of requests that did / did not receive a pizza.
granted = train_data_master.loc[received_pizza.eq(True)]
ungranted = train_data_master.loc[received_pizza.eq(False)]

# # The numeric feature distribution with requester_received_pizza == True

In [None]:
# Keep only the float64/int64 columns of the granted requests.
df_num = granted.select_dtypes(include=['float64', 'int64'])
# A bare `df_num.shape` in the middle of a cell is never displayed; print it.
print("Granted numeric features shape:", df_num.shape)

# DataFrame.hist draws one histogram per column and returns the array of Axes.
hist_axes = df_num.hist(figsize=(16, 10), bins=50, xlabelsize=8, ylabelsize=8, ec="k")
# Shrink each subplot title; a plain loop instead of a throwaway list of None.
for hist_ax in hist_axes.ravel():
    hist_ax.title.set_size(10)

# # The numeric feature distribution with requester_received_pizza == False

# Comparing the figures for label = True and label = False, we can see, for example, that the distributions of the last 2 features differ between the two groups.

In [None]:
# Keep only the float64/int64 columns of the ungranted requests.
df_num = ungranted.select_dtypes(include=['float64', 'int64'])
# A bare `df_num.shape` in the middle of a cell is never displayed; print it.
print("Ungranted numeric features shape:", df_num.shape)

# DataFrame.hist draws one histogram per column and returns the array of Axes.
hist_axes = df_num.hist(figsize=(16, 10), bins=50, xlabelsize=8, ylabelsize=8, ec="k")
# Shrink each subplot title; a plain loop instead of a throwaway list of None.
for hist_ax in hist_axes.ravel():
    hist_ax.title.set_size(10)

# # Compute statistics such as the correlation matrix to explore the relations among features, which can then be used for feature selection. The following is for label = True.

In [None]:
# Correlation heatmap of the numeric features of granted requests.
# `granted` also holds text columns; since pandas 2.0 DataFrame.corr() no longer
# drops them silently and raises instead, so select numeric/bool columns first.
granted_numeric = granted.select_dtypes(include=['number', 'bool'])

fig, ax = plt.subplots(figsize=(25, 25))
# Draw on the Axes created above rather than relying on the implicit current Axes.
sns.heatmap(granted_numeric.corr(), annot=True,
            cbar_kws={'orientation': 'horizontal'}, ax=ax)
ax.set_title("Feature correlations - granted requests");

# # Compute statistics such as the correlation matrix to explore the relations among features, for label = False.

In [None]:
# Correlation heatmap of the numeric features of ungranted requests.
# `ungranted` also holds text columns; since pandas 2.0 DataFrame.corr() no longer
# drops them silently and raises instead, so select numeric/bool columns first.
ungranted_numeric = ungranted.select_dtypes(include=['number', 'bool'])

fig, ax = plt.subplots(figsize=(25, 25))
# Draw on the Axes created above rather than relying on the implicit current Axes.
sns.heatmap(ungranted_numeric.corr(), annot=True,
            cbar_kws={'orientation': 'horizontal'}, ax=ax)
ax.set_title("Feature correlations - ungranted requests");

# #   Generate word clouds of granted and ungranted requests. We could also use word-vector NLP techniques to encode the text features as numerical vectors, but due to time limits we omit this step here and leave it for further exploration. Text features like these occur frequently in customer-survey and customer-retention work. With more time, I would take the following steps to improve performance:
# 1.  Generate additional features from textual analysis of the posts and add them to the originally selected features. We can encode linguistic features such as sentiment, text length, politeness, and so on.
# 2.  Consider high-level topic feature engineering, such as using Latent Dirichlet Allocation to explore whether a request's success depends on the topic to which it belongs.
# 3. If possible, incorporate as many features as we can, guided by an understanding of the specific task and data.

In [None]:
# Global matplotlib settings used by the word-cloud figures.
mpl.rcParams['font.size'] = 12
mpl.rcParams['savefig.dpi'] = 100
mpl.rcParams['figure.subplot.bottom'] = .1

stopwords = set(STOPWORDS)


def _plot_request_wordcloud(texts, title):
    """Build and show a word cloud from a Series of request texts.

    Parameters
    ----------
    texts : pandas.Series
        One request text per row; missing entries are skipped.
    title : str
        Title drawn above the word cloud.

    Returns
    -------
    WordCloud
        The generated word cloud object.
    """
    # str(series) is only pandas' display repr (truncated rows, index numbers,
    # "Name: ..., dtype: object" under default display options), not the data.
    # Join the full texts so the cloud reflects every request.
    corpus = " ".join(texts.dropna().astype(str))
    cloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=50,
        max_font_size=50,
        random_state=42,
    ).generate(corpus)

    cloud_fig, cloud_ax = plt.subplots()
    cloud_ax.imshow(cloud, interpolation="bilinear")
    cloud_ax.axis('off')
    # Set the title before tight_layout so the layout pass accounts for it.
    cloud_ax.set_title(title, fontsize=22, fontweight='bold')
    cloud_fig.tight_layout(pad=0)
    plt.show()
    return cloud


wordcloud = _plot_request_wordcloud(granted['request_text_edit_aware'],
                                    "Word Cloud for Granted Requests")

wordcloud = _plot_request_wordcloud(ungranted['request_text_edit_aware'],
                                    "Word Cloud for Ungranted Requests")

# Question: What would be your high level recommendations to someone trying to get a free pizza from kind strangers on Reddit circa 2016?
# Answer: According to the word cloud results, the successful requests are polite and emphasise the problems the requester is facing, such as the weather or unemployment. Unsuccessful requests are more self-effacing, using words such as "hungry" and "need". Recommendations to improve the chance of success: write politely, describe the difficult situation you are in, and show gratitude to the community if your situation improves.

# # Finally, we could combine the above linguistic features and numeric features as input to ML models such as XGBoost, logistic regression, SVM, and so on.