In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Column -> dtype map handed to pd.read_csv for the train and test CSV files.
_string_columns = [
    'id', 'teacher_id', 'teacher_prefix', 'school_state',
    'project_submitted_datetime', 'project_grade_category',
    'project_subject_categories', 'project_subject_subcategories',
    'project_title', 'project_essay_1', 'project_essay_2',
    'project_essay_3', 'project_essay_4', 'project_resource_summary',
]
_integer_columns = ['teacher_number_of_previously_posted_projects', 'project_is_approved']

train_test_dtypes = {column: str for column in _string_columns}
train_test_dtypes.update({column: int for column in _integer_columns})

In [None]:
# Load the three competition CSVs. Train and test share one explicit dtype map;
# the comma separator and low_memory=True are the pd.read_csv defaults.
train_data_raw = pd.read_csv('../input/train.csv', dtype=train_test_dtypes)
test_data_raw = pd.read_csv('../input/test.csv', dtype=train_test_dtypes)
resource_data_raw = pd.read_csv('../input/resources.csv')

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train_data_raw.info()

In [None]:
# Print the column dtypes and non-null counts of the test frame.
test_data_raw.info()

In [None]:
# Print the column dtypes and non-null counts of the resources frame.
resource_data_raw.info()

In [None]:
# Column groups used by the exploration cells below.
cat_features = [
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'teacher_prefix',
    'school_state',
]
text_features = [
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_resource_summary',
]
# The six text columns split into two halves of three columns each.
temp_text_features1 = text_features[:3]
temp_text_features2 = text_features[3:]

Preview of the first 2 rows of categorical data in the training dataset

In [None]:
# Display the first two rows of the categorical columns listed in cat_features.
train_data_raw[cat_features].head(2)

Preview of the first 2 rows of numeric data from the resources dataset

In [None]:
# Display the first two rows of the resources frame.
resource_data_raw.head(2)

Preview of the first 2 rows of text data in the training dataset

In [None]:
# Display the first two rows of the text columns listed in text_features.
train_data_raw[text_features].head(2)

Change the project_submitted_datetime column to type datetime64, and extract year and month as new features

In [None]:
# Parse the submission timestamp first and derive 'year' and 'month' from the
# parsed values. The previous version split the raw string and then replaced
# the column with datetimes, so running the cell a second time failed
# (a Timestamp has no .split). Parsing first makes the cell safe to re-run.
for frame in (train_data_raw, test_data_raw):
    submitted = pd.to_datetime(frame['project_submitted_datetime'], format="%Y-%m-%d %H:%M:%S")
    frame['project_submitted_datetime'] = submitted
    # strftime keeps both features as zero-padded strings (e.g. '2016', '04'),
    # i.e. the same values the string split on '-' produced.
    frame['year'] = submitted.dt.strftime('%Y')
    frame['month'] = submitted.dt.strftime('%m')

There seems to be a spike in proposal submissions in August and September, the beginning of the school year

In [None]:
# Histogram of the month number of each submission, 12 bins, no KDE overlay.
submission_months = train_data_raw.project_submitted_datetime.dt.month
sns.distplot(submission_months, bins=12, kde=False)
plt.title("Dist plot of Months in which projects are submitted");

There is an imbalance in the dataset between approved and rejected project proposals. In the training dataset, 154,346 proposals (85%) were approved and 27,734 (15%) were rejected.

In [None]:
# Class balance of the target: counts per label, the approved share, and a bar chart.
approval_counts = train_data_raw['project_is_approved'].value_counts()
print(approval_counts)
# The ratio is a fraction in [0, 1]; scale it by 100 so the printed number
# really is the percentage that the '%' sign in the message claims.
approved_pct = 100.0 * approval_counts[1] / len(train_data_raw['project_is_approved'])
print("\nPercentage of proposals approved = {:.2f}%".format(approved_pct))
plt.figure(figsize=(10,3));
plt.title('Project approval imbalance in training data')
sns.countplot(x=train_data_raw['project_is_approved']);

**EDA of numeric features**

In [None]:
# Line-item cost: quantity multiplied by unit price for every resource row.
resource_data_raw['total'] = resource_data_raw['quantity'].mul(resource_data_raw['price'])
resource_data_raw.head()

In [None]:
# Total requested amount per proposal id, followed by summary statistics
# taken over all proposals.
totals_by_prop_id = resource_data_raw.groupby('id')['total'].sum()
amount_summaries = [
    ("Max proposal amount request: {}", totals_by_prop_id.max()),
    ("Min proposal amount request: {}", totals_by_prop_id.min()),
    ("Avg proposal amount request: {}", totals_by_prop_id.mean()),
    ("Median proposal amount request: {}", totals_by_prop_id.median()),
]
for template, amount in amount_summaries:
    print(template.format(amount))

In [None]:
# Highest unit price overall: the maximum price per proposal id, then the
# maximum of those per-id maxima.
resource_data_raw[['id', 'price']].groupby('id').max().max()

In [None]:
# Per-proposal summary statistics of the line-item totals, one row per id.
total_stat_names = ['count', 'sum', 'min', 'max', 'mean', 'median', 'std']
totals_grouped_by_id = resource_data_raw[['id', 'total']].groupby('id')['total']
res = totals_grouped_by_id.agg(total_stat_names).reset_index()
print(res.head())

In [None]:
# Attach the per-proposal resource statistics (count, sum, min, ...) to both frames.
train_data_raw = train_data_raw.merge(res, on='id')
# Left join for the test frame: an inner join would silently drop any test row
# whose id is missing from the resources file, and the submission needs one
# prediction for every test row. Unmatched rows get NaN statistics instead.
test_data_raw = test_data_raw.merge(res, on='id', how='left')

In [None]:
# Show the first two training rows that contain at least one null value.
train_data_raw[train_data_raw.isnull().any(axis=1)].head(2)

Some of the STDDEV values end up being NAN. This happens when a proposal has only a single resource row, because the sample standard deviation of a single value is undefined. Fill these STDDEV NAN values with 0.0

In [None]:
# Replace missing 'std' values with 0.0 in both frames; other columns are untouched.
values = {'std': 0.0}
for frame in (train_data_raw, test_data_raw):
    frame.fillna(value=values, inplace=True)

Max and average sums of rejected and approved projects

In [None]:
print('Max sum requested for rejected and approved proposals.')
# Largest per-proposal 'sum' within each approval class.
(
    train_data_raw
    .groupby('project_is_approved')[['sum']]
    .max()
    .rename(columns={'sum': 'max'})
)

In [None]:
print('Average sum requested for rejected and approved proposals.')
# Mean per-proposal 'sum' within each approval class.
(
    train_data_raw
    .groupby('project_is_approved')[['sum']]
    .mean()
    .rename(columns={'sum': 'average'})
)

In [None]:
print('Max number of previous proposals: approved vs. rejected')
# Largest previously-posted count within each approval class.
(
    train_data_raw
    .groupby('project_is_approved')[['teacher_number_of_previously_posted_projects']]
    .max()
    .rename(columns={'teacher_number_of_previously_posted_projects': 'teacher_number_of_previously_posted_projects (max)'})
)

In [None]:
print('Average number of previous proposals: approved vs. rejected')
# Mean previously-posted count within each approval class.
(
    train_data_raw
    .groupby('project_is_approved')[['teacher_number_of_previously_posted_projects']]
    .mean()
    .rename(columns={'teacher_number_of_previously_posted_projects': 'teacher_number_of_previously_posted_projects (avg)'})
)

In [None]:
# For every teacher, the largest previously-posted count seen in the training
# data, followed by summary statistics over teachers.
teacher_max_number_of_previous_proposals = (
    train_data_raw
    .groupby('teacher_id')['teacher_number_of_previously_posted_projects']
    .max()
)
previous_proposal_summaries = [
    ("Highest number of previous proposals: {}", teacher_max_number_of_previous_proposals.max()),
    ("Lowest number of previous proposals: {}", teacher_max_number_of_previous_proposals.min()),
    ("Avg number of previous proposals: {}", teacher_max_number_of_previous_proposals.mean()),
    ("Median number of previous proposals: {}", teacher_max_number_of_previous_proposals.median()),
]
for template, statistic in previous_proposal_summaries:
    print(template.format(statistic))

The benchmark used in this project, supplied by Google's engineering education team (https://www.kaggle.com/skleinfeld/getting-started-with-the-donorschoose-data-set), theorizes that a higher value of the feature teacher_number_of_previously_posted_projects may correspond to a higher probability of approval, since a teacher who has submitted more proposals becomes more familiar with how to format a submission. The benchmark resulted in an AUROC score of 0.56522.

The next plot shows that as the number of previous submissions increases, the mean approval rate also tends to increase.

In [None]:
# Mean approval rate for each distinct number of previously posted projects
# (index = number of previous submissions, value = share approved).
teacher_number_previous_submissions_mean = (
    train_data_raw[['teacher_number_of_previously_posted_projects', 'project_is_approved']]
    .groupby('teacher_number_of_previously_posted_projects')
    .mean()
)

# Plot the rate against the number of submissions so the figure matches its
# title and axis labels. The earlier distplot drew a histogram of the rates
# themselves (x = rate, y = density) while labelling the axes as
# submissions / approval rate and hiding every tick.
plt.plot(
    teacher_number_previous_submissions_mean.index,
    teacher_number_previous_submissions_mean['project_is_approved'],
    marker='.',
    linestyle='none',
)
sns.despine()

plt.ylabel('Approval Rate');
plt.xlabel('Number of submissions')
plt.title('Approval rates as number of previously submitted proposals increases');

**EDA of categorical features**

In the training dataset, there are 4 NAN values in the teacher_prefix column. We will just drop these rows

In [None]:
# Drop, in place, every training row whose teacher_prefix is missing.
train_data_raw.dropna(subset=['teacher_prefix'], inplace=True)

The test dataset also has 1 NAN value in the teacher_prefix column. However, we do not want to drop this row as the competition submissions expect the same number of rows (78,035) for predictions. Therefore, we will fill this NAN to be 'Teacher'

In [None]:
# Keep every test row: a missing teacher_prefix is replaced by 'Teacher'
# rather than dropping the row.
values = dict(teacher_prefix='Teacher')
test_data_raw.fillna(value=values, inplace=True)

As shown below, the number of proposals submitted for CA is very high, drops off across the other states, and is low for VT and other low-population states. In spite of this, the approval rates across states vary somewhat, but not by much, and all remain above 80%.

In [None]:
# Proposal counts per state, most frequent state first, split by approval.
plt.figure(figsize=(30,2));
plt.title('Histogram of proposals submitted by state')
states = train_data_raw['school_state']
sns.countplot(x=states, hue=train_data_raw['project_is_approved'], order=states.value_counts().index)

In [None]:
# Mean approval rate per state, highest first, drawn as a bar chart.
state_approval_rates = (
    train_data_raw[['school_state', 'project_is_approved']]
    .groupby('school_state')
    .mean()
    .sort_values(by='project_is_approved', ascending=False)
)
state_approval_rates.plot.bar(figsize=(30,2), grid=True, title='Approval Rates by school_state');

The number of proposals varies by the type of subject categories, and the approval rates show some variance. Note that the last category which has 0 approvals only has 1 submission.

In [None]:
# Proposal counts per subject category, most frequent first, split by approval.
plt.figure(figsize=(30,2))
subject_categories = train_data_raw['project_subject_categories']
sns.countplot(x=subject_categories, order=subject_categories.value_counts().index, hue=train_data_raw['project_is_approved']);
plt.xticks(rotation=90);
plt.title('Histogram of proposals submitted by project_subject_categories')

In [None]:
# Mean approval rate per subject category, highest first, drawn as a bar chart.
category_approval_rates = (
    train_data_raw[['project_subject_categories', 'project_is_approved']]
    .groupby('project_subject_categories')
    .mean()
    .sort_values(by='project_is_approved', ascending=False)
)
category_approval_rates.plot.bar(figsize=(30,2), grid=True, title='Approval Rates by project_subject_categories');

In [None]:
# Approval rate and number of submissions per subject category; show the five
# categories with the lowest rate.
category_approval_stats = train_data_raw.groupby('project_subject_categories')['project_is_approved'].agg(['mean', 'count'])
category_approval_stats.sort_values('mean', ascending=False).tail(5)

As project_subject_subcategories has a huge number of combinations, only the first 30 entries for number of proposals is shown. There is a large variance in proposals for subcategories. The full set of subcategories is shown for approval rates. There is a large variance in approval rates here. Note that the subcategories with highest (100% approval rate) and lowest (0% approval rate) only had one submission each.

In [None]:
# Proposal counts for the 30 most frequent subcategory values, split by approval.
plt.figure(figsize=(30,2))
subject_subcategories = train_data_raw['project_subject_subcategories']
top_30_subcategories = subject_subcategories.value_counts().iloc[:30].index
sns.countplot(x=subject_subcategories, order=top_30_subcategories, hue=train_data_raw['project_is_approved']);
plt.xticks(rotation=90);
plt.title('Histogram of proposals submitted by project_subject_subcategories (30 highest)')

In [None]:
# Mean approval rate per subcategory value, highest first; the axes are turned
# off because there are too many labels to read.
subcategory_approval_rates = (
    train_data_raw[['project_subject_subcategories', 'project_is_approved']]
    .groupby('project_subject_subcategories')
    .mean()
    .sort_values(by='project_is_approved', ascending=False)
)
subcategory_approval_rates.plot.bar(figsize=(30,2), grid=True, title='Approval Rates by project_subject_subcategories (X labels not shown)');
plt.axis('off')
plt.show()

In [None]:
# Approval rate and number of submissions per subcategory value; show the ten
# values with the highest rate.
subcategory_approval_stats = train_data_raw.groupby('project_subject_subcategories')['project_is_approved'].agg(['mean', 'count'])
subcategory_approval_stats.sort_values('mean', ascending=False).head(10)

In [None]:
# Approval rate and number of submissions per subcategory value; show the ten
# values with the lowest rate.
subcategory_approval_stats = train_data_raw.groupby('project_subject_subcategories')['project_is_approved'].agg(['mean', 'count'])
subcategory_approval_stats.sort_values('mean', ascending=False).tail(10)

There is some variance in the number of proposals across grade categories, but the approval rate is similar for each

In [None]:
# Proposal counts per grade category, most frequent first, split by approval.
plt.figure(figsize=(30,2))
grade_categories = train_data_raw['project_grade_category']
sns.countplot(x=grade_categories, order=grade_categories.value_counts().index, hue=train_data_raw['project_is_approved']);
plt.title('Histogram of proposals submitted by project_grade_category')

In [None]:
# Mean approval rate per grade category, highest first, drawn as a bar chart.
grade_approval_rates = (
    train_data_raw[['project_grade_category', 'project_is_approved']]
    .groupby('project_grade_category')
    .mean()
    .sort_values(by='project_is_approved', ascending=False)
)
grade_approval_rates.plot.bar(figsize=(30,2), grid=True, title='Approval Rates by project_grade_category');

There is some variance across teacher prefixes, but the approval rates are mostly similar.

In [None]:
# Proposal counts per teacher prefix, most frequent first, split by approval.
plt.figure(figsize=(30,2))
teacher_prefixes = train_data_raw['teacher_prefix']
sns.countplot(x=teacher_prefixes, order=teacher_prefixes.value_counts().index, hue=train_data_raw['project_is_approved']);
plt.title('Histogram of proposals submitted by teacher_prefix')

In [None]:
# Mean approval rate per teacher prefix, highest first, drawn as a bar chart.
prefix_approval_rates = (
    train_data_raw[['teacher_prefix', 'project_is_approved']]
    .groupby('teacher_prefix')
    .mean()
    .sort_values(by='project_is_approved', ascending=False)
)
prefix_approval_rates.plot.bar(figsize=(30,2), grid=True, title='Approval Rates by teacher_prefix');

**EDA of text features**

The description feature in the resources dataset has NAN values for 292 entries. However, the quantity and price fields for those entries are still valid. We will fill these NAN descriptions with empty text.

In [None]:
# Print the resources frame summary again to see the non-null count per column.
resource_data_raw.info()

In [None]:
# Show the first resource rows that contain at least one null value.
resource_data_raw[resource_data_raw.isnull().any(axis=1)].head()

In [None]:
# Fill only the missing descriptions with empty text. A blanket fillna('')
# would also write '' into any numeric column that has a NaN (quantity, price,
# total), silently turning that column into strings.
resource_data_raw.fillna(value={'description': ''}, inplace=True)
resource_data_raw.info()

Concatenate all of the descriptions for each proposal id together into a pivot table, which will then be merged into the test and training datasets

In [None]:
# One row per proposal id holding all of its resource descriptions joined by ';'.
descriptions_by_id = resource_data_raw.groupby('id')['description']
pivot_table = descriptions_by_id.apply(lambda texts: ';'.join(texts)).reset_index()

In [None]:
# Attach the concatenated resource descriptions to both frames.
train_data_raw = train_data_raw.merge(pivot_table, on='id')
# Left join for the test frame so that no test row is dropped when its id has
# no resources (the submission needs a prediction for every test row); such
# rows get an empty description instead.
test_data_raw = test_data_raw.merge(pivot_table, on='id', how='left')
test_data_raw['description'] = test_data_raw['description'].fillna('')

Many of the project_essay_3 and project_essay_4 entries are NAN. This is expected for submissions after May 17, 2016. Since the old essays 1 & 2 and 3 & 4 correspond to the new essays 1 and 2 respectively, we concatenate each old pair into project_essay_1 and project_essay_2, and then drop columns 3 and 4.

In [None]:
def _merge_legacy_essays(frame):
    """Fold the four-essay layout into two essays, in place.

    For rows where project_essay_3 is present, project_essay_1 becomes
    essay 1 + essay 2 and project_essay_2 becomes essay 3 + essay 4. The
    project_essay_3 and project_essay_4 columns are then dropped from `frame`.

    The pairs are joined with a single space. Joining without a separator
    glued the last word of one essay to the first word of the next, which
    creates a bogus token for any later word-level processing.

    Returns the boolean mask of the rows that were rewritten.
    """
    has_legacy_essays = frame['project_essay_3'].notnull()
    # Snapshot the affected rows before any column is overwritten.
    legacy_rows = frame.loc[has_legacy_essays]
    merged_essay_1 = legacy_rows['project_essay_1'].str.cat(legacy_rows['project_essay_2'], sep=' ')
    merged_essay_2 = legacy_rows['project_essay_3'].str.cat(legacy_rows['project_essay_4'], sep=' ')
    frame.loc[has_legacy_essays, 'project_essay_1'] = merged_essay_1
    frame.loc[has_legacy_essays, 'project_essay_2'] = merged_essay_2
    frame.drop(['project_essay_3', 'project_essay_4'], axis=1, inplace=True)
    return has_legacy_essays


essay_3_4_nonull_filter = _merge_legacy_essays(train_data_raw)
test_essay_3_4_nonull_filter = _merge_legacy_essays(test_data_raw)

In [None]:
# Print the training frame summary to check the columns after the essay merge.
train_data_raw.info()

Collect some stats on the max and mean length of the text features

In [None]:
# Text columns that remain after the essay merge, plus the joined resource descriptions.
text_features_final = ['project_title', 'project_essay_1', 'project_essay_2', 'project_resource_summary', 'description']

# Character length of every text column, separately for approved (1) and
# rejected (0) proposals.
approved_rows = train_data_raw[train_data_raw.project_is_approved == 1]
rejected_rows = train_data_raw[train_data_raw.project_is_approved == 0]

str_df_approved = pd.DataFrame(
    {col: approved_rows[col].str.len() for col in text_features_final},
    columns=text_features_final,
)
str_df_rejected = pd.DataFrame(
    {col: rejected_rows[col].str.len() for col in text_features_final},
    columns=text_features_final,
)

In [None]:
# min / max / mean character length per text column, for each approval class.
length_stat_names = ['min', 'max', 'mean']

str_stats_approved_df = pd.DataFrame(
    {col: str_df_approved[col].agg(length_stat_names) for col in str_df_approved},
    columns=list(str_df_approved),
)
str_stats_rejected_df = pd.DataFrame(
    {col: str_df_rejected[col].agg(length_stat_names) for col in str_df_rejected},
    columns=list(str_df_rejected),
)

In [None]:
# Display the min / max / mean text lengths for approved proposals.
str_stats_approved_df

In [None]:
# Display the min / max / mean text lengths for rejected proposals.
str_stats_rejected_df

In [None]:
# Summary statistics (describe) of project_title for each value of project_is_approved.
train_data_raw[['project_is_approved', 'project_title']].groupby('project_is_approved').describe()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Word frequencies of project_essay_1 over all training proposals
# (English stop words removed), most frequent word first.
corpus = train_data_raw.project_essay_1
vec = CountVectorizer(stop_words='english')
bag_of_words = vec.fit_transform(corpus)
# Column sums of the document-term matrix: one total count per vocabulary index.
sum_words = bag_of_words.sum(axis=0)
full_words_freq = sorted(
    ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Word frequencies of project_essay_1 over approved proposals only
# (English stop words removed), most frequent word first.
corpus = train_data_raw[train_data_raw.project_is_approved==1].project_essay_1
vec = CountVectorizer(stop_words='english')
bag_of_words = vec.fit_transform(corpus)
# Column sums of the document-term matrix: one total count per vocabulary index.
sum_words = bag_of_words.sum(axis=0)
approved_words_freq = sorted(
    ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Word frequencies of project_essay_1 over rejected proposals only
# (English stop words removed), most frequent word first.
corpus = train_data_raw[train_data_raw.project_is_approved==0].project_essay_1
vec = CountVectorizer(stop_words='english')
bag_of_words = vec.fit_transform(corpus)
# Column sums of the document-term matrix: one total count per vocabulary index.
sum_words = bag_of_words.sum(axis=0)
rejected_words_freq = sorted(
    ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Split the 20 most frequent (word, count) pairs of each ranking into
# parallel lists: *_x holds the words, *_y holds the counts.
top_full = full_words_freq[:20]
full_list_x = [word for word, count in top_full]
full_list_y = [count for word, count in top_full]

top_approved = approved_words_freq[:20]
approved_list_x = [word for word, count in top_approved]
approved_list_y = [count for word, count in top_approved]

top_rejected = rejected_words_freq[:20]
rejected_list_x = [word for word, count in top_rejected]
rejected_list_y = [count for word, count in top_rejected]

In [None]:
# Side-by-side horizontal bar charts of the top essay-1 words for approved
# (left) and rejected (right) proposals. sns.set changes the font scale for
# the rest of the session.
sns.set(font_scale=2);
f, (ax2, ax3) = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False, figsize=(40,10));
word_panels = [
    (ax2, approved_list_y, approved_list_x, "Essay1 top word frequencies in approved proposals"),
    (ax3, rejected_list_y, rejected_list_x, "Essay1 top word frequencies in rejected proposals"),
]
for panel_axis, counts, words, caption in word_panels:
    sns.barplot(x=counts, y=words, ax=panel_axis);
    panel_axis.set(xlabel=caption);
plt.suptitle('Most frequent words in essay1');

In [None]:
# Character count of every text column, next to the approval label.
str_df_charachters = pd.DataFrame({'project_is_approved': train_data_raw.project_is_approved})
for text_col in text_features_final:
    char_counts = train_data_raw[text_col].str.len()
    str_df_charachters[text_col] = char_counts

In [None]:
# Whitespace-separated word count of every text column, next to the approval label.
str_df_num_words = pd.DataFrame({'project_is_approved': train_data_raw.project_is_approved})
for text_col in text_features_final:
    word_lists = train_data_raw[text_col].str.split()
    str_df_num_words[text_col] = word_lists.str.len()

In [None]:
# Word density = number of words divided by number of characters, per text column.
str_df_word_density = pd.DataFrame({'project_is_approved': train_data_raw.project_is_approved})
for text_col in text_features_final:
    str_df_word_density[text_col] = str_df_num_words[text_col].div(str_df_charachters[text_col])

In [None]:
# Display the first rows of the word-density frame.
str_df_word_density.head()

In [None]:
# min / max / mean word density of every text column for each value of project_is_approved.
str_df_word_density.groupby('project_is_approved').agg(['min', 'max', 'mean'])

Explore some visualizations for the Free-Form Visualization section of the Capstone Report

In [None]:
# Violin plot of the project_title word density (words per character) for
# rejected vs. approved proposals.
fig, axes = plt.subplots();
# Build the groups rejected-first so the violins appear in the order the axis
# label announces (they were drawn approved-first before). Rows whose density
# is NaN (0 words / 0 characters) are dropped because they carry no
# information for the violin.
density_groups = [
    str_df_word_density.loc[str_df_word_density.project_is_approved == label, 'project_title'].dropna()
    for label in (0, 1)
]
axes.violinplot(dataset=density_groups);
axes.set_title('project_title Word Densities');
axes.yaxis.grid(True);
axes.set_xlabel('Rejected/Approved');
axes.set_ylabel('Words per character');
# violinplot places the groups at x = 1, 2 by default; name them explicitly.
axes.set_xticks([1, 2]);
axes.set_xticklabels(['Rejected', 'Approved']);

In [None]:
# Violin plot of the project_essay_1 word density (words per character) for
# rejected vs. approved proposals.
fig, axes = plt.subplots();
# Build the groups rejected-first and label the ticks, so each violin can be
# identified. Rows whose density is NaN (0 words / 0 characters) are dropped
# because they carry no information for the violin.
density_groups = [
    str_df_word_density.loc[str_df_word_density.project_is_approved == label, 'project_essay_1'].dropna()
    for label in (0, 1)
]
axes.violinplot(dataset=density_groups);
axes.set_title('project_essay_1 Word Densities');
axes.yaxis.grid(True);
axes.set_xlabel('Rejected/Approved');
# Replaces the placeholder y-label 'snark' with what the axis actually shows.
axes.set_ylabel('Words per character');
# violinplot places the groups at x = 1, 2 by default; name them explicitly.
axes.set_xticks([1, 2]);
axes.set_xticklabels(['Rejected', 'Approved']);

In [None]:
# Violin plot of the project_essay_2 word density (words per character) for
# rejected vs. approved proposals.
fig, axes = plt.subplots();
# Build the groups rejected-first so the violins appear in the order the axis
# label announces (they were drawn approved-first before). Rows whose density
# is NaN (0 words / 0 characters) are dropped because they carry no
# information for the violin.
density_groups = [
    str_df_word_density.loc[str_df_word_density.project_is_approved == label, 'project_essay_2'].dropna()
    for label in (0, 1)
]
axes.violinplot(dataset=density_groups);
axes.set_title('project_essay_2 Word Densities');
axes.yaxis.grid(True);
axes.set_xlabel('Rejected/Approved');
axes.set_ylabel('Words per character');
# violinplot places the groups at x = 1, 2 by default; name them explicitly.
axes.set_xticks([1, 2]);
axes.set_xticklabels(['Rejected', 'Approved']);

In [None]:
# Violin plot of the project_resource_summary word density (words per
# character) for rejected vs. approved proposals.
fig, axes = plt.subplots();
# Build the groups rejected-first so the violins appear in the order the axis
# label announces (they were drawn approved-first before). Rows whose density
# is NaN (0 words / 0 characters) are dropped because they carry no
# information for the violin.
density_groups = [
    str_df_word_density.loc[str_df_word_density.project_is_approved == label, 'project_resource_summary'].dropna()
    for label in (0, 1)
]
axes.violinplot(dataset=density_groups);
axes.set_title('project_resource_summary Word Densities');
axes.yaxis.grid(True);
axes.set_xlabel('Rejected/Approved');
axes.set_ylabel('Words per character');
# violinplot places the groups at x = 1, 2 by default; name them explicitly.
axes.set_xticks([1, 2]);
axes.set_xticklabels(['Rejected', 'Approved']);

In [None]:
# Violin plot of the resource description word density (words per character)
# for rejected vs. approved proposals.
fig, axes = plt.subplots();
# Build the groups rejected-first so the violins appear in the order the axis
# label announces (they were drawn approved-first before). Rows whose density
# is NaN (an empty description gives 0 words / 0 characters) are dropped
# because they carry no information for the violin.
density_groups = [
    str_df_word_density.loc[str_df_word_density.project_is_approved == label, 'description'].dropna()
    for label in (0, 1)
]
axes.violinplot(dataset=density_groups);
axes.set_title('description Word Densities');
axes.yaxis.grid(True);
axes.set_xlabel('Rejected/Approved');
axes.set_ylabel('Words per character');
# violinplot places the groups at x = 1, 2 by default; name them explicitly.
axes.set_xticks([1, 2]);
axes.set_xticklabels(['Rejected', 'Approved']);

In [None]:
# Violin plot of the natural log of the per-proposal 'mean' column, rejected
# proposals first, approved proposals second.
fig, axes = plt.subplots();
approval_flag = train_data_raw.project_is_approved
log_mean_rejected = np.log(train_data_raw.loc[approval_flag == 0, 'mean'])
log_mean_approved = np.log(train_data_raw.loc[approval_flag == 1, 'mean'])
axes.violinplot(dataset=[log_mean_rejected, log_mean_approved]);
axes.set_title('Project price means');
axes.yaxis.grid(True);
axes.set_xlabel('Rejected/Approved');
axes.set_ylabel('');
axes.set_xticklabels([]);

In [None]:
# Mean approval rate for each value of the 'month' column.
approval_rates_by_month = (
    train_data_raw
    .groupby('month')[['project_is_approved']]
    .mean()
    .reset_index()
)
approval_rates_by_month

In [None]:
# Bar chart of the approval rate per month.
months = approval_rates_by_month.month
monthly_rates = approval_rates_by_month.project_is_approved
ax1 = sns.barplot(x=months, y=monthly_rates)
ax1.set_xlabel('Months');
ax1.set_ylabel('Approval Rates');

In [None]:
# Per subcategory value: approval rate (project_is_approved_x after the merge)
# and number of submissions (project_is_approved_y), sorted by ascending rate.
subcategory_groups = (
    train_data_raw[['project_subject_subcategories', 'project_is_approved']]
    .groupby('project_subject_subcategories')
)
approval_rates_by_subcategory = subcategory_groups.mean().reset_index()
submission_counts_by_subcategory = subcategory_groups.count().reset_index()
subcategory_df = (
    approval_rates_by_subcategory
    .merge(submission_counts_by_subcategory, on='project_subject_subcategories')
    .sort_values(by='project_is_approved_x', ascending=True)
)

In [None]:
# Bar chart of the number of submissions (y) against the approval rate (x) of
# each subcategory value; the y axis is clipped at 2000.
plt.figure(figsize=(30,10));
ax1 = sns.barplot(x=subcategory_df.project_is_approved_x, y=subcategory_df.project_is_approved_y, palette='dark');
plt.ylim(0,2000)
# Blank every tick label, then mark only the first, middle and last bar.
# NOTE(review): the '.5' label marks the middle bar position, which is not
# necessarily a 0.5 approval rate - confirm this is the intended reading.
labels = [''] * len(ax1.get_xticklabels())
labels[0]=0
labels[-1] = 1
halfwaythere = len(labels)//2
labels[halfwaythere] = .5
ax1.set_xticklabels(labels);
ax1.set(ylabel='Number of submissions', xlabel='Approval Rates')

ax1.set_title('Number of submissions per approval rate');
# Turn the grid on for THIS figure's axes. The cell used `axes`, a variable
# left over from an earlier cell, so the grid went to an unrelated figure (or
# raised NameError on a fresh kernel if that cell had not run).
ax1.yaxis.grid(True);

In [None]:
# Distribution of the number of submissions per subcategory value (500 bins).
plt.figure(figsize=(30,10));
submissions_per_subcategory = subcategory_df.project_is_approved_y
axes = sns.distplot(submissions_per_subcategory, bins=500);
axes.set_xlabel('Number of proposal submissions');
axes.set_ylabel('');

In [None]:
# Submission counts for every subcategory value, most frequent first; the x
# tick labels are hidden because there are too many to read.
plt.figure(figsize=(30,20))
subcategories = train_data_raw['project_subject_subcategories']
g = sns.countplot(x=subcategories, order=subcategories.value_counts().index);
g.set(xlabel='Project Subcategories, sorted by number of submissions (407 subcategory labels turned off)', ylabel='Number of proposal submissions')
plt.xticks([]);
plt.title('Histogram of number of proposals submitted by project_subject_subcategories')

In [None]:
print("Categories with 10 lowest approval rates, and the number of submissions for those subcategories")
# subcategory_df is sorted by ascending approval rate, so the first rows are the lowest.
subcategory_df.head(10)

In [None]:
print("Categories with 50 highest approval rates, and the number of submissions for those subcategories")
# subcategory_df is sorted by ascending approval rate, so the last rows are the highest.
subcategory_df.tail(50)

In [None]:
# Number of distinct project_subject_subcategories values (one row each in subcategory_df).
len(subcategory_df)

In [None]:
# Base subcategories: the distinct values that contain no comma (i.e. are not
# a combination), ordered by the length of the name.
is_single_subcategory = ~train_data_raw.project_subject_subcategories.str.contains(',')
single_subcategory_names = train_data_raw[is_single_subcategory].project_subject_subcategories.unique()
subcategory_base_values = sorted(single_subcategory_names, key=len)

In [None]:
import operator

# Submission count and approval rate for every base subcategory, counting a
# project whenever the base subcategory is one of its comma-separated entries.
#
# The entries are split and compared as whole names. The previous
# str.contains(subcat) was a substring (and regex) search, so a base name that
# also occurs inside a longer subcategory name was counted for both.
subcat_name_sets = train_data_raw.project_subject_subcategories.str.split(',').apply(
    lambda names: {name.strip() for name in names}
)

subcat_dict = {}
subcat_approval_rate_dict = {}
for subcat in subcategory_base_values:
    has_subcat = subcat_name_sets.apply(lambda names: subcat in names)
    approvals = train_data_raw.loc[has_subcat, 'project_is_approved']
    subcat_dict[subcat] = approvals.count()
    subcat_approval_rate_dict[subcat] = approvals.mean()

# (name, value) pairs sorted by ascending value.
subcat_list = sorted(subcat_dict.items(), key=operator.itemgetter(1))
subcat_approval_rate_list = sorted(subcat_approval_rate_dict.items(), key=operator.itemgetter(1))

In [None]:
# Tabulate the (subcategory, submission count) pairs.
subcat_count_columns = ['project_subject_subcategories', 'num_submissions']
subcat_df = pd.DataFrame.from_records(subcat_list, columns=subcat_count_columns)
subcat_df

In [None]:
# Bar chart of the number of submissions per base subcategory.
plt.figure(figsize=(30,5))
g = sns.barplot(x=subcat_df.project_subject_subcategories, y=subcat_df.num_submissions);
g.set(xlabel='Subcategories', ylabel='Number of submissions');
# Rotate the labels by 90 degrees, as the approval-rate chart for the same
# categories does; the previous rotation=9 left the long names overlapping.
g.set_xticklabels(g.get_xticklabels(), rotation=90);
plt.title('Number of submissions per base category in project_subject_subcategories');

In [None]:
# Tabulate the (subcategory, approval rate) pairs.
subcat_rate_columns = ['project_subject_subcategories', 'approval_rates']
subcat_approval_rate_df = pd.DataFrame.from_records(subcat_approval_rate_list, columns=subcat_rate_columns)
subcat_approval_rate_df

In [None]:
# Bar chart of the approval rate per base subcategory, labels rotated 90 degrees.
plt.figure(figsize=(30,5));
base_subcategory_names = subcat_approval_rate_df.project_subject_subcategories
base_subcategory_rates = subcat_approval_rate_df.approval_rates
g = sns.barplot(x=base_subcategory_names, y=base_subcategory_rates);
g.set_xlabel('Subcategories');
g.set_ylabel('Approval Rates');
g.set_xticklabels(g.get_xticklabels(), rotation=90);
plt.title('Approval rate per base category in project_subject_subcategories');

In [None]:
# Combine submission counts and approval rates per base subcategory into one table.
subcat_df = pd.merge(subcat_df, subcat_approval_rate_df, on='project_subject_subcategories')
subcat_df