In [10]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.offline as py
from plotly import tools
import seaborn as sns

import datetime as dt
import pandas as pd
import numpy as np
import warnings
import string

# Session-wide configuration for the notebook.
# Seaborn's current colour palette, kept for later plotting cells.
color = sns.color_palette()
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn/plotly.
warnings.filterwarnings('ignore')
# Initialise plotly's offline notebook mode so py.iplot can render figures.
py.init_notebook_mode(connected=True)
%matplotlib inline
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# Show up to 999 columns when a frame is displayed.
pd.options.display.max_columns = 999
# Characters treated as punctuation by the text helpers below.
punctuation = string.punctuation

# Load the custom stop-word list, one entry per line. The context manager
# closes the file handle right away instead of leaving it open until the
# garbage collector gets to it.
with open('data/stopwords.txt') as stopword_file:
    stop_words = stopword_file.read().strip().split("\n")
# Strip any stray carriage returns that survived the split.
stop_words = [x.replace("\r","") for x in stop_words]

from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Name of the key column; it becomes the index of every frame read below.
id_column = "id"
# Placeholder token for missing text (consumed by later cells).
missing_token = " UNK "

# Both project files are read the same way: indexed by id, with the
# submission timestamp parsed into a datetime column.
_project_csv_options = dict(index_col=id_column,
                            parse_dates=["project_submitted_datetime"])
train = pd.read_csv("data/train.csv", **_project_csv_options)
test = pd.read_csv("data/test.csv", **_project_csv_options)

# Stack train on top of test so features are engineered on both at once.
df = pd.concat([train, test], axis=0)

In [3]:
# Resource requests, indexed by project id; rows are aggregated per id below.
rc = pd.read_csv("data/resources.csv", index_col=id_column)
# Only the free-text column receives the placeholder. Filling the whole frame
# would write the string token into numeric columns and corrupt the
# arithmetic and aggregations that follow.
rc['description'] = rc['description'].fillna(missing_token)
rc['total_price'] = rc.quantity * rc.price
# The dict form of agg applies one function per column, so price and quantity
# are duplicated under the names of the extra aggregates wanted.
rc['price_sum'] = rc['price'].copy()
rc['quantity_sum'] = rc['quantity'].copy()
rc['quantity_count'] = rc['quantity'].copy()

# One row per project: counts, sums and means of the numeric columns, and all
# descriptions of the project joined with the placeholder token.
rc = rc.reset_index().groupby(id_column).agg(dict(quantity_count='count', price_sum='sum', quantity_sum='sum', 
                                                  total_price='mean', quantity='mean', price='mean', 
                                                  description=lambda x: missing_token.join(x)))
# Inner join on the id index: projects without any resource row are dropped.
df = pd.merge(df, rc, left_index=True, right_index=True, how= "inner")

In [4]:
def cleanup_text(x):
    """Normalise a raw text value.

    Literal two-character escape sequences (a backslash followed by ``r``,
    ``t`` or ``n``) are replaced by a space, every character found in the
    module-level ``punctuation`` string is removed, and the result is
    lower-cased.
    """
    # The escapes must go first: the backslash itself is punctuation.
    for escaped in ("\\r", "\\t", "\\n"):
        x = x.replace(escaped, " ")
    without_punctuation = x.translate(str.maketrans("", "", punctuation))
    return without_punctuation.lower()

def get_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Falls back to 0.0 when TextBlob fails on the given input.

    Raises:
        NameError: if ``TextBlob`` is not in scope. The previous blanket
            handler swallowed this too, so a missing import silently turned
            the whole feature into a constant 0.0.
    """
    try:
        pol = TextBlob(text).sentiment.polarity
    except NameError:
        # A missing dependency is a setup error, not a property of the text.
        raise
    except Exception:
        pol = 0.0
    return pol

def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity of ``text``.

    Falls back to 0.0 when TextBlob fails on the given input.

    Raises:
        NameError: if ``TextBlob`` is not in scope. The previous bare
            ``except:`` hid this (and would also have swallowed
            KeyboardInterrupt), turning the feature into a constant 0.0.
    """
    try:
        subj = TextBlob(text).sentiment.subjectivity
    except NameError:
        # A missing dependency is a setup error, not a property of the text.
        raise
    except Exception:
        subj = 0.0
    return subj

# Part-of-speech tag codes grouped by coarse word class; the keys are the
# values accepted as `flag` by pos_check.
pos_dic = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

def pos_check(x, flag):
    """Count the tokens of ``x`` whose part-of-speech tag is in group ``flag``.

    Args:
        x: text to tag with TextBlob.
        flag: a key of ``pos_dic`` ('noun', 'pron', 'verb', 'adj', 'adv').

    Returns:
        Number of matching tokens. If tagging fails part-way, the count
        reached so far is returned (0 if nothing was tagged).

    Raises:
        KeyError: if ``flag`` is not a key of ``pos_dic``.
        NameError: if ``TextBlob`` is not in scope.
    """
    # Resolved outside the try block so a misspelt group fails loudly instead
    # of producing a column of zeros.
    wanted_tags = pos_dic[flag]
    cnt = 0
    try:
        for _token, tag in TextBlob(x).tags:
            if tag in wanted_tags:
                cnt += 1
    except NameError:
        # A missing dependency is a setup error, not a property of the text.
        raise
    except Exception:
        # Best effort: text TextBlob cannot tag keeps the count so far.
        pass
    return cnt

def add_count_cat(tx, mapping):
    """Average the ``mapping`` values of the comma-separated labels in ``tx``.

    Each label is stripped of surrounding whitespace before the lookup.
    """
    labels = tx.split(",")
    total = 0
    for label in labels:
        total += mapping[label.strip()]
    return total / len(labels)

def getCountVar(compute_df, var_name, splitter=False):
    """Add a ``Count_<var_name>`` frequency column to ``compute_df`` in place.

    Args:
        compute_df: frame that is both counted and extended.
        var_name: name of the column to frequency-encode.
        splitter: when True, each cell is treated as a comma-separated list
            of labels; labels are counted individually and the cell receives
            the mean count of its labels (via ``add_count_cat``). When False,
            each row receives the number of rows sharing its value, or -1
            where the value is missing.

    Returns:
        None; the new column is written onto ``compute_df``.
    """
    if splitter:
        # Imported locally so the function does not depend on a later cell
        # having imported Counter.
        from collections import Counter

        # Count over the frame that was passed in; the previous version read
        # the notebook-global `df` here and ignored the argument.
        value_counts = dict(Counter(
            label.strip()
            for cell in compute_df[var_name]
            for label in cell.split(",")
        ))
        compute_df["Count_"+var_name] = compute_df[var_name].apply(
            lambda x: add_count_cat(x, value_counts))
    else:
        # value_counts skips missing values, so those rows map to NaN and are
        # then set to -1, the same sentinel the merge-based version produced.
        occurrences = compute_df[var_name].value_counts()
        compute_df["Count_"+var_name] = (
            compute_df[var_name].map(occurrences).fillna(-1).to_numpy())

In [8]:
# Essays 3 and 4 may be missing; give them the placeholder so that the string
# operations below work on every row.
for optional_essay in ('project_essay_3', 'project_essay_4'):
    df[optional_essay] = df[optional_essay].fillna(missing_token)

essay_columns = ['project_essay_{}'.format(n) for n in range(1, 5)]

# Character length of each essay and of the title.
for n, essay_column in enumerate(essay_columns, start=1):
    df["essay{}_len".format(n)] = df[essay_column].map(len)
df["title_len"] = df['project_title'].map(len)

# The four essays of a project joined by single spaces into one text field.
df['text'] = df.apply(
    lambda row: ' '.join(str(row[essay_column]) for essay_column in essay_columns),
    axis=1)

In [11]:
# Surface statistics of the combined essay text.
# Set lookups are O(1); testing against the stop-word list directly scans the
# whole list for every word of every document.
stop_word_lookup = set(stop_words)
punctuation_lookup = set(punctuation)

df['word_count'] = df['text'].apply(lambda x: len(x.split()))
df['char_count'] = df['text'].apply(len)
# The +1 keeps the ratio finite when a text has no words.
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['stopword_count'] = df['text'].apply(
    lambda x: sum(1 for wrd in x.split() if wrd.lower() in stop_word_lookup))
df['punctuation_count'] = df['text'].apply(
    lambda x: sum(1 for ch in x if ch in punctuation_lookup))
df['upper_case_word_count'] = df['text'].apply(
    lambda x: sum(1 for wrd in x.split() if wrd.isupper()))
df['title_word_count'] = df['text'].apply(
    lambda x: sum(1 for wrd in x.split() if wrd.istitle()))

In [12]:
# Run cleanup_text over every free-text column, overwriting it in place.
# The combined 'text' column is not in the list and is left as it is.
columns_to_clean = (
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_title',
    'description',
    'project_resource_summary',
)
for text_column in columns_to_clean:
    df[text_column] = df[text_column].apply(cleanup_text)

In [13]:
# Calendar features derived from the submission timestamp:
# (new column name, attribute of the pandas .dt accessor).
submitted = df["project_submitted_datetime"].dt
calendar_features = (
    ("Year", "year"),
    ("Month", "month"),
    ("Weekday", "weekday"),
    ("Hour", "hour"),
    ("Month_Day", "day"),
    ("Year_Day", "dayofyear"),
)
for feature_name, accessor_attribute in calendar_features:
    df[feature_name] = getattr(submitted, accessor_attribute)

In [None]:
# Part-of-speech counts per project text.
# No visible cell creates a 'temp_text' column, so indexing it unconditionally
# raises KeyError on a fresh run. It is used when present; otherwise the
# combined essay text in 'text' is tagged instead.
pos_source_column = 'temp_text' if 'temp_text' in df.columns else 'text'
for pos_group in ('noun', 'verb', 'adj', 'adv', 'pron'):
    df[pos_group + '_count'] = df[pos_source_column].apply(
        lambda x, group=pos_group: pos_check(x, group))

In [None]:
# Sentiment features per project text.
# No visible cell creates a 'temp_text' column, so indexing it unconditionally
# raises KeyError on a fresh run. It is used when present; otherwise the
# combined essay text in 'text' is scored instead.
sentiment_source_column = 'temp_text' if 'temp_text' in df.columns else 'text'
df['sent_polarity'] = df[sentiment_source_column].apply(get_polarity)
df['sent_subjectivity'] = df[sentiment_source_column].apply(get_subjectivity)

In [None]:
# Title followed by the four essays, joined by single spaces.
article_columns = ['project_title', 'project_essay_1', 'project_essay_2',
                   'project_essay_3', 'project_essay_4']
df['article_text'] = df.apply(
    lambda row: ' '.join(str(row[name]) for name in article_columns), axis=1)


# Aggregated resource descriptions followed by the resource summary.
resource_columns = ['description', 'project_resource_summary']
df['resource_text'] = df.apply(
    lambda row: ' '.join(str(row[name]) for name in resource_columns), axis=1)


complete_text = df['text']
title_text = df['project_title']
resource_text = df['resource_text']

# `traindex` was used below without being defined in any visible cell. It is
# taken to be the ids of the training rows; the intersection keeps only ids
# that are still in df after the inner merge with the resources.
traindex = df.index.intersection(train.index)


def _fit_tfidf(corpus, analyzer, max_features):
    """Fit a 1- to 3-gram TF-IDF model on ``corpus``.

    Returns the fitted vectorizer and the TF-IDF matrix of ``corpus``.
    """
    options = dict(max_features=max_features, analyzer=analyzer,
                   ngram_range=(1, 3), dtype=np.float32)
    if analyzer == 'word':
        # scikit-learn applies stop_words only to the word analyzer, so it is
        # not passed for character n-grams.
        options['stop_words'] = 'english'
    vectorizer = TfidfVectorizer(**options)
    return vectorizer, vectorizer.fit_transform(corpus)


# (name, corpus, vocabulary size) for each text field.
tfidf_specs = (
    ('complete', complete_text, 8000),
    ('title', title_text, 3000),
    ('resource', resource_text, 4000),
)

word_vectorizers, word_tfidf = {}, {}
char_vectorizers, char_tfidf = {}, {}
for field_name, corpus, vocabulary_size in tfidf_specs:
    train_corpus = corpus.loc[traindex]
    word_vectorizers[field_name], word_tfidf[field_name] = _fit_tfidf(
        train_corpus, 'word', vocabulary_size)
    char_vectorizers[field_name], char_tfidf[field_name] = _fit_tfidf(
        train_corpus, 'char', vocabulary_size)

# Word-level matrices. These used to be assigned to tfidf_complete /
# tfidf_title / tfidf_resource and were then overwritten by the char-level
# results, so they were lost.
tfidf_complete_word = word_tfidf['complete']
tfidf_title_word = word_tfidf['title']
tfidf_resource_word = word_tfidf['resource']

# The unsuffixed names keep the values they ended up with before: the
# char-level matrices, and the vectorizers fitted on the resource text.
tfidf_complete = char_tfidf['complete']
tfidf_title = char_tfidf['title']
tfidf_resource = char_tfidf['resource']
vect_word = word_vectorizers['resource']
char_word = char_vectorizers['resource']

In [None]:
# NOTE(review): `tqdm`, `text` and `sequence` are not imported in any visible
# cell (presumably tqdm and keras.preprocessing) - confirm the imports before
# running this cell on a fresh kernel.
xtrain = train.project_title.values
xtest = test.project_title.values

# Width of the vectors in the .vec file and of the embedding matrix.
embedding_dim = 300
# Optional cap on the number of vectors to load (None = whole file). The
# previous loop stopped after 9 lines, which looks like a debugging leftover
# and left almost every row of the embedding matrix at zero.
max_vectors = None

print ("load the fast text vectors in a dictionary")
embeddings_index = {}
count = 0

# The context manager closes the file even if parsing a line fails.
with open('data/wiki-news-300d-1M.vec', encoding="utf8") as f:
    for line in tqdm(f):
        # Split from the right so exactly embedding_dim numbers are taken.
        values = line.rstrip().rsplit(' ', embedding_dim)
        if len(values) != embedding_dim + 1:
            # Not "<word> <embedding_dim numbers>": e.g. the header line of a
            # .vec file. Storing it would put a wrongly sized vector in the
            # index.
            continue
        count += 1
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        if max_vectors is not None and count >= max_vectors:
            break
print('Found %s word vectors.' % len(embeddings_index))

print ("keras preprocessing")
token = text.Tokenizer(num_words=100000)
max_len = 300

# The vocabulary is learned from the training titles only.
token.fit_on_texts(list(xtrain))
xtrain_seq = token.texts_to_sequences(xtrain)
xtest_seq = token.texts_to_sequences(xtest)

print ("zero pad the sequences")
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xtest_pad = sequence.pad_sequences(xtest_seq, maxlen=max_len)

word_index = token.word_index

print ("create an embedding matrix for the words we have in the dataset")
# One extra row beyond len(word_index); words without a pretrained vector
# keep an all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# NOTE(review): `CountVectorizer`, `LatentDirichletAllocation`, `tokenize` and
# `combined_sample` are not defined in any visible cell, and no visible cell
# creates an 'item_description' column - confirm where they come from.

# Bag of 1- and 2-grams over the item descriptions.
count_vectorizer_options = dict(
    min_df=4,
    max_features=180000,
    tokenizer=tokenize,
    ngram_range=(1,2),
)
cvectorizer = CountVectorizer(**count_vectorizer_options)
cvz = cvectorizer.fit_transform(combined_sample['item_description'])

# 20-topic LDA fitted on the count matrix.
lda_options = dict(
    n_components=20,
    learning_method='online',
    max_iter=20,
    random_state=42,
)
lda_model = LatentDirichletAllocation(**lda_options)
X_topics = lda_model.fit_transform(cvz)

In [None]:
# Print and collect the highest-weighted vocabulary terms of every LDA topic.
n_top_words = 10
topic_summaries = []

topic_word = lda_model.components_  # get the topic words
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
if hasattr(cvectorizer, 'get_feature_names_out'):
    vocab = list(cvectorizer.get_feature_names_out())
else:
    vocab = cvectorizer.get_feature_names()
# Converted once; the loop previously rebuilt this array for every topic.
vocab_array = np.asarray(vocab)

for i, topic_dist in enumerate(topic_word):
    # Indices of the n_top_words largest weights, largest first.
    top_term_ids = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_array[top_term_ids]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' | '.join(topic_words)))

In [None]:
# Univariate Analysis

In [None]:
# target variable - is_approved

# Count of each value of the approval flag (missing values are not counted).
is_approved = df["project_is_approved"].value_counts()
labels = (np.array(is_approved.index))
# Share of each class in percent. The denominator used to be the undefined
# name `approval_dist`, which raised NameError.
sizes = (np.array((is_approved / is_approved.sum())*100))

trace = go.Bar(x=labels, y=sizes, marker=dict(color=['green', 'red']))
layout = go.Layout(title = "Project Approval Distribution", width = 600, height=500)
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig)

In [None]:
# distribution of the submission-time features (year, month, weekday, hour, day of month, day of year)

def get_label_sizes(colname):
    """Build a bar trace of the percentage share of each value in ``df[colname]``.

    The x values are the distinct column values as strings, in value_counts
    order; the y values are their shares of the counted rows in percent.
    """
    counts = df[colname].value_counts()
    value_labels = np.array([str(value) for value in list(counts.index)])
    percentages = np.array((counts / counts.sum()) * 100)
    return go.Bar(x=value_labels, y=percentages)
# One percentage bar chart per calendar feature.
time_columns = ('Year', 'Month', 'Weekday', 'Hour', 'Month_Day', 'Year_Day')
trace1, trace2, trace3, trace4, trace5, trace6 = [
    get_label_sizes(time_column) for time_column in time_columns
]


# Fill the 3x2 grid row by row: slot 0 -> (1, 1), slot 1 -> (1, 2), ...
fig = tools.make_subplots(rows=3, cols=2)
for slot, trace in enumerate((trace1, trace2, trace3, trace4, trace5, trace6)):
    fig.append_trace(trace, slot // 2 + 1, slot % 2 + 1)

fig['layout'].update(height=800, width=1000, title='')
py.iplot(fig, filename='simple-subplot')

In [None]:
# distribution of the teacher, school and project category features

# get_label_sizes is defined in the preceding cell; the identical second
# definition that used to sit here only shadowed it and has been dropped,
# along with the commented-out trace3 lines.
trace1 = get_label_sizes('teacher_prefix')
trace2 = get_label_sizes('school_state')
trace4 = get_label_sizes('project_grade_category')
trace5 = get_label_sizes('project_subject_categories')
trace6 = get_label_sizes('project_subject_subcategories')

# (trace, row, col) placements in the 3x2 grid. Slot (2, 1) stays empty, as
# it did before.
placements = (
    (trace1, 1, 1),
    (trace2, 1, 2),
    (trace4, 2, 2),
    (trace5, 3, 1),
    (trace6, 3, 2),
)
fig = tools.make_subplots(rows=3, cols=2)
for trace, row, col in placements:
    fig.append_trace(trace, row, col)

fig['layout'].update(height=800, width=1000, title='')
py.iplot(fig, filename='simple-subplot')

In [None]:
# Histograms of the per-project mean price and quantity, each on the raw
# scale (left column) and the natural-log scale (right column).
histogram_options = dict(nbinsx=50, opacity=0.75)
trace1 = go.Histogram(x=df["price"], **histogram_options)
trace2 = go.Histogram(x=np.log(df["price"]), **histogram_options)
trace3 = go.Histogram(x=df["quantity"], **histogram_options)
trace4 = go.Histogram(x=np.log(df["quantity"]), **histogram_options)

# Fill the 2x2 grid row by row.
fig = tools.make_subplots(rows=2, cols=2)
for slot, trace in enumerate((trace1, trace2, trace3, trace4)):
    fig.append_trace(trace, slot // 2 + 1, slot % 2 + 1)

fig['layout'].update(height=800, width=1000, title='')
py.iplot(fig, filename='simple-subplot')

In [None]:
from collections import Counter

def generate_wordcloud(tup):
    """Return a WordCloud generated from the string form of ``tup``.

    The cloud has a white background, at most 50 words, a maximum font size
    of 40 and a fixed random state.
    """
    cloud = WordCloud(
        background_color='white',
        max_words=50,
        max_font_size=40,
        random_state=42,
    )
    return cloud.generate(str(tup))


# The 100 most frequent values of each text column. Counter receives whole
# cell values, so it counts complete texts, not individual words.
article1, article2, article3, article4, article5, article6 = [
    Counter(df[text_column]).most_common(100)
    for text_column in ('project_essay_1', 'project_essay_2', 'project_essay_3',
                        'project_essay_4', 'project_title', 'description')
]

fig, axes = plt.subplots(3, 2, figsize=(30, 15))

# (data, panel title, extra imshow options), in row-major grid order. Only
# the first panel asks for bilinear interpolation.
panels = [
    (article1, "article1", {"interpolation": "bilinear"}),
    (article2, "article2", {}),
    (article3, "article3", {}),
    (article4, "article4", {}),
    (article5, "title", {}),
    (article6, "description", {}),
]
for ax, (common_values, panel_title, imshow_options) in zip(axes.flat, panels):
    ax.imshow(generate_wordcloud(common_values), **imshow_options)
    ax.axis('off')
    ax.set_title(panel_title, fontsize=30)

In [None]:
## bivariate analysis 


### Grouped Bar Chart (accepted vs. rejected proposals) ###
def create_stack_bar_data(col, title="Project Grade Distribution"):
    """Build grouped-bar traces of accepted vs. rejected proposals per value of ``col``.

    Args:
        col: name of the column of the global ``df`` to break the counts
            down by.
        title: figure title. It defaults to the text that used to be
            hard-coded, so existing calls produce the same layout.

    Returns:
        ``(data, layout)``: a list with the accepted and rejected ``go.Bar``
        traces, and the matching ``go.Layout``.
    """
    x_values = df[col].value_counts().index.tolist()
    approved = df["project_is_approved"]

    def _counts_for(flag):
        # One pass over the frame per class, instead of two boolean masks for
        # every distinct value of `col`. Rows whose approval flag is missing
        # match neither class, as before.
        matching = df.loc[(approved == flag).to_numpy(), col]
        return matching.value_counts().reindex(x_values, fill_value=0).tolist()

    y1_values = _counts_for(1)
    y0_values = _counts_for(0)
    trace1 = go.Bar(x = x_values, y = y1_values, name='Accepted Proposals')
    trace2 = go.Bar(x = x_values, y = y0_values, name='Rejected Proposals')
    layout = go.Layout(title = title, barmode='group', width = 1000)
    data = [trace1, trace2]
    return data, layout

In [None]:
# Accepted vs. rejected proposal counts per submission year.
data1, layout1 = create_stack_bar_data(col='Year')
fig = go.Figure(layout=layout1, data=data1)
py.iplot(fig)

In [None]:
# Accepted vs. rejected proposal counts per submission month.
data2, layout2 = create_stack_bar_data(col='Month')
fig = go.Figure(layout=layout2, data=data2)
py.iplot(fig, filename='ProjectGradeCategory')

In [None]:
# Accepted vs. rejected proposal counts per submission weekday.
data3, layout3 = create_stack_bar_data(col='Weekday')
fig = go.Figure(layout=layout3, data=data3)
py.iplot(fig, filename='ProjectGradeCategory')

In [None]:
# Accepted vs. rejected proposal counts per submission hour.
data4, layout4 = create_stack_bar_data(col='Hour')
fig = go.Figure(layout=layout4, data=data4)
py.iplot(fig, filename='ProjectGradeCategory')

In [None]:
# Accepted vs. rejected proposal counts per day of the month.
data5, layout5 = create_stack_bar_data(col='Month_Day')
fig = go.Figure(layout=layout5, data=data5)
py.iplot(fig, filename='ProjectGradeCategory')

In [None]:
# Accepted vs. rejected proposal counts per day of the year.
data6, layout6 = create_stack_bar_data(col='Year_Day')
fig = go.Figure(layout=layout6, data=data6)
py.iplot(fig, filename='ProjectGradeCategory')

In [None]:
# Accepted vs. rejected proposal counts for each categorical feature, one
# figure per column, rendered in this order.
categorical_columns = (
    'teacher_prefix',
    'school_state',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
)
for categorical_column in categorical_columns:
    data6, layout6 = create_stack_bar_data(categorical_column)
    fig = go.Figure(data=data6, layout=layout6)
    py.iplot(fig, filename='ProjectGradeCategory')