In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords
from tqdm import tqdm_notebook as tqdm
from prettytable import PrettyTable
import os

from plotly import plotly
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()

## $1.1$ Reading Data

In [None]:
# Data taken from Kaggle: https://www.kaggle.com/manasvee1/donorschooseorg-application-screening
# Load the project applications and the per-item resource requests.
project_data, resource_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/resources.csv')
)

In [None]:
# Target column layout: identical to the raw frame, except that
# 'project_submitted_datetime' is called 'Date'.
# how to replace elements in list python: https://stackoverflow.com/a/2582163/4084039
cols = ['Date' if name == 'project_submitted_datetime' else name
        for name in list(project_data.columns)]

# Rename the timestamp column, parse it as datetimes and order the rows chronologically.
# sort dataframe based on time pandas python: https://stackoverflow.com/a/49702492/4084039
# how to reorder columns pandas python: https://stackoverflow.com/a/13148611/4084039
project_data = (
    project_data
    .rename(columns={'project_submitted_datetime': 'Date'})
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by=['Date'])
    [cols]
)

project_data.head(2)

## $1.2a$ preprocessing of `project_subject_categories`

In [None]:
catogories = list(project_data['project_subject_categories'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039
# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python

# Turn e.g. "Math & Science, Warmth, Care & Hunger" into "Math_Science Warmth Care_Hunger":
# one whitespace-free token per comma-separated category.
cat_list = []
for raw_label in catogories:
    cleaned_parts = []
    for part in raw_label.split(','):
        # Drop the word "The" when it appears as a standalone word in the category.
        if 'The' in part.split():
            part = part.replace('The', '')
        # Collapse the category into a single token ("Math & Science" -> "Math&Science").
        cleaned_parts.append(part.replace(' ', '').strip())
    # '&' is replaced by '_' once the tokens are joined.
    cat_list.append(' '.join(cleaned_parts).replace('&', '_').strip())

project_data['clean_categories'] = cat_list
del project_data['project_subject_categories']

## $1.2b$ preprocessing of `project_subject_subcategories`


In [None]:
sub_catogories = list(project_data['project_subject_subcategories'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039
# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python

# Same normalisation as for the main categories: each comma-separated entry becomes one
# space-free token, a standalone "The" is removed, and '&' is written as '_'.
sub_cat_list = []
for raw_label in sub_catogories:
    tokens = [
        (chunk.replace('The', '') if 'The' in chunk.split() else chunk).replace(' ', '').strip()
        for chunk in raw_label.split(',')
    ]
    sub_cat_list.append(' '.join(tokens).replace('&', '_').strip())

project_data['clean_subcategories'] = sub_cat_list
del project_data['project_subject_subcategories']

## $1.2c$ preprocessing of `project_grade_category`

In [None]:
# Replace spaces with underscores so each grade bucket is a single token.
#
# The frame was sorted by Date above without resetting its index, so its index labels
# are no longer 0..n-1 in row order. Reading values by label (`series[i]`) and writing
# them back as a plain list would therefore attach each row to another row's grade.
# Working on the Series itself keeps every value aligned with its own row.
grade_clean = project_data.pop("project_grade_category").str.replace(" ", "_", regex=False)

# Re-attach as the last column (same layout as the previous drop-then-append).
project_data["project_grade_category"] = grade_clean

# Kept for any later cell that still refers to the list form (row order of the frame).
proj_grade_cat = grade_clean.tolist()

## $1.3$ Text preprocessing

In [None]:
# Merge the four essay columns into a single text column.
# Missing essays are treated as empty text (mapping NaN through str() would inject the
# literal word "nan" into the corpus), and the parts are joined with a space so the last
# word of one essay does not fuse with the first word of the next.
essay_columns = ["project_essay_1", "project_essay_2", "project_essay_3", "project_essay_4"]
essay_parts = [project_data[col].fillna("").astype(str) for col in essay_columns]
project_data["essay"] = essay_parts[0].str.cat(essay_parts[1:], sep=" ").str.strip()

In [None]:
# Stop-word list used when cleaning text and when drawing word clouds.
# Source: https://gist.github.com/sebleier/554280
# The negations 'no', 'nor' and 'not' are intentionally absent from the list, so they
# are kept in the cleaned text.
# NOTE: this assignment rebinds the name `stopwords`, shadowing the
# `nltk.corpus.stopwords` object imported at the top of the notebook.
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [None]:
# https://stackoverflow.com/a/47091490/4084039

def decontracted(phrase):
    """Expand common English contractions in `phrase` and return the new string.

    The substitutions are applied one after another in the order listed below;
    the two irregular forms come first so the generic "n't" rule does not
    mangle them.
    """
    rewrites = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rewrites:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [None]:


def getProcessedData(txt_type, working_data):
    """Clean every entry of the text column `txt_type` of `working_data`.

    For each entry: contractions are expanded with `decontracted`, the literal
    escape sequences ``\\r``, ``\\"`` and ``\\n`` are replaced by spaces, every run
    of non-alphanumeric characters becomes one space, stop words are dropped
    (case-insensitively) and the result is lower-cased and stripped.

    Parameters
    ----------
    txt_type : str
        Name of the text column to clean.
    working_data : pandas.DataFrame
        Frame holding that column.

    Returns
    -------
    list of str
        One cleaned string per row, in row order.
    """
    # Build the lookup set once: membership tests on the list cost O(len(list)) per token.
    stopword_set = set(stopwords)
    preprocessed_data = []

    # tqdm is for printing the status bar
    for sentance in tqdm(working_data[txt_type].values):
        sent = decontracted(sentance)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\"', ' ')
        sent = sent.replace('\\n', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        sent = ' '.join(e for e in sent.split() if e.lower() not in stopword_set)
        preprocessed_data.append(sent.lower().strip())

    return preprocessed_data

<h2><font color='red'> $1.4$ Preprocessing of `project_title`</font></h2>

In [None]:
## Covered Above ...

## $1.5$ Preparing data for models
we are going to consider

       - school_state : categorical data
       - clean_categories : categorical data
       - clean_subcategories : categorical data
       - project_grade_category : categorical data
       - teacher_prefix : categorical data
       
       - project_title : text data
       - text : text data
       - project_resource_summary: text data (optional)
       
       - quantity : numerical (optional)
       - teacher_number_of_previously_posted_projects : numerical
       - price : numerical

### $1.5.1$ Vectorizing Categorical data

In [None]:
def getCountDict(cat_type):
    """Count how often each whitespace-separated token occurs in a categorical column.

    Reads column `cat_type` of the notebook-level `project_data` frame. Missing
    values in that column are first replaced, in place, by the string 'nan'.

    Parameters
    ----------
    cat_type : str
        Name of the categorical column.

    Returns
    -------
    dict
        token -> number of occurrences, ordered from least to most frequent
        (ties keep first-seen order).
    """
    from collections import Counter

    # Fill before reading so the iteration below never meets a non-string NaN.
    project_data.loc[project_data[cat_type].isnull(), cat_type] = 'nan'

    # Counter counts the first occurrence as 1 (the previous hand-rolled
    # tally started at 0, leaving every count one too low).
    count_dict = Counter()
    for phrase in project_data[cat_type]:
        count_dict.update(phrase.split())

    return dict(sorted(count_dict.items(), key=lambda item: item[1]))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def getFitCAT_Vectorizer(working_data, cat_type, hstack_features):
    """Build a binary one-hot CountVectorizer for the categorical column `cat_type`.

    Parameters
    ----------
    working_data : pandas.DataFrame
        Frame the vectorizer is fitted on (intended to be the training data only).
        Missing values in `cat_type` are replaced in place by the string 'nan'.
    cat_type : str
        Name of the categorical column.
    hstack_features : list
        Running list of feature names; the vocabulary of this column is appended
        to it in place.

    Returns
    -------
    CountVectorizer
        Vectorizer with a fixed vocabulary, case-sensitive and binary.
    """
    working_data.loc[working_data[cat_type].isnull(), cat_type] = 'nan'

    # NOTE(review): getCountDict reads the notebook-level frame, not `working_data`,
    # so the vocabulary may contain categories absent from `working_data` — confirm
    # this is intended.
    sorted_cat_dict = getCountDict(cat_type)
    print ('Keys...', sorted_cat_dict.keys())

    # Materialise the vocabulary as a list: a dict_keys view cannot be pickled or
    # deep-copied, which would make the resulting vectorizer impossible to clone/save.
    vocabulary = list(sorted_cat_dict.keys())
    hstack_features.extend(vocabulary)

    vectorizer = CountVectorizer(vocabulary=vocabulary, lowercase=False, binary=True)
    vectorizer.fit(working_data[cat_type].values)
    return vectorizer
    
def getVectorizeCategData(working_data, cat_type, data_type):
    """One-hot encode the categorical column `cat_type` of `working_data`.

    Parameters
    ----------
    working_data : pandas.DataFrame
        Frame holding the column. Missing values in `cat_type` are replaced in
        place by the string 'nan' before encoding.
    cat_type : str
        Name of the categorical column.
    data_type : fitted vectorizer (or legacy label)
        The fitted vectorizer to apply. Callers in this notebook pass the
        vectorizer here; if the argument has no ``transform`` method (a legacy
        call passing a plain label), the notebook-level ``vectorizer`` is used
        as before.

    Returns
    -------
    The matrix returned by the vectorizer's ``transform``.
    """
    # Use the encoder that was actually passed in rather than whatever the global
    # name `vectorizer` happens to be bound to when the function runs.
    encoder = data_type if hasattr(data_type, "transform") else vectorizer

    working_data.loc[working_data[cat_type].isnull(), cat_type] = 'nan'

    categories_one_hot = encoder.transform(working_data[cat_type].values)
    print("Shape of matrix after one hot encodig ",categories_one_hot.shape)

    return categories_one_hot

### $1.5.2$ Vectorizing Text data

#### $1.5.2.1$ Bag of words

In [None]:
def getFitBOW_Vectorizer(preprocessed_data, min_df=10):
    """Fit a bag-of-words CountVectorizer on already-cleaned documents.

    Parameters
    ----------
    preprocessed_data : iterable of str
        Cleaned documents to learn the vocabulary from.
    min_df : int or float, default 10
        Passed to ``CountVectorizer(min_df=...)``. The default keeps the
        notebook's previous hard-coded setting.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer.
    """
    vectorizer = CountVectorizer(min_df=min_df)
    vectorizer.fit(preprocessed_data)

    return vectorizer

def getBOWVectorizeTxtData(preprocessed_data, vectorizer):
    """Apply a fitted vectorizer to cleaned documents, report the shape, return the matrix."""
    encoded = vectorizer.transform(preprocessed_data)
    shape = encoded.shape
    print("Shape of matrix after one hot encodig ", shape)
    return encoded

#### 1.5.2.2 TFIDF vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def getFitTFIDF_Vectorizer(preprocessed_data, min_df=10):
    """Fit a TfidfVectorizer on already-cleaned documents.

    Parameters
    ----------
    preprocessed_data : iterable of str
        Cleaned documents to learn the vocabulary and IDF weights from.
    min_df : int or float, default 10
        Passed to ``TfidfVectorizer(min_df=...)``. The default keeps the
        notebook's previous hard-coded setting.

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(min_df=min_df)
    vectorizer.fit(preprocessed_data)
    return vectorizer

def getTFIDFVectorizeTxtData(preprocessed_data, vectorizer):
    """Apply a fitted vectorizer to cleaned documents, report the shape, return the matrix."""
    weighted = vectorizer.transform(preprocessed_data)
    shape = weighted.shape
    print("Shape of matrix after one hot encodig ", shape)
    return weighted

### $1.5.3$ Vectorizing Numerical features


In [None]:
# Total price and total quantity requested per project id ...
price_data = (
    resource_data
    .groupby('id')[['price', 'quantity']]
    .sum()
    .reset_index()
)
# ... attached to every project row (projects without resources get NaN).
project_data = project_data.merge(price_data, how='left', on='id')

In [None]:
from sklearn.preprocessing import Normalizer
import warnings 
warnings.filterwarnings("ignore") 

def getFitNUM_Vectorizer(working_data, num_type):
    """Fit a scaler on the numerical column `num_type` (fit on train data only).

    Parameters
    ----------
    working_data : pandas.DataFrame
        Frame holding the column.
    num_type : str
        Name of the numerical column.

    Returns
    -------
    StandardScaler
        Scaler holding the column's mean and variance; its ``transform`` expects
        the column reshaped to ``(-1, 1)``.
    """
    # `Normalizer` rescales every *row* to unit norm. On an (n, 1) column each row
    # holds a single value, so every non-zero entry would become +/-1 and the
    # feature would carry no information. Learning the column's mean and standard
    # deviation is what the surrounding comments describe.
    from sklearn.preprocessing import StandardScaler

    num_scalar = StandardScaler()
    num_scalar.fit(working_data[num_type].values.reshape(-1, 1))  # learns mean and standard deviation
    return num_scalar

def getNUM_Vectors(working_data, num_type, num_scalar):
    """Return column `num_type` of `working_data` after `num_scalar.transform`.

    The column is handed to the scaler as a single-feature 2-D array of shape (n, 1).
    """
    column = working_data[num_type].values.reshape(-1, 1)
    return num_scalar.transform(column)

### $1.5.4$ Merging all the above features

In [None]:
from scipy.sparse import hstack

def getMergedFeatures(working_data, merge_on):
    """Horizontally stack the feature blocks of `working_data` whose key is in `merge_on`.

    Blocks are stacked in the iteration order of `working_data` (not the order of
    `merge_on`); names in `merge_on` that are not keys of `working_data` are ignored.
    """
    selected = tuple(block for name, block in working_data.items() if name in merge_on)
    return hstack(selected)

# Assignment $10$: Clustering

- <font color='red'>step 1</font>: Choose any vectorizer (data matrix) that you have worked in any of the assignments, and got the best AUC value.
- <font color='red'>step 2</font>: Choose any of the <a href='https://scikit-learn.org/stable/modules/feature_selection.html'>feature selection</a>/<a href='https://scikit-learn.org/stable/modules/decomposition.html'>reduction algorithms</a> ex: selectkbest features, pretrained word vectors, model based feature selection etc and reduce the number of features to 5k features
- <font color='red'>step 3</font>: Apply all three kmeans, Agglomerative clustering, DBSCAN
    - <strong>K-Means Clustering:</strong> <br>
        ● Find the best ‘k’ using the elbow-knee method (plot k vs inertia_)<br>
    - <strong>Agglomerative Clustering: </strong><br>
        ● Apply <a href='https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/'>agglomerative algorithm</a> and try a different number of clusters like 2,5 etc. <br>
        ● You can take less data points (as this is very computationally expensive one) to perform hierarchical clustering because they do take a considerable amount of time to run. <br>
    - <strong>DBSCAN Clustering: </strong><br>
        ● Find the best ‘eps’ using the <a href='https://stackoverflow.com/a/48558030/4084039'>elbow-knee method</a>.<br>
        ● You can take a smaller sample size for this as well.
- <font color='red'>step 4</font>: Summarize each cluster by manually observing few points from each cluster.
- <font color='red'>step 5</font>: You need to plot the word cloud with essay text for each cluster for each of algorithms mentioned in <font color='red'>step 3</font>.

<h1>$2.$ Clustering </h1>

<h2>$2.1$ Choose the best data matrix on which you got the best AUC</h2>

In [None]:
# Work on the first 10k rows of the frame.
# Classes of X & project_data have almost same proportion.
# Take an explicit copy: later cells fill missing values in X with `.loc[...] = ...`,
# and doing that on a slice of `project_data` is a chained assignment whose effect on
# either frame is not guaranteed.
X = project_data.iloc[:10000].copy()

y = X['project_is_approved']

# BOW matrix gave the best AUC value in NB Machine Learning model.
set1_cols = ['school_state','clean_categories', 'clean_subcategories', 'project_grade_category', 'teacher_prefix',
             'price', 'teacher_number_of_previously_posted_projects', 
             'essay_text_bow', 'project_title_text_bow']

plt_title1 = 'BOW'

<h2>$2.2$ Make Data Model Ready: encoding numerical, categorical features</h2>

In [None]:
# Ordered dict will be used to ensure one to one correspondence between datapoints features and hstack_features.

from collections import OrderedDict

In [None]:
# Feature blocks keyed by column name, in the order they are produced.
data_dict = OrderedDict()
cols_dict = OrderedDict([
    ('cat_cols', ['school_state','clean_categories', 'clean_subcategories', 'project_grade_category', 'teacher_prefix']),
    ('num_cols', ['price', 'teacher_number_of_previously_posted_projects']),
])
# Flat list of feature names, kept in step with the blocks added to data_dict.
hstack_features = []

for col_type, cols_name in cols_dict.items():
    if col_type != 'cat_cols':
        # Numerical columns: one scaled single-column block each.
        for num_type in cols_name:
            vectorizer = getFitNUM_Vectorizer(X, num_type)
            hstack_features.append(num_type)
            num_vectors = getNUM_Vectors(X, num_type, vectorizer)
            data_dict[num_type] = num_vectors
        continue

    # Categorical columns: one one-hot block each.
    for cat_type in cols_name:
        print (cat_type)
        vectorizer = getFitCAT_Vectorizer(X, cat_type, hstack_features)
        hot_encode = getVectorizeCategData(X, cat_type, vectorizer)
        data_dict[cat_type] = hot_encode
        

<h2>$2.3$ Make Data Model Ready: encoding essay, and project_title</h2>

In [None]:
# Bag-of-words encoding of the two text columns.
for col_type in ['essay','project_title']:
    preprocessed_data = getProcessedData(col_type, X)
    vectorizer_bog = getFitBOW_Vectorizer(preprocessed_data)
    text_bow = getBOWVectorizeTxtData(preprocessed_data, vectorizer_bog)
    data_dict['%s_text_bow'%col_type] = text_bow

    # `get_feature_names` was replaced by `get_feature_names_out` in newer
    # scikit-learn releases; support both and fetch the names only once.
    if hasattr(vectorizer_bog, "get_feature_names_out"):
        feature_names = list(vectorizer_bog.get_feature_names_out())
    else:
        feature_names = list(vectorizer_bog.get_feature_names())

    if col_type == "essay":
        # Vocabulary plus dense count matrix of the essays, used later to build
        # the per-cluster word clouds.
        essay_hot_info = (feature_names, text_bow.toarray())

    hstack_features += feature_names

<h2>$2.4$ Dimensionality Reduction on the selected features & defining Wordcloud functions </h2>

In [None]:
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
# Reference: chi2 isn't working due to negative values; used https://stackoverflow.com/questions/25792012/feature-selection-using-scikit-learn

from sklearn.feature_selection import SelectKBest, f_classif 

data_matrix = getMergedFeatures(data_dict, set1_cols)
print (data_matrix.shape, len(hstack_features))

# Keep the 5000 highest-scoring features, or all of them when the matrix has fewer
# columns (SelectKBest rejects a k larger than the number of features).
n_selected = min(5000, data_matrix.shape[1])
selector = SelectKBest(f_classif, k=n_selected)
X_new = selector.fit_transform(data_matrix, y)

In [None]:
from wordcloud import WordCloud

def getCorupusDict(essay_hot_info, y_pred):
    """Build one text blob per cluster from the essay bag-of-words matrix.

    Parameters
    ----------
    essay_hot_info : tuple
        ``(feature_names, dense_matrix)``; row ``i`` of the matrix belongs to
        the same document as ``y_pred[i]``.
    y_pred : iterable
        Cluster label of each document.

    Returns
    -------
    dict
        label -> string made of every vocabulary word whose entry is >= 0.5 in a
        document of that cluster, each word preceded by a single space. A word
        appears once per document containing it. Labels keep first-seen order.
    """
    one_hot_featr, one_hot_enc = essay_hot_info

    # Collect words in lists and join once at the end: scanning every matrix cell
    # in Python and growing a string with '%' formatting is needlessly slow.
    words_by_cluster = {}
    for row_idx, each_x in enumerate(tqdm(y_pred)):
        cluster_words = words_by_cluster.setdefault(each_x, [])
        present = np.flatnonzero(np.asarray(one_hot_enc[row_idx]) >= 0.5)
        cluster_words.extend(one_hot_featr[j].strip() for j in present)

    return {
        label: "".join(" " + word for word in words)
        for label, words in words_by_cluster.items()
    }

def plotWordCloud(word_corpus, i, algo_title):
    """Render and show the word cloud of one cluster.

    word_corpus : text of the cluster
    i           : cluster number, shown in the title
    algo_title  : name of the clustering algorithm, shown in the title
    """
    cloud_options = dict(
        width=800,
        height=800,
        background_color='white',
        stopwords=stopwords,
        collocations=False,
        min_font_size=10,
    )
    cloud_image = WordCloud(**cloud_options).generate(word_corpus)

    # Draw the image on a square figure without axes.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")
    heading = 'The word cloud with essay text for cluster no. %s for algorithm- %s'%(i, algo_title)
    plt.title(heading)

    plt.show()

<h2>$2.5$ Apply Kmeans</h2>

$\rightarrow$ Working on $10k$ data points.

In [None]:
from sklearn.cluster import KMeans
def getLosses(hypers, data_matrix):
    """Return the K-Means inertia for every candidate number of clusters.

    Parameters
    ----------
    hypers : iterable of int
        Candidate values of ``n_clusters``.
    data_matrix : array-like or sparse matrix
        Data to cluster.

    Returns
    -------
    list of float
        ``inertia_`` of the fitted model for each entry of `hypers`, in order.
    """
    # `n_jobs` is no longer a KMeans parameter in current scikit-learn releases and
    # passing it raises a TypeError; the fitted model does not depend on it.
    return [
        KMeans(n_clusters=k, random_state=0).fit(data_matrix).inertia_
        for k in hypers
    ]

def plotGraph(hypers, obj_loss):
    """Plot the loss against the number of clusters for the elbow-knee method."""
    plt.plot(hypers, obj_loss, label='knee')

    plt.xlabel('number of clusters (K)')
    plt.ylabel('loss value')
    plt.title("Plot to find best K using elbow-knee method")
    plt.legend()

In [None]:
algo_title = 'Kmeans'
# Candidate cluster counts: the perfect squares 4, 9, 16, 25, 36.
hypers = [side * side for side in range(2, 7)]
losses = getLosses(hypers, X_new)
plotGraph(hypers, losses)

Selecting total clusters $=15$ after analysing the plot. **Once selected, plotting a word cloud for each cluster.**

In [None]:
# Fit the final K-Means model and draw one word cloud per cluster, in label order.
# (`n_jobs` is no longer accepted by KMeans in current scikit-learn releases.)
kmeans = KMeans(n_clusters=15, random_state=0).fit(X_new)
corpus_dict = getCorupusDict(essay_hot_info, kmeans.labels_)
corpus_dict = dict(sorted(corpus_dict.items(), key=lambda x: x[0]))
for key, val in corpus_dict.items():
    plotWordCloud(val, key, algo_title)

**Observations** - 

In [None]:
from prettytable import PrettyTable
from collections import Counter

# Per-cluster summary: total word count and the ten most frequent words.
table = PrettyTable()
table.field_names = ["Cluster No", "No. of words in cluster", "Most frequent words"]

for key, val in corpus_dict.items():
    tokens = val.split()
    counts = Counter(tokens)
    # Most frequent first; ties keep the order in which the words first appear.
    top_words = sorted(counts, key=counts.get, reverse=True)[:10]
    table.add_row([key, len(tokens), ",".join(top_words)])

print (table) 

<h2>$2.6$ Apply AgglomerativeClustering</h2>

$\rightarrow$ Working on $5k$ data points.

In [None]:
import scipy.cluster.hierarchy as shc

# Dense copy of the first 5k rows only. Slicing before densifying avoids
# materialising the whole matrix just to throw half of it away.
# The result is intentionally still what `.todense()` returns, because later cells
# index single rows of `X_new_agg` and rely on each row staying 2-D.
X_new_agg = X_new[:5000].todense()

algo_title = 'Agglomerative Clustering'
plt.figure(figsize=(10, 7))  
plt.title("Dendogram")  
dend = shc.dendrogram(shc.linkage(X_new_agg, method='ward'))  

Selecting total clusters $=3$ after analysing the dendrogram. **Once selected, plotting a word cloud for each cluster.**

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage always uses Euclidean distances, so the distance argument is left at
# its default; the `affinity` keyword is not accepted by current scikit-learn releases.
cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
# Hand over a plain ndarray: newer scikit-learn rejects numpy.matrix input.
cluster.fit(np.asarray(X_new_agg))

corpus_dict = getCorupusDict(essay_hot_info, cluster.labels_)
corpus_dict = dict(sorted(corpus_dict.items(), key=lambda x: x[0]))
for key, val in corpus_dict.items():
    plotWordCloud(val, key, algo_title)

**Observations:**

In [None]:
# Per-cluster summary: total word count and the ten most frequent words.
table = PrettyTable()
table.field_names = ["Cluster No", "No. of words in cluster", "Most frequent words"]

for key, val in corpus_dict.items():
    cluster_tokens = val.split()
    frequency = Counter(cluster_tokens)
    # Most frequent first; ties keep the order in which the words first appear.
    ranked = sorted(frequency, key=frequency.get, reverse=True)
    table.add_row([key, len(cluster_tokens), ",".join(ranked[:10])])

print (table) 

<h2>$2.7$ Apply DBSCAN</h2>

$\rightarrow$ Working on $5k$ data points.

Considering minPts $= \lfloor \log_e(|\text{data set}|) \rfloor = \lfloor \log_e(5000) \rfloor = 8$ (Reference [here](https://stackoverflow.com/questions/12893492/choosing-eps-and-minpts-for-dbscan-r/48558030#48558030))

In [None]:
from sklearn.neighbors import KDTree

algo_title = 'DBSCAN Clustering'
minPts = 8

# Plain ndarray view of the sample (newer scikit-learn rejects numpy.matrix input).
points = np.asarray(X_new_agg)
tree = KDTree(points)

# Distance from every point to the last of its `minPts` nearest neighbours, obtained
# with one batched query. The query points are the indexed points themselves, so each
# point's neighbour list includes the point itself.
neighbor_dist, _ = tree.query(points, k=minPts, return_distance=True)
epss = sorted(neighbor_dist[:, -1].tolist())

# Sorted k-distance curve; the x-range follows the actual number of points.
plt.plot(range(len(epss)), epss, label='knee')
plt.title("Plot to find best eps using elbow-knee method")
plt.xlabel('Integers')
plt.ylabel('eps-values')
# A bare string passed to legend() is read as a sequence of one-letter labels,
# so the label is set on the line instead.
plt.legend()

Selecting optimal radius $\epsilon=19$ after analysing the plot. **Once selected, plotting a word cloud for each cluster.**

In [None]:
from sklearn.cluster import DBSCAN

# Hand over a plain ndarray: newer scikit-learn rejects numpy.matrix input.
cluster = DBSCAN(eps=19, min_samples=minPts).fit(np.asarray(X_new_agg))
corpus_dict = getCorupusDict(essay_hot_info, cluster.labels_)

# DBSCAN marks noise points with the label -1; that is not a cluster, so it is
# excluded from the count and reported separately.
n_clusters_found = len(set(corpus_dict) - {-1})
print ("number of clusters gotten:", n_clusters_found)
print ("number of noise points:", int((cluster.labels_ == -1).sum()))

In [None]:
# Re-key the corpus in ascending label order, then draw one word cloud per label.
corpus_dict = {label: corpus_dict[label] for label in sorted(corpus_dict)}
for key, val in corpus_dict.items():
    plotWordCloud(val, key, algo_title)

**Observations:**

In [None]:
# Per-cluster summary: total word count and the ten most frequent words.
table = PrettyTable()
table.field_names = ["Cluster No", "No. of words in cluster", "Most frequent words"]

for key, val in corpus_dict.items():
    words = val.split()
    tally = Counter(words)
    # Most frequent first; ties keep the order in which the words first appear.
    leading = sorted(tally, key=tally.get, reverse=True)[:10]
    table.add_row([key, len(words), ",".join(leading)])

print (table) 