In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from collections import Counter
import string
import math


In [2]:
corpus = pd.read_csv("sms.csv")
hdfc_corpus = corpus[corpus.sender.str.contains("HDFCBK")]

hdfc_corpus

Unnamed: 0,id,sender,message,timestamp
253,21,AM-HDFCBK,Balance in A/c XXXXXXXXXX0028 as of 16-OCT-16 ...,2016-10-17 02:30:53+00
261,21,AM-HDFCBK,"INR 2,50,000.00 deposited to A/c No XX0028 tow...",2016-10-17 07:00:42+00
265,21,AM-HDFCBK,"Dear AUTOMAXX, your Indent raised on 17-OCT-16...",2016-10-17 06:40:06+00
267,21,AM-HDFCBK,"INR 2,15,000.00 Dr to A/c No XX0028 towards MC...",2016-10-17 06:32:45+00
268,21,AM-HDFCBK,"INR 6,55,000.00 deposited to A/c No XX0028 tow...",2016-10-17 06:11:00+00
275,21,AM-HDFCBK,"INR 9,885.00 deposited to A/c No XX0028 toward...",2016-10-17 03:45:19+00
276,21,AM-HDFCBK,"An amount of Rs.441,150.00 has been debited fr...",2016-10-17 08:42:25+00
306,21,AM-HDFCBK,"An amount of Rs.200,000.00 has been debited fr...",2016-10-17 14:03:10+00
315,21,AM-HDFCBK,Balance in A/c XXXXXXXXXX0028 as of 17-OCT-16 ...,2016-10-18 04:37:06+00
323,21,AM-HDFCBK,"INR 44,094.37 deposited to A/c No XX0028 towar...",2016-10-18 04:05:09+00


In [3]:
# src: http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
def removeStopwords(sentence):
    
    sentence= re.sub('[^A-Za-z0-9]+', ' ', sentence)

    stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
    stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
    stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
    stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
    stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
    stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
    stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
    stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
    stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
    stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
    stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
    stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
    stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
    stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
    stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
    stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
    stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
    stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
    stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
    stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
    stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
    stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
    stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
    stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
    stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
    stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
    stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
    stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
    stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
    stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
    stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
    stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
    stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
    stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
    stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
    stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
    stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
    stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
    stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
    stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
    stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
    stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
    stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
    stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
    stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
    stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
    stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
    stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
    stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
    stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
    stopwords += ['yours', 'yourself', 'yourselves']
    return ' '.join([w for w in sentence.split() if w not in stopwords])

In [4]:
hdfc_corpus["clean_msg"] = hdfc_corpus.apply(lambda x: x["message"].lower(), axis = 1)

def clean_sms(string):
    sms = []
    regexp = r'^([^0-9]*)$'
    for i in nltk.word_tokenize(removeStopwords(str(string))):
        if re.match(regexp, i):
            word = nltk.PorterStemmer().stem(i.lower())
            sms += [word]
    return ' '.join(sms)

hdfc_corpus["clean_msg"] = hdfc_corpus["clean_msg"].apply(clean_sms)
hdfc_corpus = hdfc_corpus.reset_index()
hdfc_corpus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,index,id,sender,message,timestamp,clean_msg
0,253,21,AM-HDFCBK,Balance in A/c XXXXXXXXXX0028 as of 16-OCT-16 ...,2016-10-17 02:30:53+00,balanc c oct eod inr check c current balanc cr...
1,261,21,AM-HDFCBK,"INR 2,50,000.00 deposited to A/c No XX0028 tow...",2016-10-17 07:00:42+00,inr deposit c rtg cr od automaxx automaxx val ...
2,265,21,AM-HDFCBK,"Dear AUTOMAXX, your Indent raised on 17-OCT-16...",2016-10-17 06:40:06+00,dear automaxx indent rais oct process rs utr
3,267,21,AM-HDFCBK,"INR 2,15,000.00 Dr to A/c No XX0028 towards MC...",2016-10-17 06:32:45+00,inr dr c mc issu sagar garden val oct clr bal inr
4,268,21,AM-HDFCBK,"INR 6,55,000.00 deposited to A/c No XX0028 tow...",2016-10-17 06:11:00+00,inr deposit c documentari se val oct clr bal i...
5,275,21,AM-HDFCBK,"INR 9,885.00 deposited to A/c No XX0028 toward...",2016-10-17 03:45:19+00,inr deposit c card settl val oct clr bal inr s...
6,276,21,AM-HDFCBK,"An amount of Rs.441,150.00 has been debited fr...",2016-10-17 08:42:25+00,rs debit account number tpt txn use hdfc bank ...
7,306,21,AM-HDFCBK,"An amount of Rs.200,000.00 has been debited fr...",2016-10-17 14:03:10+00,rs debit account number tpt txn use hdfc bank ...
8,315,21,AM-HDFCBK,Balance in A/c XXXXXXXXXX0028 as of 17-OCT-16 ...,2016-10-18 04:37:06+00,balanc c oct eod inr check c current balanc cr...
9,323,21,AM-HDFCBK,"INR 44,094.37 deposited to A/c No XX0028 towar...",2016-10-18 04:05:09+00,inr deposit c card settl val oct clr bal inr s...


In [5]:
wordlist = Counter()
hdfc_corpus["clean_msg"].str.split().apply(wordlist.update)
wordlist

Counter({'abl': 1,
         'access': 6,
         'account': 77,
         'activ': 6,
         'ad': 2,
         'adequ': 1,
         'airtel': 2,
         'airtelmoney': 3,
         'al': 3,
         'amazon': 2,
         'amritsar': 1,
         'amt': 1,
         'anjani': 4,
         'anytim': 1,
         'app': 4,
         'appli': 5,
         'applic': 2,
         'appmb': 3,
         'apr': 14,
         'ashok': 2,
         'aug': 21,
         'authent': 5,
         'auto': 1,
         'automaxx': 6,
         'automobil': 2,
         'autopay': 2,
         'avail': 15,
         'avenu': 2,
         'avl': 66,
         'avoid': 4,
         'b': 1,
         'babu': 1,
         'bajafinemi': 2,
         'bal': 213,
         'balanc': 288,
         'bank': 109,
         'basi': 1,
         'batch': 1,
         'beneficiari': 9,
         'best': 1,
         'bf': 2,
         'bill': 2,
         'billpay': 7,
         'biradar': 1,
         'bit': 16,
         'br': 3,
         'branch

In [6]:
words = pd.DataFrame.from_dict(wordlist, orient='index').reset_index()
words = words.rename(columns={'index':'word', 0:'count'})
num_words = len(corpus)
words["idf"] = words.apply(lambda x: math.log10(num_words/(1 + x["count"])), axis=1)
words

Unnamed: 0,word,count,idf
0,know,5,3.572988
1,prompt,5,3.572988
2,mobilebank,6,3.506041
3,upi,1,4.050109
4,charg,6,3.506041
5,till,19,3.050109
6,loan,1,4.050109
7,non,5,3.572988
8,parsiya,1,4.050109
9,prakash,4,3.652169


In [7]:
def tf(s):
    d = dict(Counter(s))
    for k in d.keys():
        d[k] = d[k] / len(d)
    return d

In [8]:
hdfc_corpus["tf"] = hdfc_corpus.apply(lambda x: tf(x["clean_msg"].split()), axis=1)
hdfc_corpus

Unnamed: 0,index,id,sender,message,timestamp,clean_msg,tf
0,253,21,AM-HDFCBK,Balance in A/c XXXXXXXXXX0028 as of 16-OCT-16 ...,2016-10-17 02:30:53+00,balanc c oct eod inr check c current balanc cr...,"{'eod': 0.1, 'inr': 0.1, 'clear': 0.1, 'oct': ..."
1,261,21,AM-HDFCBK,"INR 2,50,000.00 deposited to A/c No XX0028 tow...",2016-10-17 07:00:42+00,inr deposit c rtg cr od automaxx automaxx val ...,"{'inr': 0.15384615384615385, 'clr': 0.07692307..."
2,265,21,AM-HDFCBK,"Dear AUTOMAXX, your Indent raised on 17-OCT-16...",2016-10-17 06:40:06+00,dear automaxx indent rais oct process rs utr,"{'rais': 0.125, 'rs': 0.125, 'oct': 0.125, 'au..."
3,267,21,AM-HDFCBK,"INR 2,15,000.00 Dr to A/c No XX0028 towards MC...",2016-10-17 06:32:45+00,inr dr c mc issu sagar garden val oct clr bal inr,"{'inr': 0.18181818181818182, 'mc': 0.090909090..."
4,268,21,AM-HDFCBK,"INR 6,55,000.00 deposited to A/c No XX0028 tow...",2016-10-17 06:11:00+00,inr deposit c documentari se val oct clr bal i...,"{'inr': 0.18181818181818182, 'clr': 0.09090909..."
5,275,21,AM-HDFCBK,"INR 9,885.00 deposited to A/c No XX0028 toward...",2016-10-17 03:45:19+00,inr deposit c card settl val oct clr bal inr s...,"{'inr': 0.18181818181818182, 'clr': 0.09090909..."
6,276,21,AM-HDFCBK,"An amount of Rs.441,150.00 has been debited fr...",2016-10-17 08:42:25+00,rs debit account number tpt txn use hdfc bank ...,"{'tpt': 0.1, 'use': 0.1, 'rs': 0.1, 'txn': 0.1..."
7,306,21,AM-HDFCBK,"An amount of Rs.200,000.00 has been debited fr...",2016-10-17 14:03:10+00,rs debit account number tpt txn use hdfc bank ...,"{'tpt': 0.1, 'use': 0.1, 'rs': 0.1, 'txn': 0.1..."
8,315,21,AM-HDFCBK,Balance in A/c XXXXXXXXXX0028 as of 17-OCT-16 ...,2016-10-18 04:37:06+00,balanc c oct eod inr check c current balanc cr...,"{'eod': 0.1, 'inr': 0.1, 'clear': 0.1, 'oct': ..."
9,323,21,AM-HDFCBK,"INR 44,094.37 deposited to A/c No XX0028 towar...",2016-10-18 04:05:09+00,inr deposit c card settl val oct clr bal inr s...,"{'inr': 0.18181818181818182, 'clr': 0.09090909..."


In [9]:
words = words.set_index("word")

In [12]:
def tfidf(tf):
    tfidf = {}
    for k in tf.keys():
        tfidf[k] = tf[k] * words.loc(k)[k].idf
    return tfidf

hdfc_corpus["tfidf"] = hdfc_corpus.apply(lambda x: tfidf(x["tf"]), axis=1)
hdfc_corpus

Unnamed: 0,index,id,sender,message,timestamp,clean_msg,tf,tfidf
0,253,21,AM-HDFCBK,Balance in A/c XXXXXXXXXX0028 as of 16-OCT-16 ...,2016-10-17 02:30:53+00,balanc c oct eod inr check c current balanc cr...,"{'eod': 0.448652912534, 'inr': 0.330189911881,...","{'eod': 0.950310958003, 'inr': 0.599992332654,..."
1,261,21,AM-HDFCBK,"INR 2,50,000.00 deposited to A/c No XX0028 tow...",2016-10-17 07:00:42+00,inr deposit c rtg cr od automaxx automaxx val ...,"{'inr': 0.507984479818, 'clr': 0.503629814388,...","{'inr': 0.92306512716, 'clr': 1.28866141219, '..."
2,265,21,AM-HDFCBK,"Dear AUTOMAXX, your Indent raised on 17-OCT-16...",2016-10-17 06:40:06+00,dear automaxx indent rais oct process rs utr,"{'rais': 1.87600164623, 'rs': 0.413315309911, ...","{'rais': 7.26766358997, 'rs': 0.75156618605, '..."
3,267,21,AM-HDFCBK,"INR 2,15,000.00 Dr to A/c No XX0028 towards MC...",2016-10-17 06:32:45+00,inr dr c mc issu sagar garden val oct clr bal inr,"{'inr': 0.60034529433, 'mc': 1.49121660105, 'i...","{'inr': 1.09089515028, 'mc': 6.03958972155, 'i..."
4,268,21,AM-HDFCBK,"INR 6,55,000.00 deposited to A/c No XX0028 tow...",2016-10-17 06:11:00+00,inr deposit c documentari se val oct clr bal i...,"{'inr': 0.60034529433, 'clr': 0.595198871549, ...","{'inr': 1.09089515028, 'clr': 1.52296348714, '..."
5,275,21,AM-HDFCBK,"INR 9,885.00 deposited to A/c No XX0028 toward...",2016-10-17 03:45:19+00,inr deposit c card settl val oct clr bal inr s...,"{'inr': 0.60034529433, 'clr': 0.595198871549, ...","{'inr': 1.09089515028, 'clr': 1.52296348714, '..."
6,276,21,AM-HDFCBK,"An amount of Rs.441,150.00 has been debited fr...",2016-10-17 08:42:25+00,rs debit account number tpt txn use hdfc bank ...,"{'tpt': 1.22923229227, 'use': 0.528108213105, ...","{'debit': 1.48695932113, 'tpt': 4.30973871511,..."
7,306,21,AM-HDFCBK,"An amount of Rs.200,000.00 has been debited fr...",2016-10-17 14:03:10+00,rs debit account number tpt txn use hdfc bank ...,"{'tpt': 1.22923229227, 'use': 0.528108213105, ...","{'debit': 1.48695932113, 'tpt': 4.30973871511,..."
8,315,21,AM-HDFCBK,Balance in A/c XXXXXXXXXX0028 as of 17-OCT-16 ...,2016-10-18 04:37:06+00,balanc c oct eod inr check c current balanc cr...,"{'eod': 0.448652912534, 'inr': 0.330189911881,...","{'eod': 0.950310958003, 'inr': 0.599992332654,..."
9,323,21,AM-HDFCBK,"INR 44,094.37 deposited to A/c No XX0028 towar...",2016-10-18 04:05:09+00,inr deposit c card settl val oct clr bal inr s...,"{'inr': 0.60034529433, 'clr': 0.595198871549, ...","{'inr': 1.09089515028, 'clr': 1.52296348714, '..."


In [17]:
size = len(hdfc_corpus)
sim = np.zeros(shape=(size, size))

for i in range(size):
    for j in range(size):
#for i in range(len(corpus)):
#   for j in range(len(corpus)):
        if i == j:
            sim[i][j] = 1.0
            #print(sim[i][j])
        elif i > j:
            sim[i][j] = sim[j][i]
        else:
            import ast
            
            x = hdfc_corpus.loc(i)[i].tfidf
            y = hdfc_corpus.loc(j)[j].tfidf
             
            unique_words = set()
                        
            for k in x.keys():
                unique_words.add(k)
            for k in y.keys():
                unique_words.add(k)
            
            n,d = 0.0,0.0
            
            for u in list(unique_words):
                n += x.get(u,0)*y.get(u,0)
                d += math.sqrt(math.pow(x.get(u,0),2) + math.pow(y.get(u,0),2))
            try:    
                sim[i][j] = n/d
            except:
                sim[i][j] = 0.0
            
sim

array([[ 1.        ,  0.15388488,  0.10778386, ...,  0.        ,
         0.        ,  0.00492351],
       [ 0.15388488,  1.        ,  0.68113776, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.10778386,  0.68113776,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         3.44023814,  0.0520627 ],
       [ 0.        ,  0.        ,  0.        , ...,  3.44023814,
         1.        ,  0.0520627 ],
       [ 0.00492351,  0.        ,  0.        , ...,  0.0520627 ,
         0.0520627 ,  1.        ]])

In [19]:
# np.save("sim_hdfc", sim)


In [27]:
thresh = 0.7
cluster_num = 0

cluster = {}

s = list(range(0,size))

while(s):
    cluster[cluster_num] = [s.pop(0)]
    for i in s:
        #print(sim[i][cluster[cluster_num]])
        if sim[i][cluster[cluster_num]][0] >= thresh:
            cluster[cluster_num] += [i]
            s.remove(i)
    for i in cluster[cluster_num]:
        for j in s:
            if sim[j][i] >= thresh:
                cluster[cluster_num] += [j]
                s.remove(j)
    cluster_num += 1

cluster

{0: [0, 8, 18, 27, 30, 37, 41, 44, 46, 53, 55, 491, 38],
 1: [1, 11, 2, 15, 14, 88, 275, 280, 336],
 2: [3],
 3: [4, 12],
 4: [5, 9, 68, 70, 81, 133, 136, 276, 300, 390, 424, 441],
 5: [6, 7, 173, 299, 368, 10],
 6: [13],
 7: [16, 353, 503],
 8: [17, 508, 498, 504, 487],
 9: [19, 40, 56, 63, 99],
 10: [20, 33, 251, 269, 454, 459, 252, 297, 258, 271, 125],
 11: [21],
 12: [22, 23, 25, 24, 26],
 13: [28],
 14: [29,
  43,
  54,
  60,
  65,
  106,
  109,
  113,
  128,
  151,
  241,
  261,
  334,
  339,
  344,
  356,
  364,
  378,
  414,
  429,
  435,
  446,
  453,
  469,
  472,
  474,
  484,
  335],
 15: [31],
 16: [32],
 17: [34,
  49,
  58,
  73,
  75,
  79,
  90,
  103,
  132,
  153,
  185,
  192,
  194,
  209,
  214,
  217,
  219,
  221,
  225,
  245,
  247,
  267,
  283,
  290,
  302,
  304,
  307,
  311,
  324,
  355,
  359,
  376,
  380,
  385,
  394,
  396,
  402,
  410,
  418,
  421,
  433,
  455,
  457,
  460,
  485,
  134,
  193,
  291,
  386,
  397,
  411,
  419,
  195],
 18: [

In [45]:
for c in list(cluster.keys()):
    print("\nCluster " + str(c) + " \n---\n")
    for i in cluster[c]:
        print(hdfc_corpus.loc(i)[i].message)


Cluster 0 
---

Balance in A/c XXXXXXXXXX0028 as of 16-OCT-16 EOD is INR 90,258.70 . Check A/c for current balance . Credits in A/c are subject to clearing
Balance in A/c XXXXXXXXXX0028 as of 17-OCT-16 EOD is INR 1,48,993.70 . Check A/c for current balance . Credits in A/c are subject to clearing
Balance in A/c XXXXXXXXXX0044 as of 20-OCT-16 EOD is INR 6,49,329.57 . Check A/c for current balance . Credits in A/c are subject to clearing
Balance in A/c XXXXXXXXXX0044 as of 19-OCT-16 EOD is INR 11,62,229.57 . Check A/c for current balance . Credits in A/c are subject to clearing
Balance in A/c XXXXXXXXXX0044 as of 18-OCT-16 EOD is INR 3,829.57 . Check A/c for current balance . Credits in A/c are subject to clearing
Balance in A/c XXXXXXXXXX0044 as of 16-OCT-16 EOD is INR 8,887.07 . Check A/c for current balance . Credits in A/c are subject to clearing
Balance in A/c XXXXXXXXXX0044 as of 08-OCT-16 EOD is INR 4,925.57 . Check A/c for current balance . Credits in A/c are subject to clearing