In [1]:
import csv
import string
import numpy as np
import math
from nltk.corpus import stopwords

In [2]:
words = {}
wordIndex = 0
punctuations = string.punctuation + '\n'

In [3]:
stop = set(stopwords.words('english'))

In [4]:
def getProperSentence(sentence):
    returnSentence = sentence.lower()
    replace_punctuation = string.maketrans(punctuations, ' '*len(punctuations))
    returnSentence = returnSentence.translate(replace_punctuation)
    return returnSentence

In [5]:
messageList = []
cleanedMessageList = []

In [6]:
def populateMessageList():
    with open('train.csv','rb') as train:
        reader = csv.reader(train)
        for row in reader:
            message =row[4]
            messageList.append(message)
            
populateMessageList()

def populateCleanedMessageList():
    with open('train.csv','rb') as train:
        reader = csv.reader(train)
        for row in reader:
            message =row[4]
            cleanedMessageList.append(getProperSentence(message))
            
populateCleanedMessageList()

In [7]:
wordsIndex = {}
wordCount = 0
wordCountForIdf = {}

In [8]:
def populateWordsIndex():
    global wordCount
    for message in cleanedMessageList:
        for word in message.split(' '):
            if not word.isdigit():
                if word not in wordsIndex:
                    wordsIndex[word] = wordCount
                    wordCount += 1

populateWordsIndex()

In [9]:
def populateWordCountForIdf():
    for word in wordsIndex:
        count = 0
        for message in cleanedMessageList:
            if word in message:
                count += 1
        wordCountForIdf[word] = count
        
populateWordCountForIdf()

In [10]:
stopWordsRemovedCount = 0
stopWordsRemovedIndex = {}
for word in wordsIndex:
    if word not in stop:
        stopWordsRemovedIndex[word] = stopWordsRemovedCount
        stopWordsRemovedCount += 1


In [11]:
def tf(givenWord, message): #Assuming cleaned message
    count = 0
    for word in message.split(' '):
        if givenWord == word:
            count += 1.0 
    return count

def idf(givenWord):
    N = len(messageList)
    n = wordCountForIdf[givenWord]
    return math.log(N*1.0/(1+n))

In [12]:
def makeFeatureVector(sentence):
    tempSen = sentence
    retVal = np.zeros(wordCount)
    for word in tempSen.split(' '):
        if word.isdigit():
            continue
        indexOfWord = wordsIndex[word]
        retVal[indexOfWord] = tf(word,tempSen)*idf(word)
    return retVal

def makeFeatureVectorWithoutStopwords(sentence):
    tempSen = sentence
    retVal = np.zeros(stopWordsRemovedCount)
    for word in tempSen.split(' '):
        if word.isdigit() or word in stop:
            continue
        indexOfWord = stopWordsRemovedIndex[word]
        retVal[indexOfWord] = tf(word,tempSen)*idf(word)
    return retVal
    

In [13]:
def returnFeatureSet(makeFeatureFunction):
    index = 0
    ret = np.array([])
    for message in cleanedMessageList:        
        featureVector = makeFeatureFunction(message)
#             print featureVector
        if index == 0:
            ret = np.hstack((ret,featureVector))
            index = index + 1
        else:
            ret = np.vstack((ret, featureVector))
    return ret

In [14]:
def returnCovarianceMatrix(makeFeatureFunction, sizeOfMatrix):
    ret = np.zeros([sizeOfMatrix, sizeOfMatrix])
    for message in cleanedMessageList:        
        featureVector = makeFeatureFunction(message)
        b = np.array([featureVector])
        ret += b.T * b
    return ret*1.0/len(cleanedMessageList)

In [15]:
featureVectorSet = returnFeatureSet(makeFeatureVector)
stopwordLessFeatureVectorSet = returnFeatureSet(makeFeatureVectorWithoutStopwords)

In [16]:
from sklearn.cluster import KMeans

In [17]:
kmeans = KMeans(n_clusters=20, random_state=0).fit(stopwordLessFeatureVectorSet)

In [18]:
def getClusters(clusterIndices, messageList, requiredIndex):
    retVal = []
    for i in range(len(clusterIndices)):
        if clusterIndices[i] == requiredIndex:
            retVal.append(messageList[i])
    return retVal

In [19]:
getClusters(kmeans.labels_,cleanedMessageList,3)

['dear krishna deepak  thanks for booking with us  reach your oyo  reach jss hospital main gate  head east  16 m  turn left onto mg road and pass by hotel malgudi on the left for 160 m  then take a u turn at shankar mutt road and you will spot oyo on your left  28m    map link  http   bit ly mys028 hotel reception contact  04038416830']

In [20]:
[(i,len(getClusters(kmeans.labels_,messageList,i))) for i in range(20)]

[(0, 1),
 (1, 13),
 (2, 384),
 (3, 1),
 (4, 2),
 (5, 4),
 (6, 2),
 (7, 11),
 (8, 12),
 (9, 38),
 (10, 4),
 (11, 31),
 (12, 2),
 (13, 2),
 (14, 16),
 (15, 15),
 (16, 17),
 (17, 1),
 (18, 3),
 (19, 71)]

In [22]:
covarianceMatrix = returnCovarianceMatrix(makeFeatureVector,wordCount)

In [28]:
s.shape


(2229,)

In [26]:
U, s, V = np.linalg.svd(covarianceMatrix)

In [32]:
def getPCAColumnCount(requiredPercentage, s):
    total = 0.0
    for i in range(len(s)):
        total += s[i]**2
    sumTillNow = 0.0
    for i in range(len(s)):
        sumTillNow += s[i]**2
        if(sumTillNow/total >= requiredPercentage):
            return i + 1


In [34]:
getPCAColumnCount(0.9,s)

104

In [65]:
def getFeaturesAfterPCA(requiredPercentage, initialFeatures, s, U):
    numberOfColumns = getPCAColumnCount(requiredPercentage, s)
    uMatrix = U[:, 0:numberOfColumns].T
    featuresT = initialFeatures.T
#     print uMatrix.shape, featuresT.shape
    return (np.matrix(uMatrix)*np.matrix(featuresT)).T
    

In [66]:
featuresAfterPCA = getFeaturesAfterPCA(0.90, featureVectorSet, s, U)

In [69]:
featuresAfterPCA = np.array(featuresAfterPCA)

In [82]:
kmeansPCAVersion = KMeans(n_clusters=30, random_state=0).fit(featuresAfterPCA)

In [85]:
getClusters(kmeansPCAVersion.labels_,cleanedMessageList,29)

['buy till sunday   get extra 300 500 off over  above all offers  www lenskart com cust or visit store 1st time ever 300 off  1st frame free  500 off  2 for 2500',
 'nitish chandra has requested rs 5000 0 from you  click on this link to pay  http   m p y tm stom',
 'faasos app code   4893   for verifying your phone number  not some secret treasure ',
 'faasos app code   1752   for verifying your phone number  not some secret treasure ',
 'faasos app code   4893   for verifying your phone number  not some secret treasure ',
 'flat 50  cashback on mirchi live with vishal   shekhar concert tickets  use code  mirchi50  get upto rs 1 000 cashback  book on paytm http   m p y tm mrch  ',
 'pantaloons fashion fiesta  get rs  500 gift coupon on shopping for rs 3000  also get 15  cashback with freecharge   mobikwik  for tc click http   bit ly 2fhwyik',
 'canara bank invites you to mega retail expo at nehru centre exh hall 1 on 12 13 nov 16 from 10am to 7pm  top builders car dealers will particip

In [84]:
[(i,len(getClusters(kmeansPCAVersion.labels_,messageList,i))) for i in range(30)]

[(0, 7),
 (1, 34),
 (2, 5),
 (3, 2),
 (4, 7),
 (5, 38),
 (6, 19),
 (7, 1),
 (8, 18),
 (9, 2),
 (10, 2),
 (11, 18),
 (12, 4),
 (13, 1),
 (14, 6),
 (15, 17),
 (16, 4),
 (17, 11),
 (18, 47),
 (19, 1),
 (20, 18),
 (21, 1),
 (22, 3),
 (23, 30),
 (24, 12),
 (25, 14),
 (26, 2),
 (27, 6),
 (28, 11),
 (29, 289)]