In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from stemming.porter2 import stem

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tag.perceptron import PerceptronTagger

from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# tagger = PerceptronTagger()
# tagset = None
# def remove_grammar(review):
#     sentences = nltk.sent_tokenize(review)
#     sentences = [nltk.word_tokenize(sent) for sent in sentences]
#     print('sentences',sentences)
#     result_review = []
#     for sentence in sentences:
#         tagged_review = nltk.tag._pos_tag(sentence, tagset, tagger)
#         cp = nltk.RegexpParser(grammar)
#         result = cp.parse(tagged_review)
#         result_review.append(traverseTree(result))
#     return ''.join([word for word in result_review])

# def clean(text):
#     text = text.lower()
#     string = ''
    
#     text=re.sub(r"[-()\"#!@$%^&*{}?.,:]"," ",text)
#     text=re.sub(r"\s+"," ",text)
#     text=re.sub('[^A-Za-z0-9]+',' ', text)
    
#     text = remove_grammar(text)
    
#     for word in text.split():
#         if word not in stopwords:
#             string += WordNetLemmatizer().lemmatize(word, pos='v')+ ' '
    
#     return string



In [4]:
# hi = 'hi im trying my best'

# print(clean(hi))

In [5]:
tagger = PerceptronTagger()
tagset = None
stop = nltk.corpus.stopwords
wordnet_lemmatizer = WordNetLemmatizer()

grammar = '''REMOVE: {<PRP><VBP>?<VBG><TO>?}
                         {<PRP><MD><VB><TO>}
                         {<VBZ><DT><JJ>}
                         {<MD><DT><NN>}
                         {<NNP><PRP><VBP>}
                         {<MD><PRP>}
                         {<NNP><PRP><VBP>}
                         {<WDT><MD>}
                         {<PRP><VBP><VBG><VB><DT>}
                         {<VBZ><DT><JJ>}
                         {<VBZ><EX><NN><PRP><VBP><TO><VB>}
                         {<DT><VBZ>}
                         {<PRP><VBP><VBG><TO>}
                         {<MD><VB><TO><VB>}
                         {<VBZ><EX><DT>}
                         {<VB><TO>}
                         {<VBZ>}
                         {<DT>}
                         {<EX>}
                         {<PRP><VBP>}
                         {<CD>}
                         {<PRP\$>}
                         {<PRP>}
                         {<TO>}
                         {<IN>}
                         {<VBP>}
                         {<CC>}
              '''

def stem_doc(x):
    red_text = [stem(word.strip()) for word in x.split(" ") if word.strip()!='']
    return ' '.join(red_text)

def lem(x):
    try:
        return wordnet_lemmatizer.lemmatize(x,pos='v')
    except:
        return x
        
def remove_url(x):
    return re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', x)

def cleanse_text(text):
    if text:
        text = remove_url(text)
        addl_txt = addl_clean_words(text)
        red_text = clean_words(addl_txt)
        
        no_gram = red_text
        try:
            no_gram = remove_grammar(red_text)
        except:
            no_gram = red_text
    
        #clean = ' '.join([i for i in no_gram.split() if i not in stop])
        if no_gram:
            clean = ' '.join([i for i in no_gram.split()])
            red_text = [lem(word) for word in clean.split(" ")]
            red_text = [stem(word) for word in clean.split(" ")]
            return clean_words(' '.join(red_text))
        else:
            return no_gram
    else:
        return text

def cleanse_text_guided(text):
    if text:
        text = remove_url(text)
        addl_txt = addl_clean_words(text)
        red_text = clean_words_guided(addl_txt)
        
        no_gram = red_text
        try:
            no_gram = remove_grammar(red_text)
        except:
            no_gram = red_text
    
        #clean = ' '.join([i for i in no_gram.split() if i not in stop])
        if no_gram:
            clean = ' '.join([i for i in no_gram.split()])
            red_text = [lem(word) for word in clean.split(" ")]
            red_text = [stem(word) for word in clean.split(" ")]
            return clean_words(' '.join(red_text))
        else:
            return no_gram
    else:
        return text

        
def addl_clean_words(words):
    # any additional data pre-processing
    words = words.replace('can\'t','cannot')
    words = words.replace('won\'t','would not')
    words = words.replace('doesn\'t','does not')
    return words
    
def clean_words(words):
    if words:
        words = remove_email(words)
        words = words.replace('\t',' ')
        words = words.replace(',',' ')
        words = words.replace(':',' ')
        words = words.replace(';',' ')
        words = words.replace('=',' ')
        #words = words.replace('\x92','') # apostrophe encoding
        words = words.replace('\x08','\\b') # \b is being treated as backspace
        #words = ''.join([i for i in words if not i.isdigit()])
        words = words.replace('_',' ')
        words = words.replace('(',' ')
        words = words.replace(')',' ')
        words = words.replace('+',' ')
        words = words.replace('-',' ')
        words = words.replace('`',' ')
        words = words.replace('\'',' ')
        words = words.replace('.',' ')
        words = words.replace('#',' ')
        words = words.replace('/',' ')
        words = words.replace('_',' ')
        words = words.replace('"',' ')
        return words.strip()
    return words

def clean_words_guided(words):
    if words:
        words = remove_email(words)
        words = words.replace('\t',' ')
        words = words.replace(',',' ')
        words = words.replace(':',' ')
        words = words.replace(';',' ')
        words = words.replace('=',' ')
        #words = words.replace('\x92','') # apostrophe encoding
        words = words.replace('\x08','\\b') # \b is being treated as backspace
        #words = ''.join([i for i in words if not i.isdigit()])
        words = words.replace('_',' ')
        words = words.replace('(',' ')
        words = words.replace(')',' ')
        words = words.replace('+',' ')
        words = words.replace('-',' ')
        words = words.replace('`',' ')
        words = words.replace('\'',' ')
        words = words.replace('.',' ')
        words = words.replace('#',' ')
        words = words.replace('/',' ')
        words = words.replace('_',' ')
        words = words.replace('"',' ')
        words = words.replace("'",' ')
        return words.strip()
    return words

    
def remove_grammar(review):
    sentences = nltk.sent_tokenize(review)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    result_review = []
    for sentence in sentences:
        if sentences.strip():
            tagged_review = nltk.tag._pos_tag(sentence, tagset, tagger)
            cp = nltk.RegexpParser(grammar)
            result = cp.parse(tagged_review)
            result_review.append(traverseTree(result))
    return ''.join([word for word in result_review])
    
# Remove email
def remove_email(words):
    mod_words = ''
    if words:
        if words.strip():
            for word in words.split(' '):
                if (word.strip().lower()=='email') or (word.strip().lower()=='phn') or (word.strip().lower()=='phone') or (len(word.strip())<=1):
                    continue
                elif not re.match(r"[^@]+@[^@]+\.[^@]+", word.lower()):
                    mod_words = mod_words+' '+word
                #else:   
    else:
        return words
    return mod_words.strip()

In [17]:
cleanse_text(hi)

'hi im tri my best'

In [6]:
import os

os.getcwd()

'/Users/admin/Desktop/tryhard/marketAnalysis/NLPFinalClust/Isita'

In [7]:
df1 = pd.read_csv('../RawClustersOldBios/okcupid_profiles.csv')

In [8]:
df1

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,about me: i would love to think that i was so...,currently working as an international agent fo...,making people laugh. ranting about a good salt...,"the way i look. i am a six foot half asian, ha...","books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,i am a chef: this is what that means. 1. i am ...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,,,i am very open and will share just about anyth...,
2,38,available,m,straight,thin,anything,socially,,graduated from masters program,,...,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement conversation creation contemplation t...,,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ..."
3,23,single,m,straight,thin,vegetarian,socially,,working on college/university,white,...,i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,"bataille, celine, beckett. . . lynch, jarmusch...",,cats and german philosophy,,,you feel so inclined.
4,29,single,m,straight,athletic,,socially,never,graduated from college/university,"asian, black, other",...,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at: http://bagsbrown....,i smile a lot and my inquisitive nature,"music: bands, rappers, musicians at the moment...",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,59,single,f,straight,,,socially,never,graduated from college/university,,...,"vibrant, expressive, caring optimist. i love b...",the happiest times have been when life came to...,i make an outstanding osso bucco. i am also ve...,"i am told that people notice my smile, eyes an...",i am an avid movie watcher and follow the broa...,"my family, my dog, italy, words and music!",writing my book.,"running with my dog, finishing up the work wee...",i have a dream to sing at the alconquin in nyc...,you are seeking a long term connection of shar...
59942,24,single,m,straight,fit,mostly anything,often,sometimes,working on college/university,"white, other",...,i'm nick. i never know what to write about mys...,currently finishing school for film production...,"filmmaking, photography, graphic design, web d...","dude, i don't know.","movies: hook (the greatest adventure ever!), g...",iphone contact lenses headphones camera tv rem...,i do most of my thinking on the bus to/from wo...,"bringin' home bacon, or drinking and shakin'!",when i was 18 i got a tattoo of waldo somewher...,meh if you made it this far you might as well.
59943,42,single,m,straight,average,mostly anything,not at all,never,graduated from masters program,asian,...,"hello! i enjoy traveling, watching movies, and...","i'm a civil engineer, who enjoys helping the c...",- looking at things objectively - getting thin...,i'm quiet until i get used to the environment ...,"last book: ""game change"". movies: bourne serie...",- iphone - friends and family - internet - bay...,"aside from work, how to improve my home.",out enjoying friendly conversation over dinner.,please let me think about this more.,we have similar interests.
59944,27,single,m,straight,athletic,mostly anything,socially,often,working on college/university,"asian, black",...,"""all i have in this world are my balls and my ...","following my dreams... ""you got a dream... you...",listening,it used to be the hair until i mowed it off bu...,where to begin musically: right now i listen t...,"music, family, friends, a basketball, hoop, so...",what can i do to make someone chuckle....,what i would do on any other day. everydays a ...,i like walking around in other people's house ...,you are interested and interesting...


In [9]:
df1 = df1.iloc[:1000]
df1 = df1.rename(columns={"essay0": "Bios"})


In [10]:
df1['Bios']=df1.Bios.astype('U').apply(cleanse_text)

# features = vec.transform(df['bio'].values.astype('U'))

In [11]:
df1['Bios']

0      about me would love to think that was some som...
1      am chef this is what that mean am workahol lov...
2      not asham of much but write public text on an ...
3                       work in librari and go to school
4      hey how it going? current vagu on the profil k...
                             ...                        
995    new to sf! am fun love easi go kinda guy with ...
996    use to be abl to say with impun weirder than y...
997    detroit nativ still retain some of my detroit ...
998                                                  nan
999    ve live here for the last 14 year love get awa...
Name: Bios, Length: 1000, dtype: object

In [12]:
vectorizer = CountVectorizer(analyzer='word',stop_words='english',decode_error='ignore',binary=True)    
    
counts = vectorizer.fit_transform(df1['Bios'])

In [13]:
from sklearn.metrics import silhouette_samples,silhouette_score, davies_bouldin_score
from sklearn.cluster import AgglomerativeClustering
n = 30

for cluster_num in range(2,n):
#     kmeans = KMeans(n_clusters=10,n_jobs=-1)
    
    hac = AgglomerativeClustering(n_clusters = cluster_num)
    labels = hac.fit_predict(counts.toarray())
    sil = silhouette_score(counts.toarray(), labels)
    db = davies_bouldin_score(counts.toarray(), labels)
    print('clus {}: {}, {}'.format(cluster_num, sil, db))

clus 2: 0.27849499950111306, 4.99566313259583
clus 3: 0.03440514767888696, 6.632290532574394
clus 4: 0.034895119647320276, 5.594897297968977
clus 5: 0.036440340963059696, 6.544000310761111
clus 6: 0.03663063626267732, 5.480517894083516
clus 7: 0.03688640957893574, 4.807724811713794
clus 8: 0.03725654790647779, 4.149590245778635
clus 9: 0.03734397656484661, 3.911045876418743
clus 10: 0.03756919131420898, 3.600099547284003
clus 11: 0.037977228031562435, 3.2336130393067637
clus 12: 0.03804017827529614, 3.028537394013693
clus 13: 0.03813315502881136, 3.042295947674904
clus 14: 0.038173961130145546, 2.910940217168783
clus 15: 0.038224725190175265, 2.848183193002145
clus 16: 0.03844936439333215, 2.683293704642785
clus 17: 0.03887259894204109, 2.4993739691532464
clus 18: 0.03899393863232745, 2.5763247545190726
clus 19: 0.03943953964647562, 2.4179613124728
clus 20: 0.03984123099129409, 2.2775423142091276
clus 21: 0.039918259174490205, 2.256680213688468
clus 22: 0.039968731947410234, 2.25554834

In [14]:
for cluster_num in range(2,n):
    kmeans = KMeans(n_clusters=10,n_jobs=-1)
    
#     hac = AgglomerativeClustering(n_clusters = cluster_num)
    labels = kmeans.fit_predict(counts.toarray())
    sil = silhouette_score(counts.toarray(), labels)
    db = davies_bouldin_score(counts.toarray(), labels)
    print('clus {}: {}, {}'.format(cluster_num, sil, db))



clus 2: 0.10045689832412881, 4.058856102182595




clus 3: 0.09295802897266554, 3.480459811428029




clus 4: 0.11777910525164324, 2.529772726971932




clus 5: 0.21912854226826023, 1.5715482497210909




clus 6: 0.13895436065934155, 2.536001569413954




clus 7: 0.1826946569954104, 2.656457548336255




clus 8: 0.10449292103043656, 2.5745870949866




clus 9: -0.023658001499357133, 1.678407559539594




clus 10: 0.11991696443664915, 2.9210571269866836




clus 11: 0.1312173799661905, 2.5907887709488966




clus 12: 0.02591516713190867, 3.6749433627406476




clus 13: 0.061705481912863604, 3.7550467286849902




clus 14: 0.13371742246586035, 2.229168988130211




clus 15: 0.12381869886274631, 2.5537943963175365




clus 16: -0.26987320566263906, 1.7200686459594194




clus 17: 0.08223098456752648, 2.4909933530322252




clus 18: 0.2242694421512048, 1.5725645534326698




clus 19: 0.0852418234539182, 2.582142933378408




clus 20: 0.21732880404861152, 1.6146443080295174




clus 21: 0.23660133294800947, 1.59531342803372




clus 22: 0.14174715084659678, 2.2757298327835587




clus 23: 0.17984995287096614, 2.3168642282840817




clus 24: 0.10012182630894989, 3.7173145715144145




clus 25: 0.10359903417317455, 2.5955780415242127




clus 26: 0.09521208450532363, 2.739241516596874




clus 27: 0.22944784107252045, 1.594627761579465




clus 28: 0.11325801608135065, 2.5593000113672963




clus 29: 0.22929867263940354, 1.591178780288207


In [23]:
n = 20
hac = AgglomerativeClustering(n_clusters = n)
labels1 = hac.fit_predict(counts.toarray())
df1['cluster_num'] = labels

In [16]:
os.mkdir('nlp_clus_okcupid_count')

In [24]:
def please(df, labels, n, someWord):
    #df_ori = pd.read_csv(filepath)
    df['cluster_num'] = labels
    #os.mkdir(os.getcwd() + '/' + someWord)
    
    
    for cluster in range(n):
        yes = df[df['cluster_num'] == cluster]
        save = yes['Bios']
        filename = '{}/{}.csv'.format(someWord, str(cluster))
        save.to_csv(filename)
    return df


In [25]:
please(df1,labels1,n,'nlp_clus_okcupid_count')

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,cluster_num
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,currently working as an international agent fo...,making people laugh. ranting about a good salt...,"the way i look. i am a six foot half asian, ha...","books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...,3
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,,,i am very open and will share just about anyth...,,4
2,38,available,m,straight,thin,anything,socially,,graduated from masters program,,...,"i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement conversation creation contemplation t...,,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ...",3
3,23,single,m,straight,thin,vegetarian,socially,,working on college/university,white,...,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,"bataille, celine, beckett. . . lynch, jarmusch...",,cats and german philosophy,,,you feel so inclined.,2
4,29,single,m,straight,athletic,,socially,never,graduated from college/university,"asian, black, other",...,work work work work + play,creating imagery to look at: http://bagsbrown....,i smile a lot and my inquisitive nature,"music: bands, rappers, musicians at the moment...",,,,,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,26,single,m,straight,fit,,socially,never,graduated from college/university,white,...,i earned a ba in media communications from poi...,"pool, ping pong, bowling, sports, singing...it...",i have glasses and i smile a lot,"movies: inglorious basterds, shutter island, t...",...in no particular order my dog friends movie...,,hanging out with friends at the bar playing po...,,"you're interested... and if you spell ""defini...",4
996,40,seeing someone,m,straight,average,,socially,sometimes,graduated from college/university,white,...,walking around a big city. waiting to see if ...,answering invasive questionnaires. creating s...,jesus christ you're a fucking freak. most peo...,a) pablum words b) pablum weird c) anything li...,"the great buddha once said: ""need is not my fa...",things to think about. the inherent value and...,yes. i am. i really am. i wouldn't lie about...,i have degrees in biology and anthropology. so...,"you found my pet rock. you like to shout ""chi...",4
997,31,single,f,straight,,mostly vegetarian,socially,,,"middle eastern, white",...,massage therapist and yoga teacher in training...,,"first that i'm pocket-size, and the ever so un...","books: burgess. burroughs, carroll, dahl, kero...",bike/legs music/book friends/family air/water/...,how did i get here? how do i work this? where ...,,,,2
998,34,single,f,straight,fit,,often,,working on space camp,white,...,,,,,,,,,,2


In [64]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word',stop_words='english',decode_error='ignore',binary=True)    
    
tf_counts = vectorizer.fit_transform(df1['Bios'])

n = 30

for cluster_num in range(2,n):
#     kmeans = KMeans(n_clusters=10,n_jobs=-1)
    
    hac = AgglomerativeClustering(n_clusters = cluster_num)
    labels = hac.fit_predict(counts.toarray())
    sil = silhouette_score(counts.toarray(), labels)
    db = davies_bouldin_score(counts.toarray(), labels)
    print('clus {}: {}, {}'.format(cluster_num, sil, db))

clus 2: 0.12207304161333878, 0.9169925009260752
clus 3: 0.14429030773375967, 1.5245901847120844
clus 4: 0.17274515923500525, 2.1087251289715496
clus 5: 0.2127270774327061, 2.0606431798697527
clus 6: 0.24732877171611725, 1.8692173412589843
clus 7: 0.2976040439340036, 1.6444848045985803
clus 8: 0.3434042424101895, 1.509522569324341
clus 9: 0.40164208534275736, 1.3334035594742384
clus 10: 0.46090134460201665, 1.1863830843825078
clus 11: 0.5096317295574622, 1.1167421529952601
clus 12: 0.5506100811091011, 1.09399865733298
clus 13: 0.5793906360264066, 1.1746593959851357
clus 14: 0.6066628043517414, 1.141135341777687
clus 15: 0.6117350543368423, 1.1308779203627994
clus 16: 0.6159718329068953, 1.0533045642716796
clus 17: 0.6671871205232816, 0.9937706356137612
clus 18: 0.6949958147929114, 0.9717567761955207
clus 19: 0.6992387466269874, 0.9120895362152738
clus 20: 0.7017572914435898, 0.8189064642990621
clus 21: 0.7052216161067173, 0.7582423029123139
clus 22: 0.7075676785289413, 0.713517271308251

In [65]:
n = 13
hac = AgglomerativeClustering(n_clusters = n)
labels1 = hac.fit_predict(counts.toarray())
df1['cluster_num'] = labels

os.mkdir('nlp_clus_tfidf')
please(df1,labels1,n,'nlp_clus_tfidf')

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Bios,Year,Major_Dept,Ambition,Goals,Music,Sports,Friday_Nights,School_balance,Movie_Genres,Social_Style,Transfer,cluster #,cluster_num
0,4,4,Internet nerd Award win coff trailblaz Social ...,3,2,1,1,7,1,2,1,8,2,1,3,1
1,10,10,Pop cultur trailblaz Coff maven Award win beer...,1,7,2,1,10,7,2,2,8,1,1,3,0
2,22,22,Problem solver Avid music buff Beer trailblaz ...,3,5,2,1,5,7,2,3,5,3,1,3,5
3,43,43,Evil social media lover Student Amateur thinke...,4,1,2,1,4,3,2,2,9,3,1,3,7
4,50,50,Friend problem solver Troublemak Subt charm so...,1,5,2,1,5,7,2,3,9,1,1,3,10
5,75,75,Beer geek Unapologet zombi ninja Award win mus...,3,4,2,1,6,10,2,2,4,3,1,3,1
6,88,88,Evil social media lover Student Amateur thinke...,2,2,3,1,9,1,2,1,9,1,1,3,7
7,95,95,Friend problem solver Troublemak Subt charm so...,3,7,3,1,4,3,2,2,4,1,1,3,10
8,113,113,Pop cultur geek Friend troublemak Student Weba...,3,2,1,1,4,9,2,2,3,3,1,3,1
9,115,115,Pop cultur trailblaz Coff maven Award win beer...,4,7,1,1,4,5,2,3,4,2,1,3,0
