In [14]:
import os
import re

import numpy as np # linear algebra
import pandas as pd # data processing
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [26]:
# Location of the input CSV (the cells below read its 'title' and 'texts' columns).
# Defaults to the original local path; set the KEYWORD_EXTRACTION_CSV
# environment variable to run the notebook against a copy stored elsewhere.
path = os.environ.get(
    'KEYWORD_EXTRACTION_CSV',
    '/Users/veera/Documents/Biopython/NLP/Keyword Extraction.csv',
)
df = pd.read_csv(path)
df.head()

Unnamed: 0,index,id,title,texts,labels,forum
0,0,7,What do INFJs look for in a relationship?,"[""I'm an INFJ, and so far I have not had any l...",['Dating & Relationships'],Dating & Relationships
1,1,9,What do you prefer?,"[""I'm a Sexual-Social. I never really used Ins...","['Sexual-Socials', 'Instincts']",Sexual-Socials
2,2,11,Secure-sexuals...unite!,['Who else is a secure-sexuals out there? Post...,"['Sexual-Secures', 'Instincts']",Sexual-Secures
3,3,12,Which type makes the best actor/performer?,['I used to do theatre...there are a bunch of ...,"['General Topics', 'Career & Education']",Career & Education
4,4,14,What makes you angry,"[""What pisses of ENFJs more than anything else...","['Romantics ', 'ENFJ - Teacher', 'Personality']",ENFJ - Teacher


In [27]:

# Start from NLTK's English stopword list.
stop_words = set(stopwords.words('english'))

# Custom stopwords added on top of the NLTK list.
# The number words now run one..nine without a gap ("six" was missing).
new_words = ["fig", "figure", "image", "sample", "using",
             "show", "result", "large",
             "also", "one", "two", "three",
             "four", "five", "six", "seven", "eight", "nine"]

# Merge both sources (duplicates collapse in the set union) and store as a list.
stop_words = list(stop_words.union(new_words))



In [29]:
def pre_process(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps: lowercase, strip markup tags, replace digits and non-word
    characters with spaces, drop stopwords and tokens shorter than three
    characters, then lemmatize what remains.

    Args:
        text: Raw document text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as an empty document
            instead of raising AttributeError on ``.lower()``.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    if not isinstance(text, str):
        return ''

    # lowercase
    text = text.lower()

    # remove tags: match real <tag> markup as well as the entity-escaped
    # &lt;tag&gt; form, which was the only form the previous pattern matched.
    # A plain space is enough as replacement because the next substitution
    # collapses every non-word run to a single space anyway.
    text = re.sub(r"(?:<|&lt;)/?.*?(?:>|&gt;)", " ", text)

    # remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)

    # stop_words is a module-level list; a set makes each lookup O(1)
    stopword_set = set(stop_words)

    # drop stopwords and words shorter than three letters
    tokens = [word for word in text.split()
              if len(word) >= 3 and word not in stopword_set]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    return ' '.join(lmtzr.lemmatize(word) for word in tokens)
# Run the cleaning pipeline over every entry of the 'texts' column.
docs = df['texts'].apply(pre_process)

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the vocabulary and the document-term count matrix.
#   max_df=0.95        -> ignore terms that appear in more than 95% of documents
#   max_features=10000 -> upper bound on the vocabulary size
#   ngram_range=(1, 3) -> vocabulary holds single words, bigrams and trigrams
cv = CountVectorizer(max_df=0.95, max_features=10000, ngram_range=(1, 3))
word_count_vector = cv.fit_transform(docs)

In [31]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the IDF weights on the count matrix produced by the vectorizer.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [32]:
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, largest value first.

    Pairs are built from ``coo_matrix.col`` and ``coo_matrix.data``. Ties on
    the value are broken by the column index, also in descending order.
    """
    def value_then_column(pair):
        column, value = pair
        return (value, column)

    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=value_then_column, reverse=True)
    return pairs

In [33]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items.

    Args:
        feature_names: Sequence indexable by feature index, giving each
            feature's name.
        sorted_items: Sequence of (feature_index, score) pairs, already
            ordered best-first (e.g. the output of ``sort_coo``).
        topn: Maximum number of pairs to keep from the front of
            ``sorted_items``.

    Returns:
        dict mapping feature name -> score rounded to 3 decimals, with keys
        in the same order as ``sorted_items``.
    """
    # Single pass: look each name up once and round its score.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

In [34]:
# get feature names
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old method on older releases. Both results are indexable by feature index.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

def get_keywords(idx, docs, topn=10):
    """Return the highest-scoring tf-idf keywords for one document.

    Relies on the module-level ``cv``, ``tfidf_transformer`` and
    ``feature_names`` objects.

    Args:
        idx: Key used to fetch the document via ``docs[idx]``.
        docs: Indexable collection of pre-processed document strings.
        topn: Number of keywords to return. Defaults to 10, the value that
            was previously hard-coded.

    Returns:
        dict mapping keyword -> tf-idf score rounded to 3 decimals, ordered
        from highest to lowest score.
    """
    # generate tf-idf for the given document (transform expects an iterable of docs)
    tf_idf_vector = tfidf_transformer.transform(cv.transform([docs[idx]]))

    # sort the tf-idf vector by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # keep only the top n entries
    return extract_topn_from_vector(feature_names, sorted_items, topn)

In [36]:
def print_results(idx, keywords, df):
    """Print the title, the raw text and the keyword scores of one document.

    Args:
        idx: Key used to look up the row in ``df['title']`` and ``df['texts']``.
        keywords: Mapping of keyword -> score; printed one pair per line.
        df: Table exposing 'title' and 'texts' columns.
    """
    sections = (
        ("\n=====Title=====", 'title'),
        ("\n=====Texts=====", 'texts'),
    )
    for heading, column in sections:
        print(heading)
        print(df[column][idx])

    print("\n===Keywords===")
    for term, score in keywords.items():
        print(term, score)
# Example run: extract the keywords of the document at index 443 and print
# them together with that row's title and raw text.
idx=443
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
How to get an INFP as an INTJ?

=====Texts=====
['How to get an INFP as an INTJ', 'god this site is so shitty ignore the triple post', '.', "shes not interested bro. no idea what you look like but even if you're not uggo, INFP chicks need to be led, conquered - you just sound too high inhib for something like that", '@Powren You got this man. I\'ve never dated an INFP. I know quite a few though. Uhh, I mean, with any person you want to grow closer to, the first thing to do is to talk to them.\n\nI got a million different strategies I could give you to start this. Have you talked to her yet? If you really genuinely like her character, once you\'ve talked to her at least a few times, you can tell her "Hey, I like talking to you, can I buy you a cup of tea at Starbuck and discuss [insert preferred subject here].', 'come on he needs to listen to his gut. undeveloped intjs listen to it all the time but in the face of a possible fu-- i mean possible relationship they suddenl