<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Read-Data" data-toc-modified-id="Read-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Read Data</a></span></li><li><span><a href="#Clean-the-answers" data-toc-modified-id="Clean-the-answers-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Clean the answers</a></span></li><li><span><a href="#LDA---Topic-Modelling" data-toc-modified-id="LDA---Topic-Modelling-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>LDA - Topic Modelling</a></span></li><li><span><a href="#Results" data-toc-modified-id="Results-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Results</a></span></li></ul></div>

#### Imports

In [1]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import ngrams
import nltk


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# The stop-word corpus must be on disk before stopwords.words() is called;
# on a fresh environment that call raises LookupError otherwise. Downloading
# here keeps this cell runnable on its own (quiet=True avoids log noise and
# the call does nothing when the package is already up to date).
nltk.download('stopwords', quiet=True)

# English stop words used to filter tokens during cleaning.
stop = stopwords.words('english')
# Shared lemmatizer / stemmer instances reused by the cleaning cells.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [2]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Associate\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Fetch the WordNet corpus; the WordNetLemmatizer used later looks words up in it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Associate\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Read Data

In [156]:
# Load the free-text survey answers. The path is relative to the notebook's
# working directory; the file's single column is headed by the full question text.
df = pd.read_csv("Marc/q20_responses.csv")
df.head()

Unnamed: 0,"Based on your experience with the content provided on the Public Roadmap, how does that information influence your sales process? Please explain."
0,I jump straight to the roadmap companion and h...
1,Again when I say something is 'coming' I can p...
2,"From an OCP perspective, we are trying to enco..."
3,As we are less feature rich then competitors l...
4,it helps me to guide customers better


In [157]:
df.columns

Index(['Based on your experience with the content provided on the Public Roadmap, how does that information influence your sales process? Please explain.'], dtype='object')

In [158]:
# The only column is headed by the full survey question; give it a short,
# attribute-friendly name. Note that index=str also converts every row label
# to a string as part of the same rename call.
question_col = 'Based on your experience with the content provided on the Public Roadmap, how does that information influence your sales process? Please explain.'
df = df.rename(index=str, columns={question_col: 'answer'})

#### Clean the answers

In [159]:
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

def clean_answer(text):
    """Normalise one free-text answer to lower-case, space-separated words.

    Every character outside a-z / A-Z (digits, punctuation, accented letters,
    line breaks) is replaced by a space, the text is lower-cased and runs of
    whitespace are collapsed to a single space.

    Parameters
    ----------
    text : str or missing value
        Raw answer. Anything that is not a string (for example the float NaN
        pandas uses for an empty CSV cell, or None) is treated as an empty
        answer.

    Returns
    -------
    str
        The cleaned text; "" when nothing alphabetic remains.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on NaN/None, which would abort a whole
        # Series.apply() because of a single unanswered row.
        return ""
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    # Only letters and spaces are left at this point, so splitting on
    # whitespace yields exactly the word tokens; no tokenizer is needed.
    return " ".join(letters_only.lower().split())

In [160]:
# Build the lookup once: membership in a set is a hash lookup, whereas testing
# against the stop-word list scans it for every single token.
stop_words = set(stop)

# Step 1: letters only, lower-cased, whitespace-normalised.
df['general_clean'] = df.answer.apply(clean_answer)
# Step 2: drop English stop words from the cleaned text.
df['remove_stopwords'] = df['general_clean'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

In [161]:
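# Stemming variant, kept for reference but disabled: the lemmatized 'fourth_iter' column is what the rest of the notebook uses.
#df['third_iter'] = df.remove_stopwords.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split() ]))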

In [162]:
def _lemmatize_as_verbs(text):
    """Lemmatize every whitespace-separated token of `text` with POS tag 'v' (verb)."""
    return ' '.join(lemmatizer.lemmatize(token, 'v') for token in text.split())


# Reduce each stop-word-free answer to verb lemmas.
df['fourth_iter'] = df.remove_stopwords.apply(_lemmatize_as_verbs)

In [163]:
# Keep only the lemmatized text; the double brackets make df_clean a one-column DataFrame rather than a Series.
df_clean = df[['fourth_iter']]

#### LDA - Topic Modelling

In [164]:
def top_n_grams(min, max, df, col, n=10):
    """Return the most frequent word n-grams found in a text column.

    Parameters
    ----------
    min, max : int
        Smallest and largest n-gram length in words, passed to
        CountVectorizer as ngram_range=(min, max). These names shadow the
        builtins inside this function; they are kept so that existing
        callers keep working.
    df : pandas.DataFrame
        Frame holding the documents.
    col : str
        Name of the text column in `df`.
    n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'index' (the n-gram) and 'frequency' (its total count over
        all rows of `df[col]`), sorted by descending frequency and limited
        to the first `n` rows.
    """
    import numpy as np  # local import: the notebook's import cell does not load numpy

    word_vectorizer = CountVectorizer(ngram_range=(min, max), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(df[col])
    # Let scipy sum the columns; the builtin sum() would add the sparse
    # matrix row by row in Python.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old installations.
    if hasattr(word_vectorizer, 'get_feature_names_out'):
        feature_names = word_vectorizer.get_feature_names_out()
    else:
        feature_names = word_vectorizer.get_feature_names()

    df_out = pd.DataFrame(frequencies,
             index=feature_names,
             columns=['frequency']).reset_index().sort_values(by = ['frequency'], ascending = False)

    return df_out.head(n)

In [165]:
# Most frequent phrases of 4 to 6 consecutive words in the lemmatized answers.
top_n_grams(4,6,df_clean, 'fourth_iter')

Unnamed: 0,index,frequency
0,ability sell well tell,1
1341,prospect want look roadmap,1
1354,provide confidence capabilities feature releas...,1
1353,provide confidence capabilities feature release,1
1352,provide confidence capabilities feature,1
1351,provide come without detail many case,1
1350,provide come without detail many,1
1349,provide come without detail,1
1348,prove microsoft still invest build dynamics,1
1347,prove microsoft still invest build,1


In [166]:
# Recompute the same 4- to 6-word phrase table and save it; to_csv also writes the row index as the first column.
top_n_grams(4,6,df_clean, 'fourth_iter').to_csv('Marc/Q20_most_repeated_phrases.csv')

In [167]:
def generate_df(model, feature_names, n_top_words):
    """Summarise each topic of a fitted model by its highest-weighted terms.

    Parameters
    ----------
    model : object exposing `components_`
        One row of term weights per topic.
    feature_names : sequence of str
        Vocabulary, indexed by the column positions of `components_`.
    n_top_words : int
        How many terms to list for each topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic: 'topic' is the space-joined top terms (heaviest
        first) and 'topic_number' is the 1-based topic position.
    """
    rows = []
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so this reversed slice walks the last
        # n_top_words positions from the largest weight downwards.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = " ".join(feature_names[idx] for idx in top_indices)
        rows.append([top_terms, topic_number])

    result = pd.DataFrame(rows)
    result.columns = ["topic", "topic_number"]
    return result


def lda_model(df, n_topic=5, n_word=5, max_features=1000, col='fourth_iter'):
    """Fit a latent Dirichlet allocation model and tabulate its topics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the pre-cleaned documents.
    n_topic : int, default 5
        Number of topics to fit.
    n_word : int, default 5
        Number of top terms listed per topic in the result.
    max_features : int, default 1000
        Vocabulary size cap passed to the vectorizer.
    col : str, default 'fourth_iter'
        Name of the text column in `df`.

    Returns
    -------
    pandas.DataFrame
        The table built by generate_df: columns 'topic' and 'topic_number'.
    """
    # random_state is fixed so repeated runs produce the same topics.
    lda = LatentDirichletAllocation(n_components=n_topic, max_iter=10,
                                    learning_method='online', learning_offset=10., random_state=42)
    # NOTE(review): LDA is a model of word counts; scikit-learn's own topic
    # extraction example feeds it CountVectorizer output and reserves tf-idf
    # for NMF. Left as tf-idf here so existing results do not change.
    # Terms must appear in at least 3 documents and in at most 95% of them.
    tfid = TfidfVectorizer(max_df=0.95, min_df=3, max_features=max_features)
    tfidf_text = tfid.fit_transform(df[col])
    lda.fit(tfidf_text)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old installations.
    if hasattr(tfid, 'get_feature_names_out'):
        tfidf_feature_names = tfid.get_feature_names_out()
    else:
        tfidf_feature_names = tfid.get_feature_names()

    return generate_df(lda, tfidf_feature_names, n_word)

#### Results

In [181]:
# Fit 10 topics and describe each one by its 6 highest-weighted terms.
df_result = lda_model(df_clean,10, 6)

In [182]:
df_result

Unnamed: 0,topic,topic_number
0,take product information cloud help roadmap,1
1,customer much come feature allow future,2
2,ahead release date need go better,3
3,need able feature sell roadmap influence,4
4,product customers investment plan help customer,5
5,vision microsoft come point say application,6
6,might use specific feature better interest,7
7,customers roadmap detail show understand time,8
8,customers better help detail us give,9
9,content help include useful months provide,10


In [180]:
# NOTE(review): the file name says "top_6_topics", but df_result comes from
# lda_model(df_clean, 10, 6), i.e. 10 topics of 6 words each. Run the cells in
# order before exporting so the file matches the table shown above.
df_result.to_csv('Marc/Q20_top_6_topics.csv')