<a href="https://colab.research.google.com/github/sethjmarcus/seth_tiqc/blob/master/summary_of_responses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse.linalg import svds
import networkx
from gensim.summarization import summarize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Needed for networkx
# !pip install --upgrade scipy networkx

In [3]:
# Load the "Form Responses 1" sheet of each survey workbook into its own
# DataFrame: two post-internship exports and one pre-internship export.
post_survey_1, post_survey_2, pre_survey = (
    pd.read_excel(workbook_path, sheet_name="Form Responses 1")
    for workbook_path in (
        '/content/Post-InternshipServeyResponses.xlsx',
        '/content/TIQC_Post-InternshipSurveyResponses.xlsx',
        '/content/Pre-InternshipSurveyResponses.xlsx',
    )
)

In [54]:
# Keep only the free-text question columns that are summarised further down.
# The labels must match the spreadsheet headers exactly (note the trailing
# space in one of the pre-survey headers).
post_survey_1 = post_survey_1.loc[:, [
    'What did you gain from your internship?',
    'Did you feel adequately prepared for your internship? What components of your education aided you or do you wish you had more of?',
    'Compile a list of three outcomes from your internship. Are these the ones you were looking for?',
    'What three words would you use to summarize your internship?',
]]
post_survey_2 = post_survey_2.loc[:, [
    'What key skills you think you have developed through this internship?',
]]
pre_survey = pre_survey.loc[:, [
    'What do you want to achieve from this internship opportunity?',
    'Do you believe you are sufficiently prepared for your internship experience? Why or why not? ',
    'List three outcomes you hope to attain by the end of your internship.',
    'What three skills will you bring to the internship?',
]]

# Collapse every selected column into a single string so that all answers to
# one question can be summarised together. Each dict maps the column label to
# the concatenated, whitespace-normalised text of that column.
post_survey_1_strings = {}
post_survey_2_strings = {}
pre_survey_strings = {}

for df, df_string in zip([post_survey_1, post_survey_2, pre_survey], [post_survey_1_strings, post_survey_2_strings, pre_survey_strings]):
  for colname in df.columns:
    # str.join() raises TypeError on anything that is not a str, so drop
    # missing answers (blank cells) and coerce the rest (e.g. purely numeric
    # answers) to text before joining.
    answers = df[colname].dropna().astype(str)
    text = ' '.join(answers)
    # Replace line breaks inside answers with spaces, then squeeze runs of
    # spaces down to one and trim the ends.
    text = re.sub(r'\n|\r', ' ', text)
    text = re.sub(r' +', ' ', text)
    df_string[colname] = text.strip()

In [5]:
# Basic Text Pre-processing
stop_words = nltk.corpus.stopwords.words('english')
def normalize_document(doc):
    """Return a cleaned, lower-cased, stopword-free version of `doc`.

    Steps: remove every character that is not an ASCII letter or whitespace,
    lower-case and strip the text, tokenize it with `nltk.word_tokenize`,
    drop tokens found in the module-level `stop_words` list, and join the
    remaining tokens with single spaces.

    Parameters
    ----------
    doc : str
        Raw text (here: a single sentence).

    Returns
    -------
    str
        The normalised text; may be empty if nothing survives filtering.
    """
    # Remove special characters and digits. The flags MUST be passed by
    # keyword: the fourth positional argument of re.sub() is `count`, so
    # passing `re.I|re.A` positionally silently capped the number of
    # replacements instead of setting the flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [6]:
def low_rank_svd(matrix, singular_count=2):
    """Compute a truncated SVD of `matrix` with `scipy.sparse.linalg.svds`.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        Matrix to decompose.
    singular_count : int, default 2
        Number of singular values/vectors requested (passed to `svds` as `k`).

    Returns
    -------
    tuple
        `(u, s, vt)` exactly as returned by `svds`.
    """
    return tuple(svds(matrix, k=singular_count))

In [113]:
def _write_summary(directory, key, lines):
  """Write `lines` (one per line) to `<directory><key>.txt`, creating `directory` if missing."""
  import os  # local import: `os` is not imported at the top of the notebook
  os.makedirs(directory, exist_ok=True)
  with open(directory + key + ".txt", "w+") as summary_file:
    summary_file.write('\n'.join(lines))


def pipeline(input_string, key, n_lines):
  """Summarise `input_string` extractively with LSA and TextRank and save both.

  The text is split into sentences, each sentence is normalised with
  `normalize_document`, and a TF-IDF document-term matrix is built from the
  normalised sentences. Two summaries are then produced:

  * Latent Semantic Analysis: sentences are scored by a salience measure
    derived from a truncated SVD of the term-document matrix; the selected
    sentences are written in their original order to
    `/content/summary_files/LSM/<key>.txt`.
  * TextRank: PageRank is run on the sentence-similarity graph
    (`dt_matrix @ dt_matrix.T`); the selected sentences are written in
    descending score order to `/content/summary_files/text_rank/<key>.txt`.

  Parameters
  ----------
  input_string : str
      The text to summarise.
  key : str
      Base name (without extension) of the two output files.
  n_lines : int
      Maximum number of sentences per summary. If the text has fewer
      sentences, all of them are used.

  Raises
  ------
  ValueError
      If the TF-IDF matrix is too small for a truncated SVD (fewer than two
      sentences or fewer than two distinct terms).
  """
  sentences = nltk.sent_tokenize(input_string)
  norm_sentences = [normalize_document(sentence) for sentence in sentences]

  # Text Representation with Feature Engineering
  tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
  dt_matrix = tv.fit_transform(norm_sentences).toarray()
  td_matrix = dt_matrix.T

  # Never ask for more sentences than exist; otherwise the TextRank
  # selection below would index past the end of the ranking.
  num_sentences = min(n_lines, len(sentences))

  # Latent Semantic Analysis
  # svds() requires 1 <= k < min(matrix.shape), so bound the topic count by
  # BOTH the number of terms and the number of sentences.
  num_topics = min(8, min(td_matrix.shape) - 1)
  if num_topics < 1:
    raise ValueError(
        "Need at least two sentences and two distinct terms to summarise; "
        "got a term-document matrix of shape %s" % (td_matrix.shape,))

  term_topic_mat, singular_values, topic_document_mat = low_rank_svd(
      td_matrix, singular_count=num_topics)

  # remove singular values below threshold
  sv_threshold = 0.5
  min_sigma_value = max(singular_values) * sv_threshold
  singular_values[singular_values < min_sigma_value] = 0

  # One salience score per sentence: sqrt(sum_k sigma_k^2 * vt[k, j]^2)
  salience_scores = np.sqrt(np.dot(np.square(singular_values),
                                   np.square(topic_document_mat)))

  # Highest-scoring sentences, restored to document order
  top_sentence_indices = (-salience_scores).argsort()[:num_sentences]
  top_sentence_indices.sort()

  # Use the `key` argument for the file name, not a global loop variable.
  _write_summary("/content/summary_files/LSM/", key,
                 [sentences[index] for index in top_sentence_indices])

  # TextRank
  # Build Similarity Matrix
  similarity_matrix = np.matmul(dt_matrix, dt_matrix.T)

  # Build Similarity Graph
  similarity_graph = networkx.from_numpy_array(similarity_matrix)

  # Get Sentence Importance Scores
  scores = networkx.pagerank(similarity_graph)
  ranked_sentences = sorted(((score, index) for index, score
                             in scores.items()),
                            reverse=True)
  top_sentence_indices = [index for _, index in ranked_sentences[:num_sentences]]

  _write_summary("/content/summary_files/text_rank/", key,
                 [sentences[index] for index in top_sentence_indices])

In [114]:
# Summarise every question of every survey. Each pair is (column-name ->
# text mapping, maximum number of sentences per summary). The last character
# of the column name is dropped before it is handed to pipeline() as the key.
for survey_strings, line_budget in ((pre_survey_strings, 50),
                                    (post_survey_1_strings, 5),
                                    (post_survey_2_strings, 20)):
  for k, survey_text in survey_strings.items():
    pipeline(survey_text, k[:-1], line_budget)

