In [47]:
import PyPDF2
import re
from IPython.display import display, Markdown

import os
import openai

import json
# Read the OpenAI credentials from keys.json instead of hardcoding them here.
# The file must provide the 'organization_key' and 'api_key' entries used below.
# The context manager closes the file handle as soon as the JSON has been parsed.
with open('keys.json') as credentials_file:
    credentials = json.load(credentials_file)
openai.organization = credentials['organization_key']
openai.api_key = credentials['api_key']

In [2]:
class Section:
    """A numbered transcript section with its learning objectives and notes."""

    def __init__(self, number, title, text, lo, gpt_notes = ""):
        """Store the section fields as given.

        Args:
            number: Section number (e.g. "7.1.1").
            title: Section title.
            text: Section body text.
            lo: Iterable of learning-objective strings.
            gpt_notes: Generated notes for the section; empty string by default.
        """
        self.number, self.title = number, title
        self.text, self.lo = text, lo
        self.gpt_notes = gpt_notes

    def __str__(self):
        """Render the section as Markdown: bold heading, italic objectives, notes."""
        heading = f'**{self.number} {self.title}**'
        # Each learning objective becomes its own italic paragraph.
        objectives = "\n\n".join([f'*{objective}*' for objective in self.lo])
        return "\n\n".join([heading, objectives, f'{self.gpt_notes}'])

    def display(self):
        """Show the Markdown rendering of this section in the notebook."""
        rendered = Markdown(str(self))
        display(rendered)

In [3]:
def get_pdf_content(file_path):
    """Return the extracted text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A single string holding the text of all pages, joined with no separator.
    """
    # The context manager guarantees the file is closed even when parsing or
    # text extraction raises. Pages are extracted before the file is closed.
    with open(file_path, 'rb') as pdf_file_obj:
        reader = PyPDF2.PdfReader(pdf_file_obj)
        return "".join(page.extract_text() for page in reader.pages)

def find_sections(content, week=7):
    """Split transcript text into a list of ``Section`` objects.

    A section starts at a dotted number whose first component is ``week``
    (``7.1``, ``7.1.2``, ...) and extends up to the next such number or the
    end of ``content``.

    Args:
        content: Full transcript text to scan.
        week: Leading component of the section numbers to look for.
            Defaults to 7.

    Returns:
        A list of ``Section`` instances in order of appearance. Each holds the
        section number, the title (first line after the number, stripped), the
        cleaned body with learning-objective statements removed, and the list
        of learning objectives found in that body.
    """
    number_re = rf"{re.escape(str(week))}\.\d+(?:\.\d+)*"
    # Lazily capture everything up to the next section number (or end of text).
    pattern = rf"({number_re})(.*?)(?=(?:{number_re})|$)"
    boilerplate_phrases = (
        "This element addresses the following learning objective of this course:\n",
        "This element addresses the following learning objectives of this course:",
    )

    sections = []
    for match in re.finditer(pattern, content, re.DOTALL):
        number, body = match.group(1), match.group(2)
        title = body.split("\n")[0].strip()

        # Remove only the heading occurrence of the title; later mentions of
        # the same phrase inside the body are part of the transcript.
        text = body.replace(title, '', 1)
        for phrase in boilerplate_phrases:
            text = text.replace(phrase, '')
        text = text.replace("\n", " ").replace("● ", "")
        # Collapse whole runs of spaces, not just one pair per run.
        text = re.sub(r" {2,}", " ", text).strip()

        sections.append(Section(number, title, remove_lo(text), get_lo(text)))
    return sections

def get_lo(content):
    """Extract learning-objective statements from ``content``.

    A statement runs from an ``LO<digits>:`` label up to the first ``.``,
    ``!`` or ``?``. The ``LO<digits>: `` labels are removed from each result.

    Returns:
        List of statement strings in order of appearance (empty if none).
    """
    statement_re = re.compile(r"(LO\d+:.*?[.!?])")
    label_re = re.compile(r'LO\d+: ')
    objectives = []
    for found in statement_re.finditer(content):
        objectives.append(label_re.sub('', found.group(1)))
    return objectives

def remove_lo(content):
    """Return ``content`` with every learning-objective statement deleted.

    A statement runs from an ``LO<digits>:`` label up to the first ``.``,
    ``!`` or ``?``. Surrounding whitespace is left untouched.
    """
    statement_re = re.compile(r"(LO\d+:.*?[.!?])", re.MULTILINE)
    return statement_re.sub('', content)

In [4]:
# Extract the full text of the transcript PDF into a single string.
pdf_content = get_pdf_content(
    file_path='Week 7 transcript  (updated Feb 22).pdf',
)

In [5]:
# WARNING: this rebuilds every Section from the raw text, so any gpt_notes already
# stored on the current `sections` are discarded. Run it ONLY to start over.
sections = find_sections(content=pdf_content)

In [6]:
# Ask the chat model for notes on each section that still needs them. Sections
# that already carry notes are skipped, so re-running only fills in the rest.
for section in sections:
    # Skip sections with no transcript text, or whose text contains the
    # "on the following prompt" marker.
    if not (section.text != "" and "on the following prompt" not in section.text):
        continue
    # Skip sections whose notes were already generated.
    if section.gpt_notes != "":
        continue

    gpt_content = f"""Topic: {section.title}
        Detail Level: I would like something between high-level summary and detailed notes.
        Learning Objectives: {'; '.join(section.lo)}
        Format: I prefer the notes to be in bullet-point or table format.
        Course Transcript: {section.text}
        Purpose: The notes are for my personal understanding and to facilitate in class discussion with my peers."""
    user_message = {"role": "user", "content": gpt_content}
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[user_message],
    )
    # Keep the text of the first returned choice as this section's notes.
    section.gpt_notes = completion.choices[0].message.content

    print(section.title, "- Complete")

Data Collection Philosophy Complete
Existing or New Data Complete
Acquire and Clean Data Complete
Overview of Methods Complete
Tips on How to Choose a Method Complete
Internal and External Validity Complete
The Intersection of Data, Method, and Complete
Introduction to a Perspective from IBM’s Journey Complete
 Complete
 Complete
 Complete
Design Research to Generate Applied Insight Complete
Keep the Audience in Mind Complete
Communicate Insight, Hypotheses Testing, and the Word “Prove” Complete
Different Deliverables Complete
Different Audiences Complete
Creswell and Creswell, Chapter 10: Mixed Methods Procedures Complete
The Data Detective by Tim Harford Complete


In [39]:
# Render every section: full Markdown when notes exist, otherwise just the
# title in bold.
for section in sections:
    if section.gpt_notes != "":
        section.display()
        continue
    display(Markdown(f"**{section.title!s}**"))

**Data Collection**

**7.1.1 Data Collection Philosophy**

*Assess and select specific data and the data collection methods that best fit a specific outcome or need.*

- Earlier philosophy: collect as much data as possible and worry about what to do with it later
- Current philosophy: collect as much data as you need and not more
- Strategic collection of data can minimize unnecessary risk
- Collect data with purpose to avoid paying for unnecessary server space
- International and regional policies on data protection and privacy affect data collection philosophy
- Assess and select data collection methods that best fit specific outcome or need
- Collecting too much data requires storage and analysis of larger quantities of data
- Purposeful data collection helps avoid sifting through unnecessary data 
- Collecting only necessary data can help ensure compliance with regulations on data protection and privacy

**7.1.2 Existing or New Data**

*Assess and select specific data and the data collection methods that best fit a specific outcome or need.*

- Deciding between using existing or new data
- Trade-offs between off-the-shelf data and new data collection
- Consider the purpose of the initial data collection when deciding if existing data is sufficient
- Example of when existing data may be sufficient: auto manufacturer collects data on production and output, and wants to test a new piece of equipment for efficiency. Leadership wants to deploy it in the lines with the most potential for impact, even if it's in lower yield production lines.
- Example of when existing data may not be sufficient: major retailer has data on curbside pickup popularity but wants to consider the viability of home delivery. Can't automatically extrapolate and should consider collecting new data about users' willingness to embrace home delivery or using industry data.
- Be aware of the initial intent of why the data was collected and articulate how existing data will help answer the question.

**7.1.3 Acquire and Clean Data**

*Assess and select specific data and the data collection methods that best fit a specific outcome or need.*

Acquiring and Cleaning Data:

Acquiring Data:
- Identify the data needed for a specific outcome or need
- Determine the best data collection method (Surveys, interviews, experiments, observations, etc.)
- Consider the quality and reliability of the data source
- Obtain permission to access necessary data

Cleaning Data:
- Identify missing or incomplete data
- Remove duplicate data
- Standardize data formats (e.g., date and time formats)
- Remove outliers and errors
- Transform data to ensure the data is consistent with analysis objectives
- Ensure data accuracy and completeness
- Label variables clearly to make it easier to work with in the subsequent analysis

Assessing Data Quality:
- Completeness: Is the data missing any values or variables?
- Accuracy: How precise is the data?
- Validity: Is the data relevant and appropriate for the analysis?
- Consistency: Are there any discrepancies or errors in the data?
- Timeliness: Is the data up-to-date and relevant to the current analysis?

Overall, acquiring and cleaning data can be a time-consuming process, but it is essential to obtain reliable and usable data for analysis. It is crucial to identify specific data needs, select the best data collection method, and assess the data quality to ensure the data is accurate, complete, and valid.

**The Intent of Data Collection**

**Method Selection**

**7.3.1 Overview of Methods**

*Design and apply research questions.

Justify an analytic approach that informs decision making.*

- Research approach is the broad plan that includes worldview, design, and methods.
- Design articulates type of inquiry, such as qualitative, quantitative, or mixed, and specific procedures of the study.
- Method includes plan for data collection, analysis, and interpretation.
- Methods vary based on industry, task, and year, and specific tools or programs are not mentioned.
- Examples of methods include surveys, in-depth interviews, semi-structured interviews, structured interviews, ethnographies, experiments, observational studies, cross-sectional studies, single slice in time studies, and time series studies.
- The notes are for personal understanding and to facilitate discussion with peers. 
- Learning objectives include designing and applying research questions and justifying analytic approaches for decision making.

**7.3.2 Tips on How to Choose a Method**

*Design and apply research questions.

Justify an analytic approach that informs decision making.*

Tips on How to Choose a Research Method:

- Determine the type of answer you're looking for
- Decide between a qualitative, quantitative, or mixed-method approach
- Consider the familiarity of the audience with different methods
- Determine if statistical significance or confidence metrics are required
- Consider the need to explain the method to others
- Take into account the expected sample size
- Consider the time available to complete the project
- Evaluate existing information and determine if it can be built upon or if starting from scratch is necessary
- Let the research question and needs of the project guide the selection of the method

**7.3.3 Internal and External Validity**

*Design and apply research questions.

Justify an analytic approach that informs decision making.*

- Internal validity threats limit the validity of conclusions drawn from a study design
- Example: Experiment with human subjects where tablets are given to some students to see impact on attention span
- Diffusion is an internal validity threat where control group interacts with treatment group
- Solution may be to assign treatment at class level instead of individual level
- External validity is when sample, setting, or timing of project may limit validity of conclusions drawn
- Example: Economic downturn during project may limit extrapolation to other time periods
- External validity threats are often not within our control
- Understanding and addressing internal and external validity threats is important for justifying analytic approach and decision making in research design.

**Interview or Survey**

**7.5 The Intersection of Data, Method, and**

**

- Data, method, and detail level are interconnected in the analysis process
- The level of detail in the data and method used can impact the accuracy and reliability of the analysis
- The level of detail in the analysis should be aligned with the objectives and requirements of the project
- Data should be collected and analyzed at the appropriate level of detail to avoid overlooking important insights or introducing bias
- Different methods can be applied to the same data to achieve different levels of detail and precision
- It's important to balance the level of detail with the practicality and timeliness of the analysis process
- Collaboration and communication with stakeholders can help ensure that the level of detail and method used are appropriate and aligned with their needs
- Iterative analysis and refinement may be necessary to adjust the level of detail and method used based on new insights or feedback. 

Table Format Example:

| Key Points                                 |
|-------------------------------------------|
| Data, method, and detail level are interconnected in the analysis process |
| The level of detail in the data and method used can impact the accuracy and reliability of the analysis       |
| The level of detail in the analysis should be aligned with the objectives and requirements of the project |
| Data should be collected and analyzed at the appropriate level of detail to avoid overlooking important insights or introducing bias |
| Different methods can be applied to the same data to achieve different levels of detail and precision |
| It's important to balance the level of detail with the practicality and timeliness of the analysis process |
| Collaboration and communication with stakeholders can help ensure that the level of detail and method used are appropriate and aligned with their needs |
| Iterative analysis and refinement may be necessary to adjust the level of detail and method used based on new insights or feedback |

**7.5.1 Introduction to a Perspective from IBM’s Journey**

*Justify an analytic approach that informs decision making.

Navigate organizational, personal, legal, and ethical constraints to facilitate better decision making and improve communication.*

- Irving Wladawsky-Berger has a unique perspective on the development of business intelligence and data science at IBM
- Pay attention to his insights on early manifestations of data mining and the relationship between pattern matching and intelligence
- He also has a perspective on models as a philosophy of science and associates elegance with deductive modeling
- The question of whether elegance still matters in modeling is important and related to some of the reading for the week
- Irving is a humanist and cares about the impact of IBM's work on people and society
- The notes are for personal understanding and in-class discussion.

**7.5.2 **

*Justify an analytic approach that informs decision making.

Navigate organizational, personal, legal, and ethical constraints to facilitate better decision making and improve communication.*

- Irving Wladavsky-Berger, a retired IBM executive and strategic advisor at Citigroup, believes that the biggest impact of data science will be in people-oriented disciplines such as social sciences, healthcare, business, management, and government.
- The concept of data mining was initially used by high energy physics community to gather large amounts of data and apply sophisticated algorithms and powerful supercomputers to look for patterns to discover new particles.
- Companies started using data mining for business intelligence to find patterns in marketing, sales, and markets. The financial industry applied it for fraud detection.
- In 1997, IBM's Deep Blue supercomputer beat then-world champion Garry Kasparov in a chess match. Deep Blue was a pattern matching induction machine that evaluated probabilities of different moves.
- While applying data science to simpler decision-making processes like inventory management or fraud detection is relatively straightforward, applying it to more complex decision-making processes like healthcare diagnosis or business strategy evaluation requires modeling.
- To develop a better understanding of how people make complex decisions, new disciplines will need to emerge around decision-making.

**7.5.3 **

*Justify an analytic approach that informs decision making.

Navigate organizational, personal, legal, and ethical constraints to facilitate better decision making and improve communication.*

- Irving emphasizes that data will have a profound impact on human decision making, but it will take longer than expected.
- We need to focus on models of what people value in order to train algorithms that align with human goals.
- The problem is that human decision processes and values are complex and difficult to optimize for.
- Choosing a training set for algorithmic decision making involves engaging with basic philosophical debates about human values.
- It is impossible to avoid thinking about the relationship between the data set and the goals that we are trying to serve when working with important human goals.
- Phil Nolan, an IBMer specialized in decision making, will discuss IBM's Watson system and its use for decision making.
- Phil's perspective is complementary to Irving's, with a focus on the demand function rather than the supply function.
- Phil has a sense of humor and will make the discussion more enjoyable.

**7.5.4 **

*Design and apply research questions.

Justify an analytic approach that informs decision making.

Navigate organizational, personal, legal, and ethical constraints to facilitate better decision making and improve communication.*

- Phil Nolan has been working on the intersection of technology and analytics for over 20 years, with a focus on solving difficult problems.
- He is currently working with IBM on big data applications such as Watson.
- Structured data (e.g. rows and columns like an Excel spreadsheet) and unstructured data (e.g. text, video, or audio where information comes from the context around it) are important distinctions to make in big data.
- Watson was originally created to win at Jeopardy and demonstrate IBM's technical expertise around natural language processing and machine learning.
- Watson's ability to take natural language questions as input and return answers and evidence with confidence from a large corpus of unstructured text is a remarkable achievement.
- Watson is being used in healthcare for treatment decisions, financial research for investing, and in the federal government for analytics with unstructured text.
- Defining the problem tightly enough can make it more tractable and easier to apply technology solutions.
- In large data sets, patterns can be identified with powerful tools and data scientists, but when dealing with a data set collected for a purpose other than the question being asked, brilliant insights are necessary before the right tools can be applied.
- The mixture of inductive and deductive methods is likely in the work Phil is doing with big data.

**Analysis**

**7.6.1 Design Research to Generate Applied Insight**

*Design and apply research questions.

Justify an analytic approach that informs decision making.*

- Design research must have a clear goal and objective to ensure meaningful insight is generated.
- The goal of the research will influence the design of the study and the approach taken, i.e., descriptive, predictive, or explanatory.
- The need for a recommendation will impact the design of the project, i.e., exploring various decisions versus simply predicting an outcome.
- Consider the level of confidence the audience desires and ensure the current design can provide that level of confidence. If not, communicate what is needed.
- Statistical significance is important, but the effect size should also be considered to avoid comments of unclear significance.
- The primary question to consider is what the goal of the project is and how to design a study to achieve that goal.

**7.6.2 Keep the Audience in Mind**

*Design and apply research questions.

Justify an analytic approach that informs decision making.

Identify the audience and the most effective method to communicate a persuasive argument.*

- Analytical strategy should consider the audience's priors and incentive structure
- The narrative and vocabulary should be tailored to persuade the audience
- The communication strategy should influence the analysis chosen
- Consider the audience's level of technical expertise
- Design a project that the audience is comfortable with or provide sufficient information for unfamiliar frameworks
- Approach will differ based on whether the audience wants to be informed or helped to make a decision

**Deliverables**

**7.7.1 Communicate Insight, Hypotheses Testing, and the Word “Prove”**

*Justify an analytic approach that informs decision making.

Identify the audience and the most effective method to communicate a persuasive argument.*

- The word "prove" should be used cautiously in research and hypothesis testing because it implies a sense of certainty and conclusiveness that scientists may not be comfortable with. 
- Rejecting the null hypothesis does not necessarily prove the alternative hypothesis, as the same body of evidence can be consistent with other hypotheses. 
- Alternative explanations should be considered to avoid ignoring other potential factors that may affect the results. 
- Business contexts may prefer terms like "suggest," "find," or "support" instead of "prove" because they focus on providing recommendations rather than definitive conclusions. 
- Using the word "prove" in meetings can create tension and spark unproductive conversations. Choosing language that demonstrates some level of certainty but without an extreme sense of right and wrong may be more effective. 
- When communicating insights, consider the audience and determine the most effective method to communicate a persuasive argument. 
- Justifying an analytic approach that informs decision-making requires careful consideration of evidence and alternative explanations.

**7.7.2 Different Deliverables**

*Justify an analytic approach that informs decision making.

Identify the audience and the most effective method to communicate a persuasive argument.*

- Final deliverable will depend on the audience and objective of research
- Possible deliverables:
  - Slide deck and presentation
  - Paper or report
  - Proof of concept or minimum viable product
  - Code-based deliverables
  - New model
- Important to future-proof work, consider future self or how someone else may interpret code
- Be sure to annotate to aid future understanding

**7.7.3 Different Audiences**

*Justify an analytic approach that informs decision making.

Identify the audience and the most effective method to communicate a persuasive argument.*

- When communicating, it's important to remember who your audience is
- The intended audience should be a guiding heuristic for communication
- The audience will impact the way you communicate in terms of vocabulary and emphasis on project aspects
- The level of technical expertise will also impact communication style
- Incentive structure is another factor to consider - the C-suite may care more about high-level recommendations and narrative compared to a technical team who may be convinced by nuances
- Justifying an analytic approach should be tailored to the audience and what will be persuasive to them
- Identifying the most effective method to communicate a persuasive argument is crucial for decision making.

**Creswell and Creswell Textbook**

**7.8.1 Creswell and Creswell, Chapter 10: Mixed Methods Procedures**

*Design and apply research questions.

Assess and select data and the data collection methods that best fit a specific outcome or need.

Justify an analytic approach that informs decision-making.*

Mixed Methods Procedures - Creswell and Creswell, Chapter 10:
- Mixed methods is a combination of quantitative and qualitative research methods to address research problem
- It is important to ensure that the purpose of the study and the purpose of the mixed method approach is made clear
- There are three high-level approaches to mixed method research:
  - Quantitative and qualitative are done simultaneously to see if the results converge
  - Qualitative work informs quantitative work by identifying the relationships to be studied
  - Quantitative study is done first to identify interesting patterns, followed by qualitative approach to explain the quantitative results
- Worldview, or our view about science and how the world works, can influence problem definition, the questions we ask, and which findings we emphasize
- Acknowledging and being mindful of our worldview throughout the project can help us avoid biases and be more open-minded
- Awareness of our biases can help us slow down our thinking and consider alternative perspectives that cannot be easily measured.

**7.9.1 The Data Detective by Tim Harford**

*Design and apply research questions.

Assess and select data and the data collection methods that best fit a specific outcome or need.

Justify an analytic approach that informs decision-making.*

- Tim Harford's book "The Data Detective" emphasizes the importance of approaching statistics with a healthy level of skepticism and curiosity.
- The introduction shows a contrast with "How to Lie with Statistics" but doesn't promote fear of statistics.
- Rule 6: Ask Who is Missing encourages thinking about who is missing from data sets and studies, including gender and racial minorities.
- It's important to consider both sample error and sample bias in analyzing data.
- The Golden Rule: Be Curious emphasizes the value of curiosity, embracing failure, and storytelling in the research and decision-making process.
- "Tell me more" summarizes the book's invitation to learn more and embrace a healthy skepticism and curiosity.