In [1]:
from icego import *
import os
import boto3
import pandas as pd
import openai
from dotenv import load_dotenv

Neither PyTorch nor TensorFlow >= 2.0 have been found.Models won't be available and only tokenizers, configurationand file/data utilities can be used.


In [2]:
# Read in and clean the data.
# The path is assembled with os.path.join so it resolves on every OS; a literal
# backslash is only a directory separator on Windows.
df = pd.read_csv(os.path.join("research_data", "data.tsv"), sep="\t").reset_index(drop=True)

# Keep only the rows flagged as questions and drop the columns not needed below.
# assumes `Question` is a boolean column (it is used directly as a row mask) -- TODO confirm
questions = df[df.Question].reset_index(drop=True).drop(columns=["User", "Time", "Question"])

# Split by rating: more positive than negative votes -> data-science question,
# more negative than positive -> other. Rows whose two ratings tie match neither
# filter and are left out of both groups.
ds_qs = questions[questions.Pos_Rating > questions.Neg_Rating].reset_index(drop=True).Message
other_qs = questions[questions.Pos_Rating < questions.Neg_Rating].reset_index(drop=True).Message

# NOTE(review): `ans` is built with exactly the same filter as `questions`, so it
# holds the question rows again. If it was meant to hold the non-question rows,
# the mask should presumably be `~df.Question` -- confirm before relying on it.
ans = df[df.Question].reset_index(drop=True).drop(columns=["User", "Time", "Question"])

In [3]:
# Hand-written questions that supplement the small real dataset.
# Topics the data-science examples are meant to touch:
#   - Programming languages: Python, R, Julia, Matlab
#   - Data visualization: Plotly, matplotlib, Tableau, PowerBI
#   - Machine learning: scikit-learn, Tensorflow, PyTorch, neural networks,
#     regression, classification
#   - Data engineering: cloud computing (AWS, Microsoft Azure), Spark, Hadoop,
#     SQL, NoSQL, databases, pipelines, APIs, frontend, backend
#   - Statistics: t-test, p-value, AB test, distributions

# Artificial questions that belong to the "data science" class.
_ds_artificial_text = [
    "Which programming language do you prefer: Python, R, Julia, or Matlab?",
    "Should I use Plotly or matplotlib for visualization?",
    "What is the best way to learn Tableau and PowerBI?",
    "I need to train a regression and a classification model. Can this be done in sci-kit learn?",
    "Neural networks can be programmed in both Tensorflow and PyTorch, true or false?",
    "How do I create an SQL database in Azure?",
    "Is clustering an example of supervised or unsupervised learning?",
    "Is it possible to set up an API in AWS?",
    "Could you explain the difference between Spark and Hadoop?",
    "If I perform a chi-squared test and obtain a p-value of 0.02, is that considered statistically significant?",
    "Has anyone ever trained a transformer model for NLP?",
]
ds_qs_artificial = pd.Series(_ds_artificial_text)

# Artificial questions for the "other" class. Several deliberately reuse
# data-science vocabulary (spark, cluster, R, transformer, statistics, SQL) in a
# different sense, and a few are noise or jokes.
_other_artificial_text = [
    "What?",
    "Who went to the club meeting today?",
    "Which programming language is preferable for embedded systems, C or C++?",
    "Is Blender used for creating 3D graphics?",
    "When he attached the wires, they made a spark. Does he need more training?",
    "How many neurons are contained within the human nervous system?",
    "The base of the model will be made in CAD - can you create this on your computer?",
    "Where was the largest cluster of cases found this month?",
    "Data Science?",
    "ajskdl qwerjksd weknwf we wejirknwdfw?",
    "R u ready to go?",
    "Has anyone seen the original Transformers movie?",
    "According to statistics is Michael Jordan the best basketball player ever?",
    "The wise old oak tree helped you make a decision?",
    "Did you know that just under half of all data science puns are below average?",
    "A SQL query walks into a bar, goes up to two tables, and asks \"Can I join you?\"",
]
other_qs_artificial = pd.Series(_other_artificial_text)

# Hand-labelled hold-out examples, written as (question, label) pairs so each
# label sits next to the text it describes. X_val and y_val are the two columns
# of this table, in the same order.
_val_pairs = [
    ("Is the school open today?", "Other"),
    ("When is the meeting time this semester?", "Other"),
    ("What is the quantum Fourier transform", "Data"),
    ("I tried to push to GitHub but I got a merge conflict error. How do I fix this problem?", "Data"),
    ("Are you coming to Big Data Club tomorrow?", "Other"),
    ("Will members get those stickers for free?", "Other"),
    ("What data science books would recommend for understanding the basic principles?", "Data"),
    ("Where can I access datasets like MNIST for image analysis?", "Data"),
    ("Can someone help me understand how to SSH into the computing cluster on Friday?", "Data"),
    ("Where can I find the link?", "Other"),
    ("Which language should I use for matrix methods for data analysis", "Data"),
    ("Does anyone have any classes they would recommend for me to take next semester", "Other"),
    ("What is a Poisson distribution?", "Data"),
    ("Here's my idea: we combine lookup table and machine learning model. Thoughts?", "Data"),
    ("I've taken the class before, I really enjoyed it actually. Do you want me to send you the syllabus?", "Other"),
    ("Could everyone send times when you are available before the start of the semester?", "Other"),
    ("Here's a link on how to make custom Jupyter notebook themes. Big Data Club-themed notebooks, anyone?", "Data"),
    ("What's the best way to get user input in Python?", "Data"),
    ("Can you volunteer at the accepted students day @[redacted]?", "Other"),
    ("Which logo won the design contest?", "Other"),
]
X_val = [question for question, label in _val_pairs]
y_val = [label for question, label in _val_pairs]

In [4]:
# Divide each variable into training and testing individually.
# Every random draw below is seeded, so re-running the notebook on the same
# input data reproduces the same split.
SPLIT_SEED = 42


def _split_train_test(pool, n_train, seed):
    """Randomly pick `n_train` entries of the Series `pool` for training.

    Returns a `(train, test)` pair. `test` is `pool` with the index labels of
    `train` dropped, so the two parts do not overlap (index labels are assumed
    to be unique).
    """
    train = pool.sample(n=n_train, random_state=seed)
    return train, pool.drop(train.index)


def _label_and_shuffle(data_parts, other_parts, seed):
    """Combine question Series into one shuffled `(questions, labels)` pair.

    `data_parts` are labelled "Data" and `other_parts` are labelled "Other".
    Both returned Series carry the same shuffled index labels, so position i of
    one always corresponds to position i of the other.
    """
    questions_all = pd.concat([*data_parts, *other_parts], ignore_index=True)
    n_data = sum(part.size for part in data_parts)
    n_other = sum(part.size for part in other_parts)
    labels_all = pd.Series(["Data"] * n_data + ["Other"] * n_other)
    # One seeded permutation of the index, applied by label to both Series.
    order = questions_all.sample(frac=1, random_state=seed).index
    return questions_all.loc[order], labels_all.loc[order]


# A distinct seed per pool keeps the four draws independent of each other.
ds_qs_artificial_train, ds_qs_artificial_test = _split_train_test(ds_qs_artificial, 6, SPLIT_SEED)
other_qs_artificial_train, other_qs_artificial_test = _split_train_test(other_qs_artificial, 6, SPLIT_SEED + 1)
ds_qs_train, ds_qs_test = _split_train_test(ds_qs, 7, SPLIT_SEED + 2)
other_qs_train, other_qs_test = _split_train_test(other_qs, 7, SPLIT_SEED + 3)

# Set up training and testing sets, then shuffle each so the classes are interleaved.
X_train, y_train = _label_and_shuffle(
    [ds_qs_train, ds_qs_artificial_train],
    [other_qs_train, other_qs_artificial_train],
    SPLIT_SEED + 4,
)
X_test, y_test = _label_and_shuffle(
    [ds_qs_test, ds_qs_artificial_test],
    [other_qs_test, other_qs_artificial_test],
    SPLIT_SEED + 5,
)

# Sanity checks: questions and labels must stay aligned one-to-one.
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)

In [5]:
# Report how many questions each source contributes, followed by the grand total.
a, b, c, d = (len(group) for group in (ds_qs, other_qs, ds_qs_artificial, other_qs_artificial))
e = a + b + c + d

# One (template, count) row per printed line, in display order.
_count_report = (
    ("%d DS questions", a),
    ("%d non-DS questions", b),
    ("%d artificial DS questions", c),
    ("%d artificial non-DS questions", d),
    ("%d total questions", e),
)
for _template, _count in _count_report:
    print(_template % _count)

15 DS questions
10 non-DS questions
11 artificial DS questions
16 artificial non-DS questions
52 total questions


In [7]:
# Save the X_test / y_test portion of the split as the validation file.
# reset_index(drop=True) discards the shuffled index labels so both columns are
# aligned by position; to_csv then writes that fresh 0..n-1 index as the first
# column (pandas' default, index=True).
validation = pd.DataFrame({"question":X_test.reset_index(drop=True), "label":y_test.reset_index(drop=True)})
# os.path.join places the file inside research_data/ on every OS; a literal
# backslash in the path is only a directory separator on Windows.
validation.to_csv(os.path.join("research_data", "validation(2).tsv"), sep="\t")

In [8]:
# Save the hand-labelled X_val / y_val examples as the testing file.
# to_csv writes the default 0..n-1 index as the first column (index=True).
testing = pd.DataFrame({"question":X_val, "label":y_val})
# os.path.join places the file inside research_data/ on every OS; a literal
# backslash in the path is only a directory separator on Windows.
testing.to_csv(os.path.join("research_data", "testing(3).tsv"), sep="\t")

In [9]:
# Save the X_train / y_train portion of the split as the training file.
# reset_index(drop=True) discards the shuffled index labels so both columns are
# aligned by position; to_csv then writes that fresh 0..n-1 index as the first
# column (pandas' default, index=True).
training = pd.DataFrame({"question":X_train.reset_index(drop=True), "label":y_train.reset_index(drop=True)})
# os.path.join places the file inside research_data/ on every OS; a literal
# backslash in the path is only a directory separator on Windows.
training.to_csv(os.path.join("research_data", "training(1).tsv"), sep="\t")