In [1]:
#pip install google-cloud-bigquery

In [2]:

from google.cloud import bigquery
import pandas as pd
import numpy as np

In [3]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="client_secret.json"

In [4]:
client = bigquery.Client()

In [5]:
hn_dataset_ref = client.dataset('stackoverflow', project='bigquery-public-data')
hn_dataset_ref

DatasetReference('bigquery-public-data', 'stackoverflow')

In [6]:
type(hn_dataset_ref)

google.cloud.bigquery.dataset.DatasetReference

In [7]:
hn_dset = client.get_dataset(hn_dataset_ref)
hn_dset

Dataset(DatasetReference('bigquery-public-data', 'stackoverflow'))

In [8]:
type(hn_dset)

google.cloud.bigquery.dataset.Dataset

In [9]:
[x.table_id for x in client.list_tables(hn_dset)]

['badges',
 'comments',
 'post_history',
 'post_links',
 'posts_answers',
 'posts_moderator_nomination',
 'posts_orphaned_tag_wiki',
 'posts_privilege_wiki',
 'posts_questions',
 'posts_tag_wiki',
 'posts_tag_wiki_excerpt',
 'posts_wiki_placeholder',
 'stackoverflow_posts',
 'tags',
 'users',
 'votes']

In [10]:
hn_full = client.get_table(table='bigquery-public-data.stackoverflow.posts_questions')
hn_full

Table(TableReference(DatasetReference('bigquery-public-data', 'stackoverflow'), 'posts_questions'))

In [11]:
type(hn_full)

google.cloud.bigquery.table.Table

In [12]:
query_job = client.query(
        """
        SELECT
        *
        FROM `bigquery-public-data.stackoverflow.posts_questions`
        WHERE accepted_answer_id is not null AND creation_date > '2021-01-01'
        ORDER BY accepted_answer_id
        LIMIT 10"""
    )

results = query_job.result()  # Waits for job to complete.
    # [END bigquery_simple_app_query]

    # [START bigquery_simple_app_print]
# for row in results:
#     print(row)
results_list = [row for row in results]

In [13]:
cols = [field.name for field in results.schema]

In [14]:
post_questions = pd.DataFrame.from_records(results_list, columns=cols)
post_questions

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,65526420,How to check if values in individiual rows of ...,<p>Suppose I have the following data.table:</p...,65526457,2,2,,2021-01-01 00:05:46.310000+00:00,2.0,2021-01-01 02:15:42.223000+00:00,NaT,,,,6335637,,1,2,r|data.table|rowwise,62
1,65526423,d3.js : Generating axis ticks for ordinal values,<p>I want to use ordinal scale in x-axis with ...,65526533,1,0,,2021-01-01 00:06:09.007000+00:00,,2021-01-01 16:25:58.447000+00:00,NaT,,,,9574155,,1,2,javascript|d3.js,38
2,65526490,Is there a C macro that replaces varied length...,<p>I want to be able to:</p>\n<pre><code>#defi...,65526541,1,1,,2021-01-01 00:20:09.553000+00:00,0.0,2021-01-01 00:37:24.277000+00:00,NaT,,,,12743240,,1,2,c|macros,35
3,65526419,How can I construct my objects allocated throu...,<p>C++20 removed the <code>construct()</code> ...,65526554,2,0,,2021-01-01 00:05:43.627000+00:00,,2021-01-01 18:19:46.870000+00:00,2021-01-01 05:23:28.560000+00:00,,65863.0,,14694500,,1,3,c++|std|c++20|allocator,178
4,65526523,Navigate from parent to child in react where e...,<p>In a React app with a parent and child elem...,65526577,1,1,,2021-01-01 00:30:31.933000+00:00,,2021-01-01 00:46:22.813000+00:00,2021-01-01 00:35:53.623000+00:00,,8690857.0,,3002584,,1,1,reactjs|react-router|react-router-dom,33
5,65526451,How do I define a field in a subclass by stric...,<p>I'm trying to implement the UML diagram bel...,65526589,1,0,,2021-01-01 00:11:17.787000+00:00,,2021-01-01 10:22:51.790000+00:00,2021-01-01 10:22:51.790000+00:00,,2458991.0,,14634129,,1,1,java|inheritance|uml,50
6,65526591,How to install Python 3.5.x on Ubuntu 18.04 LT...,<p>Simple question: How to install Python 3.5....,65526611,2,1,,2021-01-01 00:49:45.320000+00:00,,2021-01-01 09:48:41.703000+00:00,NaT,,,,6305105,,1,0,python|ubuntu,73
7,65526474,GDB: Displaying incorrect values in struct,<p>I'm trying to implement the <code>malloc</c...,65526637,1,0,,2021-01-01 00:16:10.673000+00:00,,2021-01-01 01:02:22.447000+00:00,2021-01-01 00:39:32.250000+00:00,,1233251.0,,11035194,,1,1,struct|gdb|memory-address,32
8,65526590,Vue2 + laravel6 - Component implementation,<p>I just started using Vue2 with Laravel6 and...,65526639,1,0,,2021-01-01 00:49:32.030000+00:00,,2021-01-01 01:09:45.250000+00:00,2021-01-01 01:07:22.133000+00:00,,11960598.0,,14605909,,1,0,laravel|vue.js|vuejs2|laravel-6,37
9,65526522,Mobile menu css,<p>What's the best way to\nachieve going from ...,65526667,2,0,,2021-01-01 00:30:28.633000+00:00,,2021-01-01 01:46:35.280000+00:00,NaT,,,,6787542,,1,0,css,38


In [15]:
# Pull only relevant columns to ML model
# Sample data limited to June 2021, will scale up for next steps
query_job = client.query(
        """
        SELECT
        id, accepted_answer_id, creation_date, tags
        FROM `bigquery-public-data.stackoverflow.posts_questions`
        WHERE accepted_answer_id is not null AND creation_date > '2021-05-01'
        ORDER BY accepted_answer_id"""
    )

results = query_job.result()  # Waits for job to complete.
    # [END bigquery_simple_app_query]

    # [START bigquery_simple_app_print]
# for row in results:
#     print(row)
results_list = [row for row in results]

In [16]:
cols = ['id', 'accepted_answer_id', 'question_creation_date', 'tags']

In [17]:
post_questions = pd.DataFrame.from_records(results_list, columns=cols)
post_questions

Unnamed: 0,id,accepted_answer_id,question_creation_date,tags
0,67341742,67341801,2021-05-01 00:03:03.923000+00:00,javascript|html|node.js|obs
1,67341817,67341857,2021-05-01 00:17:39.380000+00:00,z3|smt|formal-methods|satisfiability|cvc4
2,67341895,67341911,2021-05-01 00:32:18.187000+00:00,python|numpy
3,67341936,67341961,2021-05-01 00:41:17.673000+00:00,python
4,67341921,67341974,2021-05-01 00:38:52.743000+00:00,laravel-8|vuejs3
...,...,...,...,...
47052,67531286,67757704,2021-05-14 08:28:20.750000+00:00,python|random
47053,67757579,67757716,2021-05-30 04:02:47.097000+00:00,postgresql|haskell
47054,67757723,67757776,2021-05-30 04:38:50.357000+00:00,html|google-sheets
47055,67757792,67757830,2021-05-30 04:50:54.750000+00:00,javascript|regex


In [18]:
query_job = client.query(
        """
        SELECT
        *
        FROM `bigquery-public-data.stackoverflow.posts_answers`
        WHERE creation_date > '2021-01-01'
        ORDER BY id
        LIMIT 10"""
    )

results = query_job.result()  # Waits for job to complete.
    # [END bigquery_simple_app_query]

    # [START bigquery_simple_app_print]
# for row in results:
#     print(row)
results_list = [row for row in results]

In [19]:
cols = [field.name for field in results.schema]

In [20]:
post_answers = pd.DataFrame.from_records(results_list, columns=cols)
post_answers

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,65526388,,<p>Here is one solution. You can first do a gr...,,,0,,2021-01-01 00:00:01.653000+00:00,,2021-01-01 00:00:01.653000+00:00,NaT,,,,4520420,65525964,2,2,,
1,65526389,,<p>Echoing from the individual threads can be ...,,,0,,2021-01-01 00:00:11.187000+00:00,,2021-01-01 00:00:11.187000+00:00,NaT,,,,342497,54217345,2,0,,
2,65526391,,"<p>I have found similar code here (<a href=""ht...",,,5,,2021-01-01 00:00:41.780000+00:00,,2021-01-01 00:00:41.780000+00:00,NaT,,,,13436156,65526354,2,0,,
3,65526393,,"<p>Encountered the same error, just wanna add ...",,,0,,2021-01-01 00:01:25.723000+00:00,,2021-01-01 00:01:25.723000+00:00,NaT,,,,12552972,48184969,2,0,,
4,65526394,,<p>It's difficult to have all three types of v...,,,6,,2021-01-01 00:01:33.950000+00:00,,2021-01-01 00:01:33.950000+00:00,NaT,,,,12602208,65525938,2,0,,
5,65526395,,<p>to be able to change other user passwords y...,,,0,,2021-01-01 00:01:32.977000+00:00,,2021-01-01 00:01:32.977000+00:00,NaT,,,,5562372,65523941,2,0,,
6,65526398,,<blockquote>\n<p>Is it safe (and possible?) to...,,,2,,2021-01-01 00:01:52.123000+00:00,,2021-01-01 00:09:28.470000+00:00,2021-01-01 00:09:28.470000+00:00,,14660.0,,14660,65526234,2,1,,
7,65526403,,<pre><code>def shuffle(deck):\n split_deck ...,,,0,,2021-01-01 00:02:34.570000+00:00,,2021-01-01 00:02:34.570000+00:00,NaT,,,,4718350,65526315,2,1,,
8,65526404,,"<p>FYI, here's a simple app that will do what ...",,,1,,2021-01-01 00:02:40.090000+00:00,,2021-01-01 00:09:09.673000+00:00,2021-01-01 00:09:09.673000+00:00,,816620.0,,816620,65514069,2,0,,
9,65526405,,<p>First you'll want to check if there is no r...,,,0,,2021-01-01 00:02:40.743000+00:00,,2021-01-01 00:22:23.763000+00:00,2021-01-01 00:22:23.763000+00:00,,14560865.0,,14560865,65526331,2,2,,


In [21]:
# Pull only relevant columns to ML model
query_job = client.query(
        """
        SELECT
        id, creation_date
        FROM `bigquery-public-data.stackoverflow.posts_answers`
        WHERE creation_date > '2021-05-01'
        ORDER BY id"""
    )

results = query_job.result()  # Waits for job to complete.
    # [END bigquery_simple_app_query]

    # [START bigquery_simple_app_print]
# for row in results:
#     print(row)
results_list = [row for row in results]

In [22]:
cols = ['answer_id', 'answer_creation_date']

In [23]:
post_answers = pd.DataFrame.from_records(results_list, columns=cols)
post_answers

Unnamed: 0,answer_id,answer_creation_date
0,67341731,2021-05-01 00:00:43.057000+00:00
1,67341735,2021-05-01 00:02:14.853000+00:00
2,67341736,2021-05-01 00:02:18.417000+00:00
3,67341740,2021-05-01 00:02:59.713000+00:00
4,67341741,2021-05-01 00:03:03.637000+00:00
...,...,...
169650,67757977,2021-05-30 05:25:20.073000+00:00
169651,67757978,2021-05-30 05:25:23.510000+00:00
169652,67757979,2021-05-30 05:25:34.620000+00:00
169653,67757981,2021-05-30 05:25:46.913000+00:00


In [24]:
post_questions = post_questions.rename(columns={"id":"parent_id"})
post_questions

Unnamed: 0,parent_id,accepted_answer_id,question_creation_date,tags
0,67341742,67341801,2021-05-01 00:03:03.923000+00:00,javascript|html|node.js|obs
1,67341817,67341857,2021-05-01 00:17:39.380000+00:00,z3|smt|formal-methods|satisfiability|cvc4
2,67341895,67341911,2021-05-01 00:32:18.187000+00:00,python|numpy
3,67341936,67341961,2021-05-01 00:41:17.673000+00:00,python
4,67341921,67341974,2021-05-01 00:38:52.743000+00:00,laravel-8|vuejs3
...,...,...,...,...
47052,67531286,67757704,2021-05-14 08:28:20.750000+00:00,python|random
47053,67757579,67757716,2021-05-30 04:02:47.097000+00:00,postgresql|haskell
47054,67757723,67757776,2021-05-30 04:38:50.357000+00:00,html|google-sheets
47055,67757792,67757830,2021-05-30 04:50:54.750000+00:00,javascript|regex


In [25]:
merged_df = pd.merge(post_questions, post_answers, left_on=  ['accepted_answer_id'],
                   right_on= ['answer_id'], 
                   how = 'left')
merged_df

Unnamed: 0,parent_id,accepted_answer_id,question_creation_date,tags,answer_id,answer_creation_date
0,67341742,67341801,2021-05-01 00:03:03.923000+00:00,javascript|html|node.js|obs,67341801,2021-05-01 00:14:38.340000+00:00
1,67341817,67341857,2021-05-01 00:17:39.380000+00:00,z3|smt|formal-methods|satisfiability|cvc4,67341857,2021-05-01 00:26:39.053000+00:00
2,67341895,67341911,2021-05-01 00:32:18.187000+00:00,python|numpy,67341911,2021-05-01 00:36:54.910000+00:00
3,67341936,67341961,2021-05-01 00:41:17.673000+00:00,python,67341961,2021-05-01 00:46:37.893000+00:00
4,67341921,67341974,2021-05-01 00:38:52.743000+00:00,laravel-8|vuejs3,67341974,2021-05-01 00:48:52.390000+00:00
...,...,...,...,...,...,...
47052,67531286,67757704,2021-05-14 08:28:20.750000+00:00,python|random,67757704,2021-05-30 04:35:06.557000+00:00
47053,67757579,67757716,2021-05-30 04:02:47.097000+00:00,postgresql|haskell,67757716,2021-05-30 04:37:58.840000+00:00
47054,67757723,67757776,2021-05-30 04:38:50.357000+00:00,html|google-sheets,67757776,2021-05-30 04:48:57.807000+00:00
47055,67757792,67757830,2021-05-30 04:50:54.750000+00:00,javascript|regex,67757830,2021-05-30 04:57:39.123000+00:00


In [26]:
# Compare accepted_answer_id and answer_id to verify identical
merged_df['accepted_answer_id'].equals(merged_df['answer_id'])

True

In [27]:
# Drop "answer_id" column because redundant
merged_df = merged_df.drop(columns=['answer_id'])
merged_df

Unnamed: 0,parent_id,accepted_answer_id,question_creation_date,tags,answer_creation_date
0,67341742,67341801,2021-05-01 00:03:03.923000+00:00,javascript|html|node.js|obs,2021-05-01 00:14:38.340000+00:00
1,67341817,67341857,2021-05-01 00:17:39.380000+00:00,z3|smt|formal-methods|satisfiability|cvc4,2021-05-01 00:26:39.053000+00:00
2,67341895,67341911,2021-05-01 00:32:18.187000+00:00,python|numpy,2021-05-01 00:36:54.910000+00:00
3,67341936,67341961,2021-05-01 00:41:17.673000+00:00,python,2021-05-01 00:46:37.893000+00:00
4,67341921,67341974,2021-05-01 00:38:52.743000+00:00,laravel-8|vuejs3,2021-05-01 00:48:52.390000+00:00
...,...,...,...,...,...
47052,67531286,67757704,2021-05-14 08:28:20.750000+00:00,python|random,2021-05-30 04:35:06.557000+00:00
47053,67757579,67757716,2021-05-30 04:02:47.097000+00:00,postgresql|haskell,2021-05-30 04:37:58.840000+00:00
47054,67757723,67757776,2021-05-30 04:38:50.357000+00:00,html|google-sheets,2021-05-30 04:48:57.807000+00:00
47055,67757792,67757830,2021-05-30 04:50:54.750000+00:00,javascript|regex,2021-05-30 04:57:39.123000+00:00


In [28]:
# Add column for day of question_creation_date [question_day]
merged_df['question_day'] = merged_df['question_creation_date'].dt.day_name()
merged_df

Unnamed: 0,parent_id,accepted_answer_id,question_creation_date,tags,answer_creation_date,question_day
0,67341742,67341801,2021-05-01 00:03:03.923000+00:00,javascript|html|node.js|obs,2021-05-01 00:14:38.340000+00:00,Saturday
1,67341817,67341857,2021-05-01 00:17:39.380000+00:00,z3|smt|formal-methods|satisfiability|cvc4,2021-05-01 00:26:39.053000+00:00,Saturday
2,67341895,67341911,2021-05-01 00:32:18.187000+00:00,python|numpy,2021-05-01 00:36:54.910000+00:00,Saturday
3,67341936,67341961,2021-05-01 00:41:17.673000+00:00,python,2021-05-01 00:46:37.893000+00:00,Saturday
4,67341921,67341974,2021-05-01 00:38:52.743000+00:00,laravel-8|vuejs3,2021-05-01 00:48:52.390000+00:00,Saturday
...,...,...,...,...,...,...
47052,67531286,67757704,2021-05-14 08:28:20.750000+00:00,python|random,2021-05-30 04:35:06.557000+00:00,Friday
47053,67757579,67757716,2021-05-30 04:02:47.097000+00:00,postgresql|haskell,2021-05-30 04:37:58.840000+00:00,Sunday
47054,67757723,67757776,2021-05-30 04:38:50.357000+00:00,html|google-sheets,2021-05-30 04:48:57.807000+00:00,Sunday
47055,67757792,67757830,2021-05-30 04:50:54.750000+00:00,javascript|regex,2021-05-30 04:57:39.123000+00:00,Sunday


In [29]:
# Add column for hour value of question_creation_date [question_time]
merged_df['question_hour'] = merged_df['question_creation_date'].dt.hour
merged_df['question_hour_min'] = merged_df['question_creation_date'].dt.strftime('%H:%M')

merged_df

Unnamed: 0,parent_id,accepted_answer_id,question_creation_date,tags,answer_creation_date,question_day,question_hour,question_hour_min
0,67341742,67341801,2021-05-01 00:03:03.923000+00:00,javascript|html|node.js|obs,2021-05-01 00:14:38.340000+00:00,Saturday,0,00:03
1,67341817,67341857,2021-05-01 00:17:39.380000+00:00,z3|smt|formal-methods|satisfiability|cvc4,2021-05-01 00:26:39.053000+00:00,Saturday,0,00:17
2,67341895,67341911,2021-05-01 00:32:18.187000+00:00,python|numpy,2021-05-01 00:36:54.910000+00:00,Saturday,0,00:32
3,67341936,67341961,2021-05-01 00:41:17.673000+00:00,python,2021-05-01 00:46:37.893000+00:00,Saturday,0,00:41
4,67341921,67341974,2021-05-01 00:38:52.743000+00:00,laravel-8|vuejs3,2021-05-01 00:48:52.390000+00:00,Saturday,0,00:38
...,...,...,...,...,...,...,...,...
47052,67531286,67757704,2021-05-14 08:28:20.750000+00:00,python|random,2021-05-30 04:35:06.557000+00:00,Friday,8,08:28
47053,67757579,67757716,2021-05-30 04:02:47.097000+00:00,postgresql|haskell,2021-05-30 04:37:58.840000+00:00,Sunday,4,04:02
47054,67757723,67757776,2021-05-30 04:38:50.357000+00:00,html|google-sheets,2021-05-30 04:48:57.807000+00:00,Sunday,4,04:38
47055,67757792,67757830,2021-05-30 04:50:54.750000+00:00,javascript|regex,2021-05-30 04:57:39.123000+00:00,Sunday,4,04:50


In [30]:
# Add column for duration till accepted answer (answer_creation_date - question_creation_date)
merged_df['accepted_answer_duration'] = (merged_df['answer_creation_date']-merged_df['question_creation_date'])/np.timedelta64(1,'h')
merged_df = merged_df.sort_values(by='accepted_answer_duration', ascending=False)
merged_df

Unnamed: 0,parent_id,accepted_answer_id,question_creation_date,tags,answer_creation_date,question_day,question_hour,question_hour_min,accepted_answer_duration
46975,67342099,67756344,2021-05-01 01:19:14.210000+00:00,ios|swift|uibezierpath|rounded-corners|spacing,2021-05-29 22:57:23.117000+00:00,Saturday,1,01:19,693.635808
45777,67349217,67741551,2021-05-01 17:43:24.997000+00:00,javascript|css,2021-05-28 15:18:48.583000+00:00,Saturday,17,17:43,645.589885
46098,67352708,67744939,2021-05-02 03:20:42.240000+00:00,python|django,2021-05-28 20:10:38.163000+00:00,Sunday,3,03:20,640.832201
41532,67349112,67699250,2021-05-01 17:34:03.697000+00:00,django|class|django-rest-framework,2021-05-26 05:57:48.013000+00:00,Saturday,17,17:34,588.395643
46603,67399034,67750961,2021-05-05 09:56:23.113000+00:00,java|drag-and-drop|awt|netbeans-platform,2021-05-29 11:44:36.820000+00:00,Wednesday,9,09:56,577.803807
...,...,...,...,...,...,...,...,...,...
25782,67563037,67563038,2021-05-17 01:35:04.927000+00:00,android|android-studio|android-layout|keyboard...,2021-05-17 01:35:04.927000+00:00,Monday,1,01:35,0.000000
16408,67485503,67485504,2021-05-11 11:05:40.837000+00:00,kubernetes|apache-kafka|apache-zookeeper|yahoo...,2021-05-11 11:05:40.837000+00:00,Tuesday,11,11:05,0.000000
37859,67666770,67666771,2021-05-24 04:42:53.953000+00:00,mpdf,2021-05-24 04:42:53.953000+00:00,Monday,4,04:42,0.000000
25652,67561998,67561999,2021-05-16 22:14:34.137000+00:00,c#|formatting|t4,2021-05-16 22:14:34.137000+00:00,Sunday,22,22:14,0.000000


In [31]:
# Additional method to verify number of rows with zero response time (what's the possible explanation?)
zero_time = merged_df.groupby('accepted_answer_duration').count()
zero_time 

Unnamed: 0_level_0,parent_id,accepted_answer_id,question_creation_date,tags,answer_creation_date,question_day,question_hour,question_hour_min
accepted_answer_duration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.000000,172,172,172,172,172,172,172,172
0.004931,1,1,1,1,1,1,1,1
0.005020,1,1,1,1,1,1,1,1
0.006600,1,1,1,1,1,1,1,1
0.007744,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...
577.803807,1,1,1,1,1,1,1,1
588.395643,1,1,1,1,1,1,1,1
640.832201,1,1,1,1,1,1,1,1
645.589885,1,1,1,1,1,1,1,1


In [32]:
# drop data rows where duration is 0
indexNames = merged_df[ merged_df['accepted_answer_duration'] == 0.000000 ].index
merged_df.drop(indexNames, inplace=True)
merged_df

Unnamed: 0,parent_id,accepted_answer_id,question_creation_date,tags,answer_creation_date,question_day,question_hour,question_hour_min,accepted_answer_duration
46975,67342099,67756344,2021-05-01 01:19:14.210000+00:00,ios|swift|uibezierpath|rounded-corners|spacing,2021-05-29 22:57:23.117000+00:00,Saturday,1,01:19,693.635808
45777,67349217,67741551,2021-05-01 17:43:24.997000+00:00,javascript|css,2021-05-28 15:18:48.583000+00:00,Saturday,17,17:43,645.589885
46098,67352708,67744939,2021-05-02 03:20:42.240000+00:00,python|django,2021-05-28 20:10:38.163000+00:00,Sunday,3,03:20,640.832201
41532,67349112,67699250,2021-05-01 17:34:03.697000+00:00,django|class|django-rest-framework,2021-05-26 05:57:48.013000+00:00,Saturday,17,17:34,588.395643
46603,67399034,67750961,2021-05-05 09:56:23.113000+00:00,java|drag-and-drop|awt|netbeans-platform,2021-05-29 11:44:36.820000+00:00,Wednesday,9,09:56,577.803807
...,...,...,...,...,...,...,...,...,...
12265,67450739,67450745,2021-05-08 18:12:53.640000+00:00,r|dataframe|ggplot2,2021-05-08 18:13:25.640000+00:00,Saturday,18,18:12,0.008889
12316,67451178,67451182,2021-05-08 19:03:44.257000+00:00,r|string,2021-05-08 19:04:12.137000+00:00,Saturday,19,19:03,0.007744
19631,67512565,67512567,2021-05-13 00:35:23.640000+00:00,sql|sql-server|tsql,2021-05-13 00:35:47.400000+00:00,Thursday,0,00:35,0.006600
8154,67415161,67415165,2021-05-06 09:13:30.220000+00:00,spring-boot|mybatis|lombok,2021-05-06 09:13:48.293000+00:00,Thursday,9,09:13,0.005020


In [33]:
# create a subset of merged_df for ML model
#response_duration_ML_df = merged_df['accepted_answer_id', 'question_day', 'question_hour', 'question_hour_min', 'accepted_answer_duration']
#response_duration_ML_df

practice_ML_df = merged_df[['accepted_answer_id', 'question_creation_date', 'question_day', 'question_hour', 'question_hour_min', 'accepted_answer_duration']]
practice_ML_df

Unnamed: 0,accepted_answer_id,question_creation_date,question_day,question_hour,question_hour_min,accepted_answer_duration
46975,67756344,2021-05-01 01:19:14.210000+00:00,Saturday,1,01:19,693.635808
45777,67741551,2021-05-01 17:43:24.997000+00:00,Saturday,17,17:43,645.589885
46098,67744939,2021-05-02 03:20:42.240000+00:00,Sunday,3,03:20,640.832201
41532,67699250,2021-05-01 17:34:03.697000+00:00,Saturday,17,17:34,588.395643
46603,67750961,2021-05-05 09:56:23.113000+00:00,Wednesday,9,09:56,577.803807
...,...,...,...,...,...,...
12265,67450745,2021-05-08 18:12:53.640000+00:00,Saturday,18,18:12,0.008889
12316,67451182,2021-05-08 19:03:44.257000+00:00,Saturday,19,19:03,0.007744
19631,67512567,2021-05-13 00:35:23.640000+00:00,Thursday,0,00:35,0.006600
8154,67415165,2021-05-06 09:13:30.220000+00:00,Thursday,9,09:13,0.005020


In [34]:
practice_ML_df.dtypes

accepted_answer_id                        int64
question_creation_date      datetime64[ns, UTC]
question_day                             object
question_hour                             int64
question_hour_min                        object
accepted_answer_duration                float64
dtype: object

In [35]:
#key to convert day of week to numerical value
day_to_number={"Sunday":1,
              "Monday":2,
               "Tuesday":3,
               "Wednesday":4,
               "Thursday":5,
               "Friday":6,
               "Saturday":7
              }

In [36]:
#define transformation fuction
def day_convert(x):
    return day_to_number[x]

In [37]:
#apply function to column
practice_ML_df["question_day"]=practice_ML_df["question_day"].apply(day_convert)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  practice_ML_df["question_day"]=practice_ML_df["question_day"].apply(day_convert)


In [38]:
practice_ML_df.head()

Unnamed: 0,accepted_answer_id,question_creation_date,question_day,question_hour,question_hour_min,accepted_answer_duration
46975,67756344,2021-05-01 01:19:14.210000+00:00,7,1,01:19,693.635808
45777,67741551,2021-05-01 17:43:24.997000+00:00,7,17,17:43,645.589885
46098,67744939,2021-05-02 03:20:42.240000+00:00,1,3,03:20,640.832201
41532,67699250,2021-05-01 17:34:03.697000+00:00,7,17,17:34,588.395643
46603,67750961,2021-05-05 09:56:23.113000+00:00,4,9,09:56,577.803807


In [39]:
practice_ML_df.dtypes

accepted_answer_id                        int64
question_creation_date      datetime64[ns, UTC]
question_day                              int64
question_hour                             int64
question_hour_min                        object
accepted_answer_duration                float64
dtype: object

In [40]:
#adding column to parse minute of the hour to get minute of the day
practice_ML_df["question_hour_min"]=merged_df['question_creation_date'].dt.minute

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  practice_ML_df["question_hour_min"]=merged_df['question_creation_date'].dt.minute


In [41]:
practice_ML_df.head()

Unnamed: 0,accepted_answer_id,question_creation_date,question_day,question_hour,question_hour_min,accepted_answer_duration
46975,67756344,2021-05-01 01:19:14.210000+00:00,7,1,19,693.635808
45777,67741551,2021-05-01 17:43:24.997000+00:00,7,17,43,645.589885
46098,67744939,2021-05-02 03:20:42.240000+00:00,1,3,20,640.832201
41532,67699250,2021-05-01 17:34:03.697000+00:00,7,17,34,588.395643
46603,67750961,2021-05-05 09:56:23.113000+00:00,4,9,56,577.803807


In [42]:
#adding column to show minute of the day question was asked

practice_ML_df["question_time"]=(practice_ML_df["question_hour"]*60)+(practice_ML_df["question_hour_min"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  practice_ML_df["question_time"]=(practice_ML_df["question_hour"]*60)+(practice_ML_df["question_hour_min"])


In [43]:
practice_ML_df.head()

Unnamed: 0,accepted_answer_id,question_creation_date,question_day,question_hour,question_hour_min,accepted_answer_duration,question_time
46975,67756344,2021-05-01 01:19:14.210000+00:00,7,1,19,693.635808,79
45777,67741551,2021-05-01 17:43:24.997000+00:00,7,17,43,645.589885,1063
46098,67744939,2021-05-02 03:20:42.240000+00:00,1,3,20,640.832201,200
41532,67699250,2021-05-01 17:34:03.697000+00:00,7,17,34,588.395643,1054
46603,67750961,2021-05-05 09:56:23.113000+00:00,4,9,56,577.803807,596


In [44]:
#consolidating df

practice_ML_df=practice_ML_df[["accepted_answer_id","question_creation_date","question_day","question_hour","question_time","accepted_answer_duration"]]

In [45]:
practice_ML_df.head()

Unnamed: 0,accepted_answer_id,question_creation_date,question_day,question_hour,question_time,accepted_answer_duration
46975,67756344,2021-05-01 01:19:14.210000+00:00,7,1,79,693.635808
45777,67741551,2021-05-01 17:43:24.997000+00:00,7,17,1063,645.589885
46098,67744939,2021-05-02 03:20:42.240000+00:00,1,3,200,640.832201
41532,67699250,2021-05-01 17:34:03.697000+00:00,7,17,1054,588.395643
46603,67750961,2021-05-05 09:56:23.113000+00:00,4,9,596,577.803807


# Analysis starts here

## Scatterplot to show relationship between time/day and question duration

In [46]:
#scatterplot to show relationship between time/day and question duration

plt.figure(figsize=(20, 5))

features = ['question_hour', 'question_day']
target = practice_ML_df['accepted_answer_duration']

for i, col in enumerate(features):
    plt.subplot(1, len(features) , i+1)
    x = practice_ML_df[col]
    y = target
    plt.scatter(x, y, marker='o')
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel("Accepted_Answer_Duration")

NameError: name 'plt' is not defined

## Regression Analysis using question_hour as feature

In [47]:
#pandas.DatetimeIndex.dayofweek - dont use datetimeindex because the DT is not an index
# post_questions['creation_day'] = pd.DatetimeIndex.dayofweek(post_questions['creation_date'])

## pseudo code
X = practice_ML_df[['question_hour']]
y = practice_ML_df['accepted_answer_duration']


In [48]:
plt.scatter(X, y)

NameError: name 'plt' is not defined

In [49]:
 # Create the model and fit the model to the data

from sklearn.linear_model import LinearRegression

model = LinearRegression()

X = X.values.reshape(-1, 1)

In [50]:
model.fit(X, y)

LinearRegression()

In [51]:
print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)

Weight coefficients:  [0.00815469]
y-axis intercept:  11.884090912541119


In [52]:
x_min = np.array([[X.min()]])
x_max = np.array([[X.max()]])
print(f"Min X Value: {x_min}")
print(f"Max X Value: {x_max}")

Min X Value: [[0]]
Max X Value: [[23]]


In [53]:
y_min = model.predict(x_min)
y_max = model.predict(x_max)

In [54]:
plt.scatter(X, y, c='blue')
plt.plot([x_min[0], x_max[0]], [y_min[0], y_max[0]], c='red')

NameError: name 'plt' is not defined

In [55]:
from sklearn.metrics import mean_squared_error

mse = (np.sqrt(mean_squared_error(X,y)))

In [56]:
print('MSE is {}'.format(rmse))

NameError: name 'rmse' is not defined

There is no correlation between hour posted and answer duration, try with more continuous data

## Scatterplot to show relationship between minute of the day and approved answer response

<b>question_hour_min</b> was converted into minute of the day with 12 am (00:00) being 0 minutes (see column <b>question_time</b>)

In [57]:
#scatterplot to show relationship between time/day and question duration 

plt.figure(figsize=(20, 5))

features = ['question_time']
target = practice_ML_df['accepted_answer_duration']

for i, col in enumerate(features):
    plt.subplot(1, len(features) , i+1)
    x = practice_ML_df[col]
    y = target
    plt.scatter(x, y, marker='o')
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel("Accepted_Answer_Duration")

NameError: name 'plt' is not defined

## Regression Analysis using question by minute of day

In [58]:
#pandas.DatetimeIndex.dayofweek - dont use datetimeindex because the DT is not an index
# post_questions['creation_day'] = pd.DatetimeIndex.dayofweek(post_questions['creation_date'])

## pseudo code
X = practice_ML_df[['question_time']]
y = practice_ML_df['accepted_answer_duration']

In [59]:
plt.scatter(X, y)

NameError: name 'plt' is not defined

In [60]:
 # Create the model and fit the model to the data

from sklearn.linear_model import LinearRegression

model = LinearRegression()

X = X.values.reshape(-1, 1)

In [61]:
model.fit(X, y)

LinearRegression()

In [62]:
print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)

Weight coefficients:  [0.00014666]
y-axis intercept:  11.871789739227022


In [63]:
x_min = np.array([[X.min()]])
x_max = np.array([[X.max()]])
print(f"Min X Value: {x_min}")
print(f"Max X Value: {x_max}")

Min X Value: [[0]]
Max X Value: [[1439]]


In [64]:
y_min = model.predict(x_min)
y_max = model.predict(x_max)

In [65]:
plt.scatter(X, y, c='blue')
plt.plot([x_min[0], x_max[0]], [y_min[0], y_max[0]], c='red')

NameError: name 'plt' is not defined

There is no correlation between the time of day the question is posted and the approved response duration <br><br>

<b>Next Steps:</b>
- try to see if there is a relationship between tags and answer_duration

- reframe question, use ML to determine whether a question has an approved answer or not

## Multiple Linear Regression

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [73]:
X = practice_ML_df[['question_time','question_day']]
y = practice_ML_df['accepted_answer_duration']
X.head(10)

In [74]:
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size =0.2)
# print the data


In [69]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()

In [70]:
clf.fit(X_train,y_train)

LinearRegression()

In [71]:
clf.predict(X_test)

array([11.67770788, 12.17600395, 11.64133887, ..., 11.91402154,
       11.82845906, 12.42678262])

In [72]:
clf.score(X_test,y_test)

-0.00025979394326269833

## 2nd Multiple Linear Regression Model with merged_df adding tags

In [None]:
X = merged_df[['question_hour','no_of_tags', ]]
y = merged_df['accepted_answer_duration']
X.head(10)

In [None]:
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size =0.2)
# print the data
X_train

In [None]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()