In [1]:
# Importing dependencies
! pip install google-cloud-bigquery
! pip install psycopg2-binary
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
from config_db import db_password



In [2]:
# Setting up the environment variable
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="data.json"

In [3]:
# Initializing a BigQuery client to authenticate and connect to BigQuery API
client = bigquery.Client()

In [4]:
# StackOverflow dataset from BigQuery
hn_dataset_ref = client.dataset('stackoverflow', project='bigquery-public-data')
hn_dset = client.get_dataset(hn_dataset_ref)

In [5]:
# Exploring available data tables from StackOverflow dataset
[x.table_id for x in client.list_tables(hn_dset)]

['badges',
 'comments',
 'post_history',
 'post_links',
 'posts_answers',
 'posts_moderator_nomination',
 'posts_orphaned_tag_wiki',
 'posts_privilege_wiki',
 'posts_questions',
 'posts_tag_wiki',
 'posts_tag_wiki_excerpt',
 'posts_wiki_placeholder',
 'stackoverflow_posts',
 'tags',
 'users',
 'votes']

In [6]:
# Querying for questions with accepted answers that were posted on and after 01/01/2021
query_job = client.query(
        """
        SELECT
        id, title, body, accepted_answer_id, creation_date, favorite_count, score, tags, view_count
        FROM `bigquery-public-data.stackoverflow.posts_questions`
        WHERE accepted_answer_id is not null AND creation_date >= '2021-01-01'
        ORDER BY accepted_answer_id"""
    )

results = query_job.result() 
results_list = [row for row in results]
cols = ['q_id', 'q_title', 'q_body', 'accepted_answer_id', 'q_creation_date', 'q_favorite_count', 'q_score', 'q_tags', 'q_view_count']
posts_questions_df = pd.DataFrame.from_records(results_list, columns=cols)
posts_questions_df.sample(n=15)

Unnamed: 0,q_id,q_title,q_body,accepted_answer_id,q_creation_date,q_favorite_count,q_score,q_tags,q_view_count
296564,67818234,"""requests-html"" proxy setting not working",<p>I'm using the following code to set the pro...,67819469,2021-06-03 08:44:09.363000+00:00,,1,python-requests-html,112
417222,68781251,"Ignoring folder in .gitignore, what's the diff...","<p>In .gitignore, we can ignore a folder using...",68781268,2021-08-14 07:28:56.383000+00:00,,0,git,31
296775,67821000,How to check If every Cell in Range found on a...,<p>I have data on Worksheets(&quot;North&quot;...,67821187,2021-06-03 11:52:42.743000+00:00,1.0,0,excel|vba,41
205766,67108254,How can I intercept the output stream and check?,<p>How can I intercept the output stream and c...,67108422,2021-04-15 12:14:09.527000+00:00,,0,bash|shell|sh,26
287701,67671660,Extraneous space between characters/glyphs in ...,<p>I am running ubuntu 16.04 (it is on an OLD ...,67749313,2021-05-24 11:54:15.797000+00:00,,1,fonts|ubuntu-16.04|symbols|spacing|groff,60
309271,67916165,How can I attach the corresponding HTML ending...,<p>In short I am making a program which scrape...,67920800,2021-06-10 06:55:01.810000+00:00,,0,python|regex|loops|web-scraping|beautifulsoup,21
88135,66209128,Including null inside PySpark isin,<p>This is my dataframe:</p>\n<pre><code>from ...,66209354,2021-02-15 13:39:20.173000+00:00,,1,python|apache-spark|pyspark|apache-spark-sql|isin,277
350583,68248708,What is a correct relational database design f...,<p>I am working on a database design where I h...,68248843,2021-07-04 20:55:44.663000+00:00,,1,mysql|database|database-design|relational-data...,28
391499,68572063,Return URL of locally stored image [Node JS API],<p>I'm create an API that receive base64 strin...,68573466,2021-07-29 07:46:51.150000+00:00,1.0,1,javascript|node.js|express|fs,36
22608,65668782,Dropdown Item not making api call on first try,<p>Im having an issue where when I select a ne...,65707240,2021-01-11 14:32:05.810000+00:00,,0,reactjs|drop-down-menu|state|use-state,57


In [7]:
# Determine data types for posts_questions_df
posts_questions_df.dtypes

q_id                                int64
q_title                            object
q_body                             object
accepted_answer_id                  int64
q_creation_date       datetime64[ns, UTC]
q_favorite_count                  float64
q_score                             int64
q_tags                             object
q_view_count                        int64
dtype: object

In [8]:
# Determine shape (number of rows/columns) for posts_questions_df
posts_questions_df.shape

(449605, 9)

In [9]:
# Explore unique values
posts_questions_df.nunique()

q_id                  449605
q_title               449601
q_body                449582
accepted_answer_id    449605
q_creation_date       449581
q_favorite_count          27
q_score                   92
q_tags                230332
q_view_count            3217
dtype: int64

In [10]:
# Set index to q_id (ID of question post)
posts_questions_df = posts_questions_df.set_index("q_id")
posts_questions_df.head()

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_favorite_count,q_score,q_tags,q_view_count
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
65526420,How to check if values in individiual rows of ...,<p>Suppose I have the following data.table:</p...,65526457,2021-01-01 00:05:46.310000+00:00,2.0,2,r|data.table|rowwise,62
65526423,d3.js : Generating axis ticks for ordinal values,<p>I want to use ordinal scale in x-axis with ...,65526533,2021-01-01 00:06:09.007000+00:00,,2,javascript|d3.js,48
65526490,Is there a C macro that replaces varied length...,<p>I want to be able to:</p>\n<pre><code>#defi...,65526541,2021-01-01 00:20:09.553000+00:00,0.0,2,c|macros,35
65526419,How can I construct my objects allocated throu...,<p>C++20 removed the <code>construct()</code> ...,65526554,2021-01-01 00:05:43.627000+00:00,,3,c++|std|c++20|allocator,351
65526523,Navigate from parent to child in react where e...,<p>In a React app with a parent and child elem...,65526577,2021-01-01 00:30:31.933000+00:00,,2,reactjs|react-router|react-router-dom,117


In [11]:
# Determine if there are many missing values in the posts_questions_df
[[column,posts_questions_df[column].isnull().sum()] for column in posts_questions_df.columns]

[['q_title', 0],
 ['q_body', 0],
 ['accepted_answer_id', 0],
 ['q_creation_date', 0],
 ['q_favorite_count', 409078],
 ['q_score', 0],
 ['q_tags', 0],
 ['q_view_count', 0]]

In [12]:
# Determine percentage of rows with missing value in q_favorite_count column
percent_null = (posts_questions_df["q_favorite_count"].isnull().sum() / len(posts_questions_df)) * 100
percent_null

90.98608778816961

In [13]:
# Drop q_favorite_count column and verify
posts_questions_df = posts_questions_df.drop(columns='q_favorite_count')
posts_questions_df.columns

Index(['q_title', 'q_body', 'accepted_answer_id', 'q_creation_date', 'q_score',
       'q_tags', 'q_view_count'],
      dtype='object')

In [14]:
# Confirm no missing values in dataframe
[[column,posts_questions_df[column].isnull().sum()] for column in posts_questions_df.columns]

[['q_title', 0],
 ['q_body', 0],
 ['accepted_answer_id', 0],
 ['q_creation_date', 0],
 ['q_score', 0],
 ['q_tags', 0],
 ['q_view_count', 0]]

In [15]:
# Additional method to verify number of rows with zero response time (what's the possible explanation?)
q_score_explore_raw = posts_questions_df.groupby('q_score').count()
q_score_explore_raw

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_tags,q_view_count
q_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-18,1,1,1,1,1,1
-12,1,1,1,1,1,1
-11,2,2,2,2,2,2
-10,6,6,6,6,6,6
-9,9,9,9,9,9,9
...,...,...,...,...,...,...
153,2,2,2,2,2,2
164,2,2,2,2,2,2
201,1,1,1,1,1,1
216,1,1,1,1,1,1


In [16]:
# Bin score for more meaningful analysis
# Establish the bins.
q_score_bins = [-10, -1, 0, 10, 25, 50, 101]
q_score_group_names = ["Negative Score (<0)", "Zero Score (0)", "Low Score (0-10)", "Medium Score (10 - 25)", "High Score (25-50)", "Popular Score (50-101)"]

# Categorize score based on the bins.
posts_questions_df['q_score_tier'] = pd.cut(posts_questions_df['q_score'], q_score_bins, labels=q_score_group_names)

In [17]:
# explore scores
q_score_explore = posts_questions_df.groupby('q_score_tier').count()
q_score_explore

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count
q_score_tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Negative Score (<0),44388,44388,44388,44388,44388,44388,44388
Zero Score (0),244214,244214,244214,244214,244214,244214,244214
Low Score (0-10),160359,160359,160359,160359,160359,160359,160359
Medium Score (10 - 25),473,473,473,473,473,473,473
High Score (25-50),112,112,112,112,112,112,112
Popular Score (50-101),37,37,37,37,37,37,37


In [18]:
# Bin score for more meaningful analysis (broader bin)

# Establish the bins.
q_score_bins = [-10, -1, 0, 101]
q_score_group_names = ["Negative Score (<0)", "Zero Score (0)", "Positive Score (>0)"]
# Categorize score based on the bins.
posts_questions_df['q_score_tier'] = pd.cut(posts_questions_df['q_score'], q_score_bins, labels=q_score_group_names)
# Bin scores
q_score_explore = posts_questions_df.groupby('q_score_tier').count()
q_score_explore

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count
q_score_tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Negative Score (<0),44388,44388,44388,44388,44388,44388,44388
Zero Score (0),244214,244214,244214,244214,244214,244214,244214
Positive Score (>0),160981,160981,160981,160981,160981,160981,160981


In [19]:
# Verify whether there are redundant columns for score_tier
posts_questions_df.columns

Index(['q_title', 'q_body', 'accepted_answer_id', 'q_creation_date', 'q_score',
       'q_tags', 'q_view_count', 'q_score_tier'],
      dtype='object')

In [20]:
# Verify data in q_score_tier column
posts_questions_df.sample(n=10)

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
66363124,"In Google Sheets, is there a way to override o...","<p>Here is a <a href=""https://docs.google.com/...",66364688,2021-02-25 05:54:04+00:00,2,google-sheets|conditional-formatting,89,Positive Score (>0)
67539223,Cannot convert value class net.sf.saxon.tinytr...,<p>We have a use-case where we are calling jav...,67544270,2021-05-14 18:19:09.697000+00:00,1,java|date|templates|xslt|saxon,43,Positive Score (>0)
66789764,How services api snippets works,<p>I am integrating a website to Facebook Pixe...,66790689,2021-03-24 21:37:34.267000+00:00,0,javascript|reactjs|typescript|facebook-pixel,22,Zero Score (0)
66595875,How to fetch a python file data in Django?,<p>I have a python file:</p>\n<pre><code>impor...,66596129,2021-03-12 07:27:50.120000+00:00,-2,python|django,33,Negative Score (<0)
66639880,How to if h1 text == this then this happens,<p>So I made this IP script were when you get ...,66640141,2021-03-15 14:28:42.243000+00:00,-1,javascript|html|ip,16,Negative Score (<0)
67935510,Material ui card open in new tab on Ctrl+click,"<p>I am using Material ui card, and I have <co...",67944770,2021-06-11 10:37:12.050000+00:00,0,reactjs|material-ui,58,Zero Score (0)
67750930,props not updating in child component after pa...,<p>I am working on a ToDo application and I pa...,67751059,2021-05-29 11:41:17.430000+00:00,0,javascript|reactjs|react-native|react-hooks|cl...,23,Zero Score (0)
68003247,Read a binary file stored in HDFS with Python,<p>I have some binary files.\nWhen i store the...,68003322,2021-06-16 13:05:37.427000+00:00,0,python|hdfs|binaryfiles|python-cryptography,47,Zero Score (0)
68496404,Is there any sql query for select specific dat...,<p>I have table of dates like this one below:<...,68496688,2021-07-23 08:39:02.800000+00:00,0,sql,36,Zero Score (0)
67292312,React limits the number of renders to prevent ...,<p>I have a simple section in which contains a...,67292368,2021-04-28 00:42:23.957000+00:00,0,javascript|reactjs|slick.js|react-slick,33,Zero Score (0)


In [21]:
# Determine length of title
posts_questions_df['q_title_char_count']= posts_questions_df['q_title'].str.len() #character count
posts_questions_df['q_title_word_count']= posts_questions_df['q_title'].str.split().str.len() #word count
posts_questions_df.head()

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
65526420,How to check if values in individiual rows of ...,<p>Suppose I have the following data.table:</p...,65526457,2021-01-01 00:05:46.310000+00:00,2,r|data.table|rowwise,62,Positive Score (>0),72,13
65526423,d3.js : Generating axis ticks for ordinal values,<p>I want to use ordinal scale in x-axis with ...,65526533,2021-01-01 00:06:09.007000+00:00,2,javascript|d3.js,48,Positive Score (>0),48,8
65526490,Is there a C macro that replaces varied length...,<p>I want to be able to:</p>\n<pre><code>#defi...,65526541,2021-01-01 00:20:09.553000+00:00,2,c|macros,35,Positive Score (>0),81,13
65526419,How can I construct my objects allocated throu...,<p>C++20 removed the <code>construct()</code> ...,65526554,2021-01-01 00:05:43.627000+00:00,3,c++|std|c++20|allocator,351,Positive Score (>0),76,9
65526523,Navigate from parent to child in react where e...,<p>In a React app with a parent and child elem...,65526577,2021-01-01 00:30:31.933000+00:00,2,reactjs|react-router|react-router-dom,117,Positive Score (>0),82,14


In [22]:
# Bin by title length (char count)
q_title_explore_char = posts_questions_df.groupby('q_title_char_count').count()
q_title_explore_char

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_word_count
q_title_char_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
15,163,163,163,163,163,163,163,163,163
16,279,279,279,279,279,279,279,279,279
17,342,342,342,342,342,342,342,342,342
18,462,462,462,462,462,462,462,462,462
19,619,619,619,619,619,619,619,619,619
...,...,...,...,...,...,...,...,...,...
146,200,200,200,200,200,200,200,200,200
147,203,203,203,203,203,203,203,203,203
148,220,220,220,220,220,220,220,220,220
149,226,226,226,226,226,226,226,226,226


In [23]:
# Bin title length by char count

# Establish the bins.
q_char_count_bins = [0, 50, 100, 150]
q_char_count_group_names = ["Short (0 - 50)", "Medium (50-100)", "Long (100-150)"]

# Categorize char length based on the bins.
posts_questions_df['q_title_char_count_bin'] = pd.cut(posts_questions_df['q_title_char_count'], q_char_count_bins, labels=q_char_count_group_names)

# Bin by char length
q_char_count_explore = posts_questions_df.groupby('q_title_char_count_bin').count()
q_char_count_explore

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count
q_title_char_count_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Short (0 - 50),175309,175309,175309,175309,175309,175309,175309,175302,175309,175309
Medium (50-100),250297,250297,250297,250297,250297,250297,250297,250284,250297,250297
Long (100-150),23999,23999,23999,23999,23999,23999,23999,23997,23999,23999


In [24]:
# Bin by title length (word count)
q_title_explore_word = posts_questions_df.groupby('q_title_word_count').count()
q_title_explore_word

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_char_count_bin
q_title_word_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,31,31,31,31,31,31,31,31,31,31
2,731,731,731,731,731,731,731,731,731,731
3,5384,5384,5384,5384,5384,5384,5384,5384,5384,5384
4,14809,14809,14809,14809,14809,14809,14809,14809,14809,14809
5,26286,26286,26286,26286,26286,26286,26286,26285,26286,26286
6,38441,38441,38441,38441,38441,38441,38441,38439,38441,38441
7,46820,46820,46820,46820,46820,46820,46820,46816,46820,46820
8,51386,51386,51386,51386,51386,51386,51386,51383,51386,51386
9,50089,50089,50089,50089,50089,50089,50089,50089,50089,50089
10,45341,45341,45341,45341,45341,45341,45341,45338,45341,45341


In [25]:
# Bin title length by word count

# Establish the bins.
q_title_word_count_bins = [0, 10, 20, 30, 40]
q_title_word_count_group_names = ["Short (0 - 10)", "Medium (10-20)", "Long (20-30)", "XL (30+)"]

# Categorize char length based on the bins.
posts_questions_df['q_title_word_count_bin'] = pd.cut(posts_questions_df['q_title_word_count'], q_title_word_count_bins, labels=q_title_word_count_group_names)

# Bin by char length
q_word_count_explore = posts_questions_df.groupby('q_title_word_count_bin').count()
q_word_count_explore

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin
q_title_word_count_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Short (0 - 10),279318,279318,279318,279318,279318,279318,279318,279305,279318,279318,279318
Medium (10-20),161428,161428,161428,161428,161428,161428,161428,161419,161428,161428,161428
Long (20-30),8796,8796,8796,8796,8796,8796,8796,8796,8796,8796,8796
XL (30+),63,63,63,63,63,63,63,63,63,63,63


In [26]:
# Explore view count
q_views_explore_raw = posts_questions_df.groupby('q_view_count').count()
q_views_explore_raw

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin
q_view_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,3,3,3,3,3,3,3,3,3,3,3
6,8,8,8,8,8,8,8,8,8,8,8
7,25,25,25,25,25,25,25,25,25,25,25
8,59,59,59,59,59,59,59,59,59,59,59
9,119,119,119,119,119,119,119,119,119,119,119
...,...,...,...,...,...,...,...,...,...,...,...
57402,1,1,1,1,1,1,1,1,1,1,1
65954,1,1,1,1,1,1,1,1,1,1,1
93510,1,1,1,1,1,1,0,1,1,1,1
95253,1,1,1,1,1,1,0,1,1,1,1


In [27]:
q_views_explore_raw.sort_values(by=['q_title']).tail(20) # Majority views around mid 20s to 30s

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin
q_view_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
45,7026,7026,7026,7026,7026,7026,7026,7026,7026,7026,7026
43,7379,7379,7379,7379,7379,7379,7379,7379,7379,7379,7379
42,7409,7409,7409,7409,7409,7409,7409,7409,7409,7409,7409
25,7481,7481,7481,7481,7481,7481,7481,7481,7481,7481,7481
41,7707,7707,7707,7707,7707,7707,7707,7707,7707,7707,7707
26,7738,7738,7738,7738,7738,7738,7738,7738,7738,7738,7738
40,8044,8044,8044,8044,8044,8044,8044,8044,8044,8044,8044
39,8141,8141,8141,8141,8141,8141,8141,8141,8141,8141,8141
27,8255,8255,8255,8255,8255,8255,8255,8255,8255,8255,8255
28,8319,8319,8319,8319,8319,8319,8319,8319,8319,8319,8319


In [28]:
# Bin view count for more meaningful analysis
# Establish the bins.
q_view_count_bins = [0,  50, 500, 1000, 5000, 16000]
q_view_count_group_names = ["<50", "50-500", "500-1000", "1000-5000", "5000-16000"]

# Categorize score based on the bins.
posts_questions_df['q_view_count_bin'] = pd.cut(posts_questions_df['q_view_count'], q_view_count_bins, labels=q_view_count_group_names)

In [29]:
# Explore view count
q_views_explore = posts_questions_df.groupby('q_view_count_bin').count()
q_views_explore

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin
q_view_count_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
<50,250328,250328,250328,250328,250328,250328,250328,250328,250328,250328,250328,250328
50-500,188280,188280,188280,188280,188280,188280,188280,188271,188280,188280,188280,188280
500-1000,6733,6733,6733,6733,6733,6733,6733,6733,6733,6733,6733,6733
1000-5000,3814,3814,3814,3814,3814,3814,3814,3812,3814,3814,3814,3814
5000-16000,377,377,377,377,377,377,377,374,377,377,377,377


In [30]:
# Readjust bins for more distributed data - add meaningfulness to ML model inputs

# Establish the bins.
q_view_count_bins = [0, 10, 20, 30, 40, 50, 16000]
q_view_count_group_names = ["<10", "10-20", "20-30","30-40", "40-50", "50-16000"]

# Categorize score based on the bins.
posts_questions_df['q_view_count_bin'] = pd.cut(posts_questions_df['q_view_count'], q_view_count_bins, labels=q_view_count_group_names)
# Explore view count
q_views_explore = posts_questions_df.groupby('q_view_count_bin').count()
q_views_explore

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin
q_view_count_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
<10,452,452,452,452,452,452,452,452,452,452,452,452
10-20,22517,22517,22517,22517,22517,22517,22517,22517,22517,22517,22517,22517
20-30,73935,73935,73935,73935,73935,73935,73935,73935,73935,73935,73935,73935
30-40,86251,86251,86251,86251,86251,86251,86251,86251,86251,86251,86251,86251
40-50,67173,67173,67173,67173,67173,67173,67173,67173,67173,67173,67173,67173
50-16000,199204,199204,199204,199204,199204,199204,199204,199190,199204,199204,199204,199204


In [31]:
# Determine word count of body
posts_questions_df['q_body_word_count']= posts_questions_df['q_body'].str.split().str.len() #word count
# Group by body word count
q_body_explore_raw = posts_questions_df.groupby('q_body_word_count').count()
q_body_explore_raw

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin,q_view_count_bin
q_body_word_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,1,1,1
4,2,2,2,2,2,2,2,2,2,2,2,2,2
5,5,5,5,5,5,5,5,5,5,5,5,5,5
6,6,6,6,6,6,6,6,6,6,6,6,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6149,1,1,1,1,1,1,1,1,1,1,1,1,1
6201,1,1,1,1,1,1,1,1,1,1,1,1,1
6712,1,1,1,1,1,1,1,1,1,1,1,1,1
7435,1,1,1,1,1,1,1,1,1,1,1,1,1


In [32]:
q_body_explore_raw.sort_values(by=['q_title']).tail(20) # Majority views around mid 20s to 30s

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin,q_view_count_bin
q_body_word_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
99,2314,2314,2314,2314,2314,2314,2314,2314,2314,2314,2314,2314,2313
104,2326,2326,2326,2326,2326,2326,2326,2326,2326,2326,2326,2326,2326
82,2326,2326,2326,2326,2326,2326,2326,2325,2326,2326,2326,2326,2326
101,2335,2335,2335,2335,2335,2335,2335,2334,2335,2335,2335,2335,2334
106,2336,2336,2336,2336,2336,2336,2336,2336,2336,2336,2336,2336,2336
94,2336,2336,2336,2336,2336,2336,2336,2335,2336,2336,2336,2336,2335
76,2339,2339,2339,2339,2339,2339,2339,2339,2339,2339,2339,2339,2339
80,2342,2342,2342,2342,2342,2342,2342,2342,2342,2342,2342,2342,2342
111,2344,2344,2344,2344,2344,2344,2344,2344,2344,2344,2344,2344,2343
83,2348,2348,2348,2348,2348,2348,2348,2347,2348,2348,2348,2348,2347


In [33]:
# Bin body word count for more meaningful analysis
# Establish the bins.
q_body_len_bins = [0,  100, 500, 10000]
q_body_len_group_names = ["<100", "100-500", "500-10000"]

# Categorize body length based on the bins.
posts_questions_df['q_body_len_bin'] = pd.cut(posts_questions_df['q_body_word_count'], q_body_len_bins, labels=q_body_len_group_names)

# Explore body length
q_body_len_explore = posts_questions_df.groupby('q_body_len_bin').count()
q_body_len_explore

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin,q_view_count_bin,q_body_word_count
q_body_len_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
<100,141210,141210,141210,141210,141210,141210,141210,141196,141210,141210,141210,141210,141182,141210
100-500,290518,290518,290518,290518,290518,290518,290518,290510,290518,290518,290518,290518,290475,290518
500-10000,17877,17877,17877,17877,17877,17877,17877,17877,17877,17877,17877,17877,17875,17877


In [34]:
# Readjust bins for more distributed data - add meaningfulness to ML model inputs

# Establish the bins.
q_body_len_bins = [0, 50, 100, 250, 500, 10000]
q_body_len_group_names = ["<50", "50-100", "100-250", "250-500", "500-10000"]

# Categorize body length based on the bins.
posts_questions_df['q_body_len_bin'] = pd.cut(posts_questions_df['q_body_word_count'], q_body_len_bins, labels=q_body_len_group_names)

# Explore body length
q_body_len_explore = posts_questions_df.groupby('q_body_len_bin').count()
q_body_len_explore

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin,q_view_count_bin,q_body_word_count
q_body_len_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
<50,32509,32509,32509,32509,32509,32509,32509,32504,32509,32509,32509,32509,32502,32509
50-100,108701,108701,108701,108701,108701,108701,108701,108692,108701,108701,108701,108701,108680,108701
100-250,218817,218817,218817,218817,218817,218817,218817,218812,218817,218817,218817,218817,218780,218817
250-500,71701,71701,71701,71701,71701,71701,71701,71698,71701,71701,71701,71701,71695,71701
500-10000,17877,17877,17877,17877,17877,17877,17877,17877,17877,17877,17877,17877,17875,17877


In [35]:
q_tags = posts_questions_df['q_tags']
q_tags # Observe the list of tags

q_id
65526420                          r|data.table|rowwise
65526423                              javascript|d3.js
65526490                                      c|macros
65526419                       c++|std|c++20|allocator
65526523         reactjs|react-router|react-router-dom
                               ...                    
69060773                           android-progressbar
69060793                          bash|virtual-machine
69060850                                 intellij-idea
69060801    python|pandas|dataframe|time-series|cumsum
69060411                    php|file|validation|exists
Name: q_tags, Length: 449605, dtype: object

In [36]:
q_tags_count = q_tags.str.count('\\|') + 1 # Count the number of | + 1 to get the number of tags in each row

posts_questions_df['q_tags_count'] = q_tags_count # Append the list as a column to the posts_questions_df 
tag_count = posts_questions_df[['q_tags','q_tags_count']]
tag_count

Unnamed: 0_level_0,q_tags,q_tags_count
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1
65526420,r|data.table|rowwise,3
65526423,javascript|d3.js,2
65526490,c|macros,2
65526419,c++|std|c++20|allocator,4
65526523,reactjs|react-router|react-router-dom,3
...,...,...
69060773,android-progressbar,1
69060793,bash|virtual-machine,2
69060850,intellij-idea,1
69060801,python|pandas|dataframe|time-series|cumsum,5


In [37]:
# Group by tag count
tag_count_exlore = tag_count.groupby('q_tags_count').count()
tag_count_exlore

Unnamed: 0_level_0,q_tags
q_tags_count,Unnamed: 1_level_1
1,53181
2,117262
3,125491
4,86881
5,66790


In [38]:
# Add column for day of question_creation_date [question_day]
posts_questions_df['q_day'] = posts_questions_df['q_creation_date'].dt.day_name()
posts_questions_df.head()

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin,q_view_count_bin,q_body_word_count,q_body_len_bin,q_tags_count,q_day
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
65526420,How to check if values in individiual rows of ...,<p>Suppose I have the following data.table:</p...,65526457,2021-01-01 00:05:46.310000+00:00,2,r|data.table|rowwise,62,Positive Score (>0),72,13,Medium (50-100),Medium (10-20),50-16000,116,100-250,3,Friday
65526423,d3.js : Generating axis ticks for ordinal values,<p>I want to use ordinal scale in x-axis with ...,65526533,2021-01-01 00:06:09.007000+00:00,2,javascript|d3.js,48,Positive Score (>0),48,8,Short (0 - 50),Short (0 - 10),40-50,58,50-100,2,Friday
65526490,Is there a C macro that replaces varied length...,<p>I want to be able to:</p>\n<pre><code>#defi...,65526541,2021-01-01 00:20:09.553000+00:00,2,c|macros,35,Positive Score (>0),81,13,Medium (50-100),Medium (10-20),30-40,117,100-250,2,Friday
65526419,How can I construct my objects allocated throu...,<p>C++20 removed the <code>construct()</code> ...,65526554,2021-01-01 00:05:43.627000+00:00,3,c++|std|c++20|allocator,351,Positive Score (>0),76,9,Medium (50-100),Short (0 - 10),50-16000,50,<50,4,Friday
65526523,Navigate from parent to child in react where e...,<p>In a React app with a parent and child elem...,65526577,2021-01-01 00:30:31.933000+00:00,2,reactjs|react-router|react-router-dom,117,Positive Score (>0),82,14,Medium (50-100),Medium (10-20),50-16000,305,250-500,3,Friday


In [39]:
# Add column for hour value of question_creation_date [question_time]
posts_questions_df['q_hour'] = posts_questions_df['q_creation_date'].dt.hour

posts_questions_df

Unnamed: 0_level_0,q_title,q_body,accepted_answer_id,q_creation_date,q_score,q_tags,q_view_count,q_score_tier,q_title_char_count,q_title_word_count,q_title_char_count_bin,q_title_word_count_bin,q_view_count_bin,q_body_word_count,q_body_len_bin,q_tags_count,q_day,q_hour
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
65526420,How to check if values in individiual rows of ...,<p>Suppose I have the following data.table:</p...,65526457,2021-01-01 00:05:46.310000+00:00,2,r|data.table|rowwise,62,Positive Score (>0),72,13,Medium (50-100),Medium (10-20),50-16000,116,100-250,3,Friday,0
65526423,d3.js : Generating axis ticks for ordinal values,<p>I want to use ordinal scale in x-axis with ...,65526533,2021-01-01 00:06:09.007000+00:00,2,javascript|d3.js,48,Positive Score (>0),48,8,Short (0 - 50),Short (0 - 10),40-50,58,50-100,2,Friday,0
65526490,Is there a C macro that replaces varied length...,<p>I want to be able to:</p>\n<pre><code>#defi...,65526541,2021-01-01 00:20:09.553000+00:00,2,c|macros,35,Positive Score (>0),81,13,Medium (50-100),Medium (10-20),30-40,117,100-250,2,Friday,0
65526419,How can I construct my objects allocated throu...,<p>C++20 removed the <code>construct()</code> ...,65526554,2021-01-01 00:05:43.627000+00:00,3,c++|std|c++20|allocator,351,Positive Score (>0),76,9,Medium (50-100),Short (0 - 10),50-16000,50,<50,4,Friday,0
65526523,Navigate from parent to child in react where e...,<p>In a React app with a parent and child elem...,65526577,2021-01-01 00:30:31.933000+00:00,2,reactjs|react-router|react-router-dom,117,Positive Score (>0),82,14,Medium (50-100),Medium (10-20),50-16000,305,250-500,3,Friday,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69060773,android studio: center ProgressPar on top of R...,<p>I'm working on a chat app where users can s...,69060817,2021-09-05 05:31:01.447000+00:00,0,android-progressbar,7,Zero Score (0),61,8,Medium (50-100),Short (0 - 10),<10,269,250-500,1,Sunday,5
69060793,"bash command for hostname only returns ""server...",<p>I'm very new to bash scripting and I'm stuc...,69060824,2021-09-05 05:34:58.063000+00:00,1,bash|virtual-machine,13,Positive Score (>0),94,14,Medium (50-100),Medium (10-20),10-20,164,100-250,2,Sunday,5
69060850,Intellij how to delete all line containing ann...,<p>I want to delete all lines that contains an...,69060862,2021-09-05 05:45:41.713000+00:00,1,intellij-idea,11,Positive Score (>0),54,8,Medium (50-100),Short (0 - 10),10-20,46,<50,1,Sunday,5
69060801,Getting sum data for smoothly shifting groups ...,<p>I have a time series data of the following ...,69060898,2021-09-05 05:36:53.920000+00:00,2,python|pandas|dataframe|time-series|cumsum,28,Positive Score (>0),84,16,Medium (50-100),Medium (10-20),20-30,255,250-500,5,Sunday,5


In [40]:
db_string = f"postgresql://acospynamiyoti:{db_password}@ec2-44-198-223-154.compute-1.amazonaws.com:5432/d443pqekji2r98"

In [41]:
engine = create_engine(db_string)

In [42]:
# Import posts_questions_df to SQL table

posts_questions_df.to_sql(name='posts_questions', con=engine)