In [14]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from scipy import stats

pd.set_option("display.width", 500)
pd.set_option("display.max_columns", 100)

import os
os.makedirs("../plots/posts", exist_ok=True)

In [5]:
# read the first 10000 rows of the data
posts = pd.read_csv("../data/Posts.csv", nrows=100000, delimiter='\x17')
print(posts.describe())
print(posts.head())

       AcceptedAnswerId   AnswerCount   CommentCount  FavoriteCount             Id  LastEditorUserId   OwnerUserId      ParentId     PostTypeId         Score     ViewCount
count      1.547800e+04  19584.000000  100000.000000   13431.000000  100000.000000      3.734100e+04  9.577200e+04  8.041600e+04  100000.000000  100000.00000  1.958400e+04
mean       4.164653e+05      6.657118       0.865430       0.016976   92101.616420      1.000108e+06  4.466668e+04  8.886101e+04       1.804160      27.21950  5.029548e+04
std        2.846420e+06      7.789822       2.080746       1.941515   52936.375435      2.280316e+06  2.684110e+05  1.020035e+05       0.396848     207.47704  2.097155e+05
min        7.000000e+00      0.000000       0.000000       0.000000       4.000000     -1.000000e+00  1.000000e+00  4.000000e+00       1.000000     -48.00000  7.200000e+01
25%        4.897225e+04      3.000000       0.000000       0.000000   48207.750000      2.688000e+03  1.816000e+03  4.312600e+04       2.000

In [15]:
# for each numerical column, plot a histogram
for column in posts.select_dtypes(include=[np.number]).columns:
    plt.hist(posts[column].dropna(), bins=20)
    plt.title(column)
    plt.semilogy()
    plt.savefig(f"../plots/posts/{column}.png")
    plt.close()

In [21]:
# plotting a score histogram for posttypeid 1 and 2
# bins from 0 to 20000, 20 bins
bins = np.linspace(0, 20000, 20)
plt.hist(posts[posts.PostTypeId == 1].Score, bins=bins, alpha=0.5, label="Question")
plt.hist(posts[posts.PostTypeId == 2].Score, bins=bins, alpha=0.5, label="Answer")
plt.legend()
plt.semilogy()
plt.savefig("../plots/posts/score_PostTypeId.png")
plt.close()

In [22]:
# plotting view count histogram for posttypeid 1 and 2
# bins from 0 to 5,500,000, 20 bins
bins = np.linspace(0, 5_500_000, 20)
plt.hist(posts[posts.PostTypeId == 1].ViewCount, bins=bins, alpha=0.5, label="Question")
plt.hist(posts[posts.PostTypeId == 2].ViewCount, bins=bins, alpha=0.5, label="Answer")
plt.legend()
plt.semilogy()
plt.savefig("../plots/posts/viewcount_PostTypeId.png")
plt.close()

  xmin = min(xmin, np.nanmin(xi))
  xmax = max(xmax, np.nanmax(xi))
