# PROYEKSI PENDUDUK MENURUT PROVINSI 2025

# Chapter 2 : Data and Sampling Distributions

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats import power
from itertools import permutations
from sklearn.utils import resample
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import pylab
import matplotlib.pylab as plt

%matplotlib inline

In [None]:
# The data directory lives two levels above the current working directory.
DATA = Path('.').resolve().parents[1].joinpath('data')
# CSV file analysed throughout this notebook.
TUGAS_CSV = DATA.joinpath('tugas.csv')

In [None]:
# Load the CSV into the DataFrame used by the cells below.
data = pd.read_csv(filepath_or_buffer=TUGAS_CSV)

In [None]:
# Show the first 41 rows of the loaded table.
print(data.head(n=41))

# Distribusi Sampling

In [None]:
# Compare the theoretical standard normal density (left panel) with a
# histogram of 1000 random draws from the same distribution (right panel).
np.random.seed(seed=1)  # fixed seed so the sampled histogram is reproducible
x = np.linspace(-3, 3, 300)
xsample = stats.norm.rvs(size=1000)

fig, axes = plt.subplots(ncols=2, figsize=(5, 1.5))

# Left panel: filled density curve evaluated on the grid x.
ax = axes[0]
ax.fill(x, stats.norm.pdf(x))
ax.set_axis_off()
ax.set_xlim(-3, 3)

# Right panel: histogram of the random draws over the same x-range.
ax = axes[1]
ax.hist(xsample, bins=30)
ax.set_axis_off()
ax.set_xlim(-3, 3)

plt.show()

In [None]:
# Draw 32 random rows and compare the distribution of the raw values with
# the distribution of rolling means computed over that sample.
samples = data.sample(n=32)

# Rolling mean over windows of 5 sampled values. The column is selected
# before rolling so that any non-numeric columns in the table are never
# aggregated; the first window-1 entries are NaN and are dropped.
sample_mean_5 = samples.jumlah.rolling(5).mean()
sample_mean_5 = sample_mean_5.dropna()

# Rolling mean over windows of 20 sampled values.
sample_mean_20 = samples.jumlah.rolling(20).mean()
sample_mean_20 = sample_mean_20.dropna()

fig = plt.figure(figsize=(10, 10))

# Top-left: distribution of the full data set.
ax = fig.add_subplot(2, 2, 1)
ax.hist(data.jumlah, bins=25)
ax.set_title('full plot dari keseluruhan populasi')

# Top-right: distribution of the 32 sampled values.
ax = fig.add_subplot(2, 2, 2)
ax.hist(samples.jumlah, bins=32)
ax.set_title('full plot dari 32 samples')

# Bottom-left: distribution of the window-5 rolling means.
ax = fig.add_subplot(2, 2, 3)
ax.hist(sample_mean_5, bins=32)
ax.set_title('plot dari rolling data mean 5 dari 32 samples')

# Bottom-right: distribution of the window-20 rolling means.
ax = fig.add_subplot(2, 2, 4)
ax.hist(sample_mean_20, bins=32)
ax.set_title('plot dari rolling data mean 20 dari 32 samples')

plt.show()

# Bootstrap

In [None]:
# Random sample of 32 values from the 'jumlah' column (without replacement,
# which is the pandas default).
sample_data = data.jumlah.sample(n=32)

In [None]:
# Summary statistics of the full 'jumlah' column: sample standard deviation,
# mean, median and standard error of the mean.
population = data.jumlah
summary = [
    ('STD dari populasi total :', population.std()),
    ('rata-rata populasi :', population.mean()),
    ('median populasi :', np.median(population)),
    ('standar eror populasi :', stats.sem(population, ddof=1)),
]
for label, value in summary:
    print(label, value)

In [None]:
# Draw two resamples with replacement from 'jumlah': one of size 5 and one
# of size 20 (drawn in that order), then collect them in a list.
bootstrap_5, bootstrap_20 = (
    np.random.choice(data.jumlah, size=size, replace=True)
    for size in (5, 20)
)
arr_bootstraps = [bootstrap_5, bootstrap_20]

# Cell output: number of resampled arrays.
len(arr_bootstraps)

In [None]:
# Print summary statistics for each resampled array, labelled by its size.
for boot in arr_bootstraps:
    print('bootstrap stats untuk contoh sejumlah {}'.format(len(boot)))
    print('-------------------------------------------------------')
    print('Bootstrap std: ', np.std(boot, ddof=1))
    print('Rata-rata Bootstrap: ', np.mean(boot))
    print('Median Bootstrap: ', np.median(boot))
    # Bias is reported as (estimate - reference value), the same sign
    # convention used for the median bootstrap later in this notebook.
    print('Bias Bootstrap: ', np.mean(boot) - data.jumlah.mean())
    print('standar error: ', stats.sem(boot, ddof=1))
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')


In [None]:
# Bootstrap the median of 'jumlah': resample the column with replacement
# 32 times and record the median of each resample.
results = pd.Series([resample(data.jumlah).median() for _ in range(32)])

nilai_asli = data.jumlah.median()
print('Statistik Bootstrap: ')
print(f'nilai asli: {nilai_asli}')
# Bias: mean of the resampled medians minus the median of the original data.
print(f'bias: {results.mean() - nilai_asli}')
# Standard error: spread of the resampled medians.
print(f'std. error: {results.std()}')
    

# Confidence Intervals

In [None]:
# Bootstrap a 90% confidence interval for the mean of a 20-value sample
# and draw it on top of the histogram of the bootstrap means.
print('rata-rata data: ', data.jumlah.mean())
np.random.seed(seed=3)  # fixed seed so the sample and resamples are reproducible

# Draw a sample of 20 values without replacement.
sample_20 = resample(data.jumlah, n_samples=20, replace=False)
print('rata-rata sample: ', sample_20.mean())

# Resample that sample with replacement 32 times, recording each mean.
results = []
for nrepeat in range(32):
    sample = resample(sample_20)
    results.append(sample.mean())
results = pd.Series(results)

# The 5th and 95th percentiles of the bootstrap means bound the 90% interval.
confidence_interval = list(results.quantile([0.05, 0.95]))
ax = results.plot.hist(bins=32, figsize=(4, 3))

# Horizontal bar spanning the interval. It is drawn at y=17 so that it stays
# inside the y-range fixed by set_ylim(0, 30) below and just under the tops
# of the vertical end markers.
ax.plot(confidence_interval, [17, 17], color='black')
for bound in confidence_interval:
    # Vertical marker and numeric label at each end of the interval.
    ax.plot([bound, bound], [0, 20], color='black')
    ax.text(bound, 25, f'{bound:.0f}',
            horizontalalignment='center', verticalalignment='center')
ax.text(sum(confidence_interval) / 2, 25, '90% interval',
        horizontalalignment='center', verticalalignment='center')

# Dashed line and label marking the mean of the bootstrap means.
meanjumlah = results.mean()
ax.plot([meanjumlah, meanjumlah], [0, 20], color='black', linestyle='--')
ax.text(meanjumlah, 10, f'Mean: {meanjumlah:.0f}',
        bbox=dict(facecolor='white', edgecolor='white', alpha=0.5),
        horizontalalignment='center', verticalalignment='center')
ax.set_ylim(0, 30)
ax.set_ylabel('Counts')

plt.tight_layout()
plt.show()

# Normal Distribution

In [None]:
# QQ-plot of 20 draws from a standard normal distribution against the
# theoretical normal quantiles.
norm_sample = stats.norm.rvs(size=20)

fig, ax = plt.subplots(figsize=(4, 4))
stats.probplot(x=norm_sample, plot=ax)

plt.tight_layout()
plt.show()

In [None]:
# QQ-plot of the differences between consecutive log values of 'jumlah'.
# NOTE(review): the differences follow the row order of the CSV; confirm
# that this order is meaningful for the data.
proyeksi_px = pd.read_csv(TUGAS_CSV)

jumlah = proyeksi_px.jumlah
jumlah = jumlah[jumlah > 0]  # the log is only taken of strictly positive values
jumlah = np.diff(np.log(jumlah))

fig, ax = plt.subplots(figsize=(4, 4))
stats.probplot(x=jumlah, plot=ax)

plt.tight_layout()
plt.show()

# Chapter 3 : Statistical Experiments and Significance Testing