# 2. Data Analysis

In [None]:
# Exploratory analysis of the news dataset read from Data/News_dataset.csv:
#   - share of articles per Category
#   - distribution of article length (characters in Content), overall and per Category
# The frame, including the helper columns added along the way, is pickled in the
# last cell.

import pandas as pd
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
# White grid background for all seaborn/matplotlib figures in this notebook.
sns.set_style("whitegrid")
import altair as alt
# Select Altair's "notebook" renderer.
# NOTE(review): which renderer names exist depends on the installed Altair
# version - confirm this one is still available in the target environment.
alt.renderers.enable("notebook")
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the plotting libraries.
warnings.filterwarnings("ignore")

In [None]:
# Read the semicolon-delimited news dataset and preview its first rows.
df_path = 'Data/News_dataset.csv'
df = pd.read_csv(
    df_path,
    sep=';',
)
df.head()

In [None]:
# Helper column of ones: counting it per category yields the number of
# articles in each category. The column stays on `df` afterwards.
df['id'] = 1
df2 = df.groupby('Category')['id'].count().reset_index()

# Bar layer: one bar per category, height = that category's share of all articles.
bars = (
    alt.Chart(df2)
    .mark_bar(size=50)
    .encode(
        x=alt.X('Category'),
        y=alt.Y(
            'PercentOfTotal:Q',
            axis=alt.Axis(format='.0%', title='% of Articles'),
        ),
        color='Category',
    )
    # Unbounded window frame, so every row carries the grand total of articles.
    .transform_window(TotalArticles='sum(id)', frame=[None, None])
    .transform_calculate(PercentOfTotal="datum.id / datum.TotalArticles")
)

# Label layer: reuse the bar encoding and print the percentage just above each bar.
text = bars.mark_text(align='center', baseline='bottom').encode(
    text=alt.Text('PercentOfTotal:Q', format='.1%')
)

# The layered chart is the last expression, so it is the cell's output.
(bars + text).interactive().properties(
    title="% of articles in each category",
    width=700,
    height=300,
)

**Catatan:** Grafik di atas menunjukkan bahwa proporsi jumlah artikel pada setiap kategori dapat dikatakan seimbang ("balanced").

In [None]:
# Article length = number of characters in the Content column.
df['News_length'] = df['Content'].str.len()

# Distribution of article lengths over the whole dataset.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; the
# deprecation warning is hidden by the global warnings filter.
plt.figure(figsize=(12.8, 6))
sns.distplot(df['News_length'])
plt.title('Distribusi Panjang Berita');

**Catatan:** Distribusi panjang konten berita terlihat "skewed": ada sebagian kecil artikel yang panjangnya jauh lebih besar dibandingkan rata-rata panjang artikel.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of article length.
df.loc[:, 'News_length'].describe()

In [None]:
# Total number of articles in the dataset.
df.shape[0]

In [None]:
# Keep only the articles strictly shorter than the 95th percentile of length,
# then report how many remain.
quantile_95 = df['News_length'].quantile(0.95)
df_95 = df.loc[df['News_length'] < quantile_95]
df_95.shape[0]

In [None]:
# Same length distribution, restricted to the articles below the 95th percentile.
plt.figure(figsize=(12.8, 6))
sns.distplot(df_95['News_length'])
plt.title('Distribusi Panjang Berita');

In [None]:
# Articles longer than 10,000 characters, and how many there are.
df_more10k = df.loc[df['News_length'] > 10000]
df_more10k.shape[0]

In [None]:
# Show the full text of the first article longer than 10,000 characters.
df_more10k['Content'].iat[0]

In [None]:
# Article length per category, all articles included.
plt.figure(figsize=(12.8, 6))
sns.boxplot(
    x='Category',
    y='News_length',
    data=df,
    width=0.5,
);

In [None]:
# Article length per category, restricted to articles below the 95th percentile.
plt.figure(figsize=(12.8, 6))
sns.boxplot(
    x='Category',
    y='News_length',
    data=df_95,
);

**Catatan:** Walaupun grafik dari df_95 terlihat lebih baik tanpa outlier, kali ini kita akan menggunakan semua data untuk menguji sistem yang akan kita buat.

In [None]:
# Persist the full frame (not the trimmed df_95), including the columns added
# in this notebook, as a pickle file.
with open('Data/News_dataset.pickle', 'wb') as fh:
    pickle.dump(df, fh)