In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

# Have a Bird's-Eye View
* read the article https://www.visual-design.net/post/semi-automated-exploratory-data-analysis-process-in-python for a comprehensive explanation
* info() and describe()

In [None]:
# Location of the Medium articles CSV, relative to the notebook's working directory.
DATA_PATH = '../input/medium-articles-dataset/medium_data.csv'

# Load the dataset into the working DataFrame and preview its first five rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print a summary of the DataFrame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics for every column; include='all' covers non-numeric columns as well as numeric ones.
df.describe(include = 'all')

# Missing Values
* isnull().sum() generates a pandas Series object
* convert the Series into a DataFrame


In [None]:
# Number of missing values in each column (returned as a Series indexed by column name).
df.isna().sum()

In [None]:
# Boolean mask of missing cells, computed once and reused for both counts.
null_mask = df.isnull()

missing_count = null_mask.sum()    # missing values per column
value_count = null_mask.count()    # total number of values per column

# Share of missing values per column, in percent, rounded to two decimals.
missing_percentage = (missing_count / value_count * 100).round(2)

# Combine both Series into one summary table indexed by column name.
missing_df = pd.DataFrame(
    {
        'count': missing_count,
        'percentage': missing_percentage,
    }
)
print(missing_df)

In [None]:
# Bar chart of the percentage of missing values, one bar per column.
barchart = missing_df.plot.bar(y='percentage')

# Write each percentage as a text label at the top of its bar.
for position, pct in enumerate(missing_percentage):
    label = str(pct) + '%'
    barchart.text(position, pct, label)

# Feature Engineering & Feature Selection
* date -> extract year and month
* title -> extract length
* subtitle -> whether the article has a subtitle or not

In [None]:
# Title length in characters. The vectorised .str.len() yields NaN for a
# missing title instead of raising TypeError the way apply(len) does.
df['title_length'] = df['title'].str.len()

# Month number extracted from the date column, stored as a string so that it
# is not picked up as a numeric attribute.
df['month'] = pd.to_datetime(df['date']).dt.month.astype(str)

# Whether the article has a subtitle: 'Yes' when a subtitle is present,
# 'No' when it is missing (the original condition had the labels inverted).
df['with_subtitle'] = np.where(df['subtitle'].notnull(), 'Yes', 'No')

In [None]:
# Drop the columns that are not analysed further.
df = df.drop(columns=['id', 'subtitle', 'title', 'url', 'date', 'image', 'responses'])

# Split the remaining attributes by dtype: numeric columns first, then the
# non-numeric columns whose dtype is string-like.
num_list = [name for name in df if is_numeric_dtype(df[name])]
cat_list = [
    name
    for name in df
    if not is_numeric_dtype(df[name]) and is_string_dtype(df[name])
]

print(num_list)
print(cat_list)

# Univariate Analysis

* numeric variables: histogram
* categorical variables: bar chart
 
We can carry out some feature engineering before running this analysis.

In [None]:
# One figure per attribute, keyed and titled by the column name.
for col in df:
    series = df[col]
    plt.figure(col, figsize=(4.9, 4.9))
    plt.title(col)
    if is_numeric_dtype(series):
        # numeric attribute: histogram of its values
        series.plot.hist()
    elif is_string_dtype(series):
        # categorical attribute: bar chart of the 10 most frequent values only
        series.value_counts().head(10).plot.bar()

# Multivariate Analysis
* numeric variables: correlation and pairplot
* categorical & categorical: grouped bar chart
* numeric & categorical: box plot and pairplot with hue

In [None]:
# correlation matrix and heatmap
# Since pandas 2.0, DataFrame.corr() defaults to numeric_only=False and raises
# ValueError when the frame still contains string columns, so restrict the
# computation to the numeric columns explicitly.
correlation = df.select_dtypes(include='number').corr()
sns.heatmap(correlation, cmap = "GnBu", annot = True)

In [None]:
# Grid of pairwise plots over the DataFrame's variables, each facet 2.5 inches high.
sns.pairplot(data=df, height=2.5)

In [None]:
# Grouped bar charts: one figure for every ordered pair of distinct categorical attributes.
for primary_cat in cat_list:
    # The 10 most frequent values of the primary attribute; only these are drawn on the x axis.
    top_levels = df[primary_cat].value_counts().iloc[:10].index
    for secondary_cat in cat_list:
        if secondary_cat == primary_cat:
            # an attribute is never grouped by itself
            continue
        plt.figure(figsize=(15, 15))
        chart = sns.countplot(
            data=df,
            x=primary_cat,
            hue=secondary_cat,
            palette='GnBu',
            order=top_levels,
        )

In [None]:
# One pairplot per categorical attribute, using that attribute to colour the points.
for hue_cat in cat_list:
    sns.pairplot(data=df, hue=hue_cat)

In [None]:
# Box plots: one figure for every (categorical, numeric) attribute pair,
# with the categorical attribute on the x axis and the numeric one on the y axis.
for cat in cat_list:
    for num in num_list:
        plt.figure(figsize=(15, 15))
        sns.boxplot(data=df, x=cat, y=num, palette="GnBu")