# Title `{To be changed}`

## Setup and import libraries

In [None]:
# Automatically reload imported modules before code is executed, so that edits
# to the imported source files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Make the directory two levels above the working directory importable
# (presumably the project root that holds the `src` package - TODO confirm)
import sys
sys.path.append('../..')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.helpers import *

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
# Change design of plots
sns.set(style="whitegrid")

# Change sizes and resolution of plots
# (figure size is given in inches; 'retina' renders inline figures at a
# higher resolution)
plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

# Hide warnings
# NOTE(review): this filter has no category or module restriction, so it
# silences every warning for the rest of the session, including deprecation
# warnings raised by the libraries used in this notebook
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [None]:
# Read the data set into a DataFrame
# NOTE(review): 'example/data.csv' looks like a template placeholder - point it
# at the actual data file
df = pd.read_csv('example/data.csv')

## General descriptive analysis

Let's check the shape of the data – the number of rows and the number of attributes:

In [None]:
# Tuple of (number of rows, number of columns)
df.shape

Overview of the data:

In [None]:
# First rows of the data set (head() returns 5 rows by default)
df.head()

### Datatypes

**Note:** Be careful: attributes that contain only NaN values are given the `float64` type by default.

In [None]:
# Data type inferred by pandas for each attribute
df.dtypes

### Basic characteristics

In [None]:
# Summary statistics with the default column selection (the numerical
# attributes, when the data set contains any)
df.describe()

In [None]:
# Summary statistics of the non-numerical attributes.
# describe() raises a ValueError when the exclusion leaves no columns to
# summarise, so the call is skipped (and nothing is displayed) for a data set
# that contains only numerical attributes.
(
    df.describe(exclude=[np.number])
    if not df.select_dtypes(exclude=[np.number]).columns.empty
    else None
)

### One-value columns

Which attributes contain only one value?

In [None]:
# Helper that is not defined in this notebook - presumably provided by the
# wildcard import from src.helpers (TODO confirm); it receives the whole frame
one_value_attributes_analysis(df)

### Missing values

Analysis of missing values in attributes:

In [None]:
# Helper that is not defined in this notebook - presumably provided by the
# wildcard import from src.helpers (TODO confirm); it receives the whole frame
missing_values_analysis(df)

### Duplicates

Are there any duplicates?

In [None]:
# True when at least one row repeats an earlier row in every column
# (duplicated() compares all columns by default)
df.duplicated().any()

## Attributes analysis

Analysis of all attributes:

In [None]:
# Attributes to leave out of the analysis entirely (e.g. id)
skip_attributes = [
]

# Attributes holding text values (e.g. content of article); skipped
# attributes are dropped from the list
textual_attributes = [
]
textual_attributes = [
    name for name in textual_attributes if name not in skip_attributes
]

# Numerical attributes: columns with a numeric dtype that are neither textual
# nor skipped
numerical_attributes = [
    column
    for column in df.select_dtypes([np.number]).columns
    if column not in textual_attributes + skip_attributes
]

# Categorical attributes: object / category / bool columns that are neither
# textual nor skipped
categorical_attributes = [
    column
    for column in df.select_dtypes(['object', 'category', 'bool']).columns
    if column not in textual_attributes + skip_attributes
]

# Attribute considered as "label"
# NOTE(review): the label column is not removed from the lists above, so it is
# also analysed as a regular attribute unless it is added to skip_attributes
label_column = 'example_label'

### Label attribute distribution

In [None]:
# Share of each label value as a pie chart (the trailing semicolon keeps the
# notebook from printing the returned Axes object)
df[label_column].value_counts().plot.pie(title='Distribution of predicted classes');

In [None]:
# Number of rows per label value as a bar chart (the trailing semicolon keeps
# the notebook from printing the returned Axes object)
df[label_column].value_counts().plot.bar(title='Distribution of predicted classes');

### Numerical attributes

Analysis of numerical attributes:

In [None]:
# Helper that is not defined in this notebook - presumably provided by the
# wildcard import from src.helpers (TODO confirm); it receives the frame, the
# label column name and the list of numerical attribute names
analyse_numerical_attributes(df, label_column, numerical_attributes)

### Categorical attributes

Analysis of categorical attributes:

In [None]:
# Helper that is not defined in this notebook - presumably provided by the
# wildcard import from src.helpers (TODO confirm); it receives the frame, the
# label column name and the list of categorical attribute names
analyse_categorical_attributes(df, label_column, categorical_attributes)

### Textual attributes

Some parts of the analysis preprocess the text first. In that case, the following operations are performed:
* removing special characters (only letters are preserved),
* removing tokens shorter than 3 characters,
* removing tokens that are English stop words as defined by the NLTK library,
* removing accent marks from tokens.

Analysis of textual attributes:

In [None]:
# Helper that is not defined in this notebook - presumably provided by the
# wildcard import from src.helpers (TODO confirm); it receives the frame and
# the list of textual attribute names (no label column is passed here)
analyse_textual_attributes(df, textual_attributes)

## Pairwise analysis

Pairwise analysis of the numerical attributes:

### Pair analysis

In [None]:
# A pair plot is only drawn when there are at least two numerical attributes;
# points are coloured by the label column
if len(numerical_attributes) > 1:
    sns.pairplot(df, vars=numerical_attributes, hue=label_column);

### Correlations

Correlation matrix:

In [None]:
# Correlations are only checked when there are at least two numerical
# attributes
if len(numerical_attributes) > 1:
    check_correlations(df, numerical_attributes)