In [19]:
import pandas as pd

# Visualización
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Preprocessing
from sklearn.preprocessing import OneHotEncoder

In [20]:
def remove_outliers_percentile(df: pd.DataFrame, quantile: float = 0.99,
                    columns: list = None) -> pd.DataFrame:
    """Drops the rows whose value exceeds a per-column upper quantile.

    The limit of every column is computed once, on the unfiltered data, and
    the filters are then applied one column after another. Progress is
    printed: the original row count, the limits, and the cumulative fraction
    of rows removed after each column.

    Args:
        df (pd.DataFrame): Data to filter. The caller's frame is not modified.
        quantile (float, optional): Quantile used as the upper limit. Defaults to 0.99.
        columns (list, optional): Columns to filter on. Defaults to None, which
            selects every numeric column of ``df``.

    Returns:
        pd.DataFrame: The rows that are within the limit in every selected column.
    """
    if columns is None:
        # A quantile only exists for numeric data; falling back to *all*
        # columns would fail as soon as the frame holds text columns.
        columns = df.select_dtypes(include='number').columns
    columns = list(columns)

    old_shape = df.shape[0]
    print(f"Original shape: {old_shape}")

    # Nothing to filter; also avoids dividing by zero in the report below.
    if old_shape == 0 or not columns:
        return df

    limits = df[columns].quantile(quantile, interpolation='higher')

    print(limits)
    for col in columns:
        # NaN <= limit evaluates to False, so rows with a missing value in
        # `col` are dropped together with the outliers.
        df = df[df[col] <= limits[col]]
        print(f"{col} removed {1-df.shape[0]/old_shape}")

    return df

In [21]:
def make_specs(rows: int, cols: int, n: int) -> list:
    """Builds the ``specs`` grid for ``make_subplots`` when the figures do not fill the grid.

    Cells are numbered row by row starting at 0. The first ``n`` cells get an
    empty dict (a default subplot); the remaining cells get ``None`` so that
    no axes are created there.

    Args:
        rows (int): Number of rows for the grid
        cols (int): Number of columns for the grid
        n (int): Total number of figures

    Returns:
        list: ``rows`` lists of ``cols`` entries each, every entry being ``{}`` or ``None``
    """
    # Strictly less than n: positions are 0-based, so position n would
    # already be the (n+1)-th figure.
    return [
        [{} if i * cols + j < n else None for j in range(cols)]
        for i in range(rows)
    ]

In [22]:
def hist_matrix(df : pd.DataFrame, columns : list = None, rows : int = None, cols : int = None) -> None:
    """Makes a matrix of histograms for each column of a pd.DataFrame

    Args:
        df (pd.DataFrame): Data to plot
        columns (list, optional): Columns to include in the matrix. Defaults to None, meaning every column of ``df``.
        rows (int, optional): Number of rows for the matrix. Defaults to None, meaning the integer part of the square root of the number of columns.
        cols (int, optional): Number of columns for the matrix. Defaults to None, meaning as many as needed to fit every histogram in ``rows`` rows. A value too small to fit them is enlarged.

    Raises:
        ValueError: If there is no column to plot or ``rows`` is smaller than 1.
    """
    # `is None` rather than `== None`: comparing a pd.Index with `==` yields
    # an element-wise array whose truth value is ambiguous.
    if columns is None:
        columns = df.columns
    columns = list(columns)

    n = len(columns)
    if n == 0:
        raise ValueError("hist_matrix needs at least one column to plot")

    if rows is None:
        rows = int(n**0.5)
    if rows < 1:
        raise ValueError("rows must be at least 1")

    if cols is None or rows*cols < n:
        # Integer ceiling of n / rows, free of floating-point rounding.
        cols = -(-n // rows)

    specs = make_specs(rows, cols, n)
    fig1 = make_subplots(rows = rows, cols = cols,
                        specs= specs, subplot_titles= [str(c) for c in columns])

    for i, col in enumerate(columns):
        # Row-major placement; plotly grid positions are 1-based.
        row = i//cols + 1
        colum = i%cols + 1
        fig1.add_trace(go.Histogram(x=df[col], name=col) , row=row, col=colum)

    fig1.update_layout(title_text='Histogramas', showlegend=False)
    fig1.show()

In [62]:
# Load ./data/anime_final.csv into `df`, the frame the rest of the notebook works on.
df = pd.read_csv('./data/anime_final.csv')

In [65]:
# Write `df`, without its index, to the same path it was read from.
# NOTE(review): this overwrites the input file in place, and the execution
# counts of these two cells (62, then 65) are out of notebook order — confirm
# the overwrite is intended before a Restart & Run All.
df.to_csv('./data/anime_final.csv', index=False)

In [25]:
df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members',
       'image_url', 'source', 'status', 'airing', 'aired_string', 'aired',
       'duration', 'classification', 'score', 'scored_by', 'rank',
       'popularity', 'favorites', 'related', 'opening_theme', 'ending_theme',
       'Synopsis'],
      dtype='object')

In [26]:
df.dtypes

anime_id            int64
name               object
genre              object
type               object
episodes           object
rating            float64
members             int64
image_url          object
source             object
status             object
airing               bool
aired_string       object
aired              object
duration           object
classification     object
score             float64
scored_by           int64
rank              float64
popularity          int64
favorites           int64
related            object
opening_theme      object
ending_theme       object
Synopsis           object
dtype: object

In [27]:
df['score'].dtype == 'float64'

True

In [28]:
# Columns stored as float64 or int64 are the ones treated as continuous.
cont_vars = [
    name
    for name, dtype in df.dtypes.items()
    if dtype in ('float64', 'int64')
]

In [29]:
cont_vars

['anime_id',
 'rating',
 'members',
 'score',
 'scored_by',
 'rank',
 'popularity',
 'favorites']

In [30]:
hist_matrix(df, cont_vars)

In [31]:
# Every remaining column, in its original order, is treated as discrete.
disc_vars = df.columns[~df.columns.isin(cont_vars)].tolist()

In [32]:
df[disc_vars].nunique()

name              10301
genre              2981
type                  6
episodes            178
image_url         10254
source               16
status                2
airing                2
aired_string       7493
aired              7228
duration            272
classification        6
related            7007
opening_theme      3747
ending_theme       4647
Synopsis           9893
dtype: int64

In [33]:
# Discrete columns with fewer than 20 distinct values are kept as categorical.
cat_vars = [
    name
    for name, n_unique in df[disc_vars].nunique().items()
    if n_unique < 20
]

In [34]:
cat_vars

['type', 'source', 'status', 'airing', 'classification']

In [38]:
# Bind a second name to the same DataFrame object: no copy is made, so an
# in-place change made through either name is visible through both.
# NOTE(review): use df.copy() if an independent frame was intended — confirm.
df2 = df

In [67]:
def others(df1, col, threshold=0.05, label='Otros'):
    """Groups the infrequent categories of a column under a single label.

    A category is infrequent when its share of the non-missing values of
    ``col`` is below ``threshold``. When exactly one category is infrequent
    the data is returned unchanged. Missing values are left untouched.

    Args:
        df1 (pd.DataFrame): Data holding the column. The caller's frame is not modified.
        col (str): Name of the column to regroup.
        threshold (float, optional): Minimum share a category needs to keep its own name. Defaults to 0.05.
        label (str, optional): Name given to the grouped categories. Defaults to 'Otros'.

    Returns:
        pd.DataFrame: Copy of ``df1`` with the infrequent categories of ``col`` replaced by ``label``.
    """
    df1 = df1.copy()

    # value_counts(True) gives relative frequencies and ignores missing values.
    shares = df1[col].value_counts(True)
    rare = shares.index[shares < threshold]
    # Zero infrequent categories: nothing to do. Exactly one: kept as is.
    if len(rare) <= 1:
        return df1

    # A vectorised mask instead of a per-row lookup: missing values are never
    # in `rare`, so they pass through instead of raising KeyError.
    is_rare = df1[col].isin(rare)
    # Cast to object first so the label fits whatever the column's dtype is.
    df1[col] = df1[col].astype(object).where(~is_rare, label)
    return df1

In [68]:
# One pie chart per categorical variable, with infrequent categories grouped
# by `others`. No `values` column is passed so each sector is sized by the
# number of rows in that category; summing `anime_id` would weight the
# sectors by an identifier, which carries no meaning.
for col in cat_vars:
    display(px.pie(others(df, col), names=col, title=col))