In [8]:
import json
import math
import pandas as pd
import plotly.offline as offline
import plotly.express as px
from scipy.stats import pearsonr

In [3]:
# Load the dataset into `df`; every later cell reads from this frame.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv("data/request.csv")

## Correlação

In [91]:
# Pearson product-moment correlation between each feature column and the
# target column. The result maps column name -> r (the first element of
# pearsonr's return value); the target column itself is not included.
target_col_name = 'desempenho'

# Looked up once, outside the loop. A missing target column raises KeyError
# here instead of silently yielding an empty result.
target_values = df[target_col_name]

correlation_items = {}
for col in df:
    if col == target_col_name:
        continue
    try:
        correlation_items[col] = pearsonr(df[col], target_values)[0]
    except (TypeError, ValueError):
        # pearsonr could not process this column (for example, it holds
        # non-numeric values). Leave it out of the mapping and carry on;
        # any other kind of error is allowed to propagate.
        continue

In [92]:
# Show the column -> Pearson r mapping built in the previous cell.
correlation_items

{'curso': nan,
 'data_de_final': 0.12417696034464978,
 'data_de_início': 0.12403743165036897,
 'desempenho_binario': -0.8847490349149028,
 'forum01': 0.5304222780933984,
 'forum02': 0.6016672201021609,
 'forum03': 0.6034101807560337,
 'forum04': 0.6150771613229821,
 'id_da_disciplina': 0.15471321438958802,
 'id_do_aluno': -0.000968838669736322,
 'media_forum': 0.8019442745597744,
 'media_provas': 0.907983251854633,
 'media_webquest': 0.8398966874763164,
 'período': 0.14258311368571044,
 'primeira_prova': 0.751755988959008,
 'prova01': 0.6986730510337389,
 'prova01_2chamada': 0.08811592637069353,
 'prova02': 0.6516536680348715,
 'prova02_2chamada': 0.11702809062118734,
 'segunda_prova': 0.6858266190960319,
 'semestre': 0.11162639899787571,
 'var01': 0.311584911064265,
 'var02': 0.22902326843829865,
 'var03': 0.1010901975551252,
 'var04': 0.18316725122992972,
 'var05': 0.15789244603105687,
 'var06': 0.17773054180000158,
 'var07': 0.17534049335827562,
 'var08': 0.3418864340183274,
 'var09

## Análise descritiva dos dados

In [73]:
# Descriptive statistics as a plain nested dict: {column: {statistic: value}}.
# Going through JSON text and back yields only built-in Python types.
overview_items = json.loads(
    df.describe().to_json(force_ascii=False)
)

## Tipos

In [74]:
# Map every column name to the name of its dtype (a string).
type_items = {
    column_name: column_dtype.name
    for column_name, column_dtype in df.dtypes.items()
}

## Unique

In [75]:
# Number of distinct values per column, keyed by column name.
unique_items = {
    column: df[column].nunique()
    for column in df.columns
}

## Count Null

In [76]:
# Number of missing cells per column, keyed by column name.
# The former `.apply(lambda x: x)` step was an identity pass and is dropped.
null_items = df.isna().sum().to_dict()

In [93]:
# Build one summary record per column by merging the per-column lookups
# computed in earlier cells: overview_items (descriptive statistics),
# null_items (missing counts), unique_items (distinct counts) and
# correlation_items (Pearson r against the target column).

# Statistics copied into each record, in the order they appear in the output.
stat_names = ('mean', 'std', 'min', '25%', '50%', '75%', 'max')

data = []

for column in df.columns:
    # Columns with an entry in overview_items are labelled 'Discreto';
    # all others are labelled 'Categórico' and get None for every statistic.
    if column in overview_items:
        descriptive = overview_items[column]
        type_column = 'Discreto'
    else:
        descriptive = {}
        type_column = 'Categórico'

    # Correlation rounded to two decimals. It stays None when the column has
    # no correlation entry or when the stored value is NaN.
    corr = None
    if column in correlation_items:
        raw_corr = float(correlation_items[column])
        if not math.isnan(raw_corr):
            corr = round(raw_corr, 2)

    item = {
        'indicator': column,
        'type': type_column,
        'missing': null_items[column],
        'unique': unique_items[column],
    }
    # .get() returns None for a statistic that is absent instead of raising.
    for stat in stat_names:
        item[stat] = descriptive.get(stat)
    item['corr'] = corr

    data.append(item)

In [85]:
# Show the list of per-column summary records.
data

[{'indicator': 'curso',
  'type': 'Categórico',
  'missing': 0,
  'unique': 1,
  'mean': None,
  'std': None,
  'min': None,
  '25%': None,
  '50%': None,
  '75%': None,
  'max': None,
  'corr': None},
 {'indicator': 'data_de_final',
  'type': 'Discreto',
  'missing': 0,
  'unique': 7,
  'mean': 1361812869.3824344,
  'std': 64104140.89989983,
  'min': 1277856000.0,
  '25%': 1309392000.0,
  '50%': 1388361600.0,
  '75%': 1420070399.0,
  'max': 1467331199.0,
  'corr': 0.12},
 {'indicator': 'data_de_início',
  'type': 'Discreto',
  'missing': 0,
  'unique': 7,
  'mean': 1346095180.0829875,
  'std': 64026776.30076964,
  'min': 1262304000.0,
  '25%': 1293840000.0,
  '50%': 1372636800.0,
  '75%': 1404172800.0,
  'max': 1451606400.0,
  'corr': 0.12},
 {'indicator': 'desempenho',
  'type': 'Discreto',
  'missing': 0,
  'unique': 499,
  'mean': 4.2434634336,
  'std': 2.9746792903,
  'min': 0.0,
  '25%': 1.0,
  '50%': 4.86,
  '75%': 6.65,
  'max': 10.5,
  'corr': None},
 {'indicator': 'desempenho