In [None]:
# Import libraries
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # graph
import seaborn as sns # advanced graph
import math

In [None]:
# Load the two CSV files of the Kaggle dataset
ape = pd.read_csv('/kaggle/input/financial-data-of-french-compagnies/ape_fusion.csv')
# 'Unnamed: 0' is discarded right after loading
# (presumably an index column that was saved with the CSV — TODO confirm)
data = pd.read_csv('/kaggle/input/financial-data-of-french-compagnies/data_kaggle.csv').drop(columns='Unnamed: 0')

## Exploratory Data Analysis of data
Goal :
     Understand our data as well as possible
     
Shape Analysis:
*      rows and columns: (100 000, 88)
*      types of variables: qualitative: 12, quantitative: 76
*      The values span several orders of magnitude, so we need to use the log function for the graphs.
*      Analysis of missing values: Lots of missing values. Some columns are practically empty.

Background Analysis:
*      Some family links are present in the columns, it would be necessary to check that these links are correct.

## Shape Analysis

In [None]:
# Missing values
print('data.shape: ', data.shape)  # number of rows and columns
print('\ndata.dtypes.value_counts():\n', data.dtypes.value_counts())

# Per-column information: dtype name and share of missing values.
# NOTE(review): '% mv' holds a fraction in [0, 1], not a percentage, despite
# its name and the plot labels below — confirm which one is intended.
col_info = pd.DataFrame(
    {
        'type': [str(dtype) for dtype in data.dtypes],
        '% mv': data.isna().sum(axis=0) / len(data),
    },
    index=data.columns,
)

# Histogram of the missing values, stacked by column type
ax = sns.histplot(data=col_info, x='% mv', hue='type', multiple="stack")
ax.set_title('% of missing value per column')
ax.set_ylabel('Number of column')
ax.set_xlabel('%')
plt.show()

The vast majority of columns are numeric columns. The filling rate of the columns is very variable. Some columns are completely empty!

In [None]:
# Visualization of the missing values on a random sample of rows
dict_colors = {'float64':'blue', 'int64':'orange', 'object': 'green', 'bool':'red'}  # label color per dtype name

# Sample at most 100 rows (so small frames do not raise); the fixed seed makes
# the figure identical on every "Restart & Run All".
sample_rows = data.sample(min(100, len(data)), random_state=0)

fig = plt.figure(figsize=(25,5), dpi=150)
ax = sns.heatmap(sample_rows.isna(), cbar=False)
plt.title('Display of the values present in black', fontsize=20)

# Replace the default x tick labels with column names colored by dtype
ax.xaxis.set_visible(False)
text_kwargs = dict(rotation='vertical', fontsize=14, va='top', ha='center')
# The labels are anchored just below the heatmap: one y unit per sampled row,
# so the bottom edge sits at y == number of rows drawn.
offset = len(sample_rows)
# Heatmap cell i spans [i, i + 1] on the x-axis, so its center is i + 0.5.
# Deriving the position from the column index (instead of reading the tick
# locations) keeps every label aligned even when seaborn thins out the ticks.
for idx, (col, dtype) in enumerate(data.dtypes.items()):
    # dtypes missing from dict_colors yield None, i.e. matplotlib's default text color
    ax.text(idx + 0.5, offset, col, **text_kwargs, color=dict_colors.get(dtype.name))

## Background Analysis:

In [None]:
# For every qualitative (object-typed) column: number of distinct values and a preview of the first five
for col, series in data.select_dtypes(include=['object']).items():
    preview = series.unique()[:5]
    print(col, 'nunique=', series.nunique(), '\n', preview, '\n')

In [None]:
# Summary statistics of `data` as computed by pandas' describe()
data.describe()

In [None]:
# Raw (non-log) histogram of a single column, titled with the column name
charges_col = 'Total des charges d’exploitation'
data[charges_col].hist()
plt.title(charges_col)

The raw histogram of a column is not informative, because the range of the data is too wide. The same holds for the other continuous variables, whose values also span very large ranges.

In [None]:
# List the columns whose dtype is 'object'; change the compared dtype name
# (e.g. 'float64', 'int64', 'bool') to list the columns of another type
data.dtypes[data.dtypes == 'object']

In [None]:
# Number of missing values per column (a count, not a percentage), sorted in ascending order
data.isna().sum(axis=0).sort_values()

In [None]:
# Helper used to visualize data whose values span many orders of magnitude
def obtention_log10(val):
    """Return the signed base-10 logarithm of a number.

    Examples: 100 000 becomes 5 and -100 becomes -2.

    Parameters
    ----------
    val : any
        Value to transform. Python and NumPy floats/integers are transformed;
        anything else (strings, None, ...) is returned unchanged.

    Returns
    -------
    float or the original value
        * NaN for a NaN input,
        * 0.0 when |val| < 1 (the logarithm would be negative or undefined),
        * sign(val) * log10(|val|) otherwise,
        * ``val`` itself when it is not a number.
    """
    # NumPy scalars are listed explicitly: np.int64 is not a subclass of int,
    # so it would otherwise be returned untransformed.
    if not isinstance(val, (float, int, np.floating, np.integer)):
        return val
    # NaN never compares equal to anything (not even itself), so it has to be
    # detected with isnan rather than with `== np.nan`.
    if math.isnan(val):
        return np.nan
    signe, nb = np.sign(val), np.absolute(val)
    if nb < 1:
        return 0.0
    return math.log10(nb) * signe

# Apply the transform element-wise to columns 1..68 only; column 0 and the
# columns from 69 onwards are passed through untouched.
# NOTE(review): the `69:-1` slice leaves out the last column of `data` —
# confirm that dropping it is intended.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
first_col = data.iloc[:, 0]
log_block = data.iloc[:, 1:69].applymap(obtention_log10)
tail_block = data.iloc[:, 69:-1]
data_log10 = pd.concat([first_col, log_block, tail_block], axis=1)

In [None]:
# One histogram per float column of the log-scaled frame, laid out on a 20 x 5 grid
float_cols = data_log10.select_dtypes(include=float)
float_cols.hist(bins=30, figsize=(20, 60), layout=(20, 5))

We can observe the wealth of information at our disposal. The log-transformed data often look normally distributed, which is a good sign. We can also observe bimodal (two-peaked) normal-like distributions.