In [None]:
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
from math import ceil

In [None]:
# Optional: uncomment the two lines below to display every row and column without truncation
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [None]:
# Read the donors dataset from disk and preview its first rows
data = pd.read_csv(filepath_or_buffer='donors.csv')
data.head(n=5)

In [None]:
# 'Unnamed: 0' only duplicates the row index, so it is dropped.
# errors='ignore' plus reassignment (instead of inplace=True) keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.head()

In [None]:
# Size of the dataset as (number of rows, number of columns)
(len(data.index), len(data.columns))

In [None]:
# Data type of each column of `data`, displayed as a column-name -> dtype Series
data.dtypes

In [None]:
# Descriptive statistics for every column (numeric and non-numeric alike),
# transposed so that each variable occupies one row
data.describe(include='all').transpose()

In [None]:
# True when at least one row is an exact duplicate of an earlier row
bool(data.duplicated().any())

In [None]:
# Number of missing (NaN) entries in each column
data.isnull().sum(axis=0)

In [None]:
# Split the columns into non-metric and metric variables.
#   non_metric_vars: the "id" column plus every object-typed column
#   metric_vars:     every remaining column that is not a datetime (dates are removed)
# Both results stay DataFrames (column subsets of `data`).
is_non_metric = (data.columns == "id") | np.array(data.dtypes == "object")

# Real datetime dtype check; comparing a dtype against the string "datetime"
# never matches, so it cannot be used to filter out date columns.
is_datetime = np.array(
    [pd.api.types.is_datetime64_any_dtype(col_dtype) for col_dtype in data.dtypes],
    dtype=bool,
)

non_metric_vars = data.loc[:, is_non_metric]
# A metric column must be BOTH "not non-metric" AND "not a date". The previous
# OR made the condition true for every column.
metric_vars = data.loc[:, ~is_non_metric & ~is_datetime]

In [None]:
# Metric Variables Histograms
sb.set()

# Names of the numeric metric features to plot (the cell previously referenced
# an undefined name, `metric_var`). Restricting to numeric dtypes keeps
# non-numeric columns out of the histograms.
metric_features = metric_vars.select_dtypes(include="number").columns

# Create individual axes: two histograms per row (at least one row so that
# plt.subplots never receives a zero-sized grid)
n_hist_rows = max(1, ceil(len(metric_features) / 2))
fig, axes = plt.subplots(n_hist_rows, 2, figsize=(20, 900))
plt.subplots_adjust(hspace=0.18)

# Plot data; missing values are dropped so binning only sees real observations
for ax, feat in zip(axes.flatten(), metric_features):
    ax.hist(data[feat].dropna())
    ax.set_title(feat, y=-0.16)  # negative y places the title below the axes

# Hide axes left without a feature (e.g. the last one when the count is odd)
for ax in axes.flatten()[len(metric_features):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Non Metric Variables Count Plots
sb.set()

# Names of the non-metric features to plot (the cell previously referenced an
# undefined name, `non_metric_var`).
# NOTE(review): "id" is skipped on the assumption that it is a unique
# identifier, for which a bar per value is uninformative and very slow to
# draw - confirm and remove the drop if it should be plotted.
non_metric_features = non_metric_vars.columns.drop("id", errors="ignore")

# Prepare figure. Two plots per row: the previous ceil(n / 2) x 1 grid created
# only half the axes needed, so zip() silently skipped half of the features.
n_count_rows = max(1, ceil(len(non_metric_features) / 2))
fig, axes = plt.subplots(n_count_rows, 2, figsize=(20, 100))
plt.subplots_adjust(hspace=0.18)

# Plot data
for ax, feat in zip(axes.flatten(), non_metric_features):
    sb.countplot(x=data[feat], ax=ax)

# Hide axes left without a feature (e.g. the last one when the count is odd)
for ax in axes.flatten()[len(non_metric_features):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Metric Variables Boxplots
sb.set(style="whitegrid")

# Numeric metric features, one boxplot each
metric_features = metric_vars.select_dtypes(include="number").columns
n_features = len(metric_features)

# Prepare figure layout: one row per FEATURE. len(metric_vars) on a DataFrame
# is the number of rows of data, not the number of columns. squeeze=False keeps
# `axes` a 2-D array even when there is a single feature.
fig, axes = plt.subplots(
    max(1, n_features), 1,
    figsize=(15, max(8, 1.5 * n_features)),  # grow the height with the number of plots
    squeeze=False,
)

# Draw the boxplots straight from the wide-format frame; `data` has no
# long-format "variable"/"value" columns to filter on.
for ax, feat in zip(axes.flatten(), metric_features):
    sb.boxplot(x=data[feat], ax=ax)
    ax.set_xlabel(feat)  # label each plot with its feature name
    ax.set_ylabel("")

# Finalize the plot
plt.suptitle("Metric variables' box plots", fontsize=25)
sb.despine(bottom=True)
plt.show()

In [None]:
# Metric Variables Correlation Matrix (heatmap)
sb.set(style="white")

# Compute the correlation matrix of the numeric metric variables only;
# non-numeric columns would make DataFrame.corr() raise on recent pandas.
corr = metric_vars.select_dtypes(include="number").corr()

# Boolean mask that is True on the upper triangle (diagonal included) so each
# pair is drawn only once. The builtin `bool` replaces the `np.bool` alias,
# which was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(12, 8))

# Diverging colormap between two HUSL hues (returned as a matplotlib colormap)
cmap = sb.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sb.heatmap(corr, mask=mask, cmap=cmap, center=0, square=True, linewidths=.5, ax=ax)

# Layout
plt.subplots_adjust(top=0.95)
plt.suptitle("Correlation matrix", fontsize=25)
plt.yticks(rotation=0)

# Show the full first and last rows of cells. Setting the limits explicitly to
# the heatmap extent (inverted y axis: bottom = n, top = 0) covers matplotlib
# versions that cut those rows in half, without adding an extra margin on
# versions that do not.
ax.set_ylim(len(corr), 0)
plt.show()

In [None]:
# Metric Variables Correlation Matrix
def color_red_or_green(val):
    """Return the CSS background style used to highlight one correlation value.

    Parameters
    ----------
    val : number
        A single cell value of the correlation table.

    Returns
    -------
    str
        'background-color: green' when val > 0.7,
        'background-color: red' when val < -0.7,
        and an empty string (no styling) otherwise.
    """
    if val > 0.7:
        return 'background-color: green'
    if val < -0.7:
        return 'background-color: red'
    return ''

# Correlation table of the numeric metric variables, with strong correlations
# (above 0.7 in green, below -0.7 in red) highlighted by color_red_or_green.
# The cell previously indexed `data` with an undefined name, `metric_var`.
correlations = metric_vars.select_dtypes(include="number").corr()

# Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever this
# pandas version provides so the cell runs on both old and new releases.
corr_styler = correlations.style
apply_cellwise = getattr(corr_styler, "map", None) or corr_styler.applymap
apply_cellwise(color_red_or_green)