In [None]:
!pip install -r requirements.txt

In [None]:
# Import Libraries
import os

import pandas as pd
from quickda.explore_data import *
from quickda.clean_data import *
from quickda.explore_numeric import *
from quickda.explore_categoric import *
from quickda.explore_numeric_categoric import *
from quickda.explore_time_series import *

# Data Exploration

## 1. Standardize Column Names

In [None]:
# Load the loans CSV (path is relative to the notebook's working directory)
# and pass it through quickda's column-name standardisation.
filepath = os.path.join('data', 'loans.csv')
df = pd.read_csv(filepath)
df = standardize_column_names(data=df)
# Preview only the first rows; enough to inspect the resulting column names
# without rendering the whole frame.
df.head()

## 2. Generate Data Profile Report

In [None]:
# Produce the profile report for the loaded frame; the dataset is flagged as
# not large via `is_large_dataset=False`.
generate_data_profile_report(
    data=df,
    report_name="Loans Dataset Report",
    is_large_dataset=False,
)

## 3. Summarize Data

In [None]:
# Keep the summary in `data_summary`; the bare name on the last line renders
# it as the cell output.
data_summary = summarize_data(
    data=df,
)
data_summary

# Data Cleaning

## 1. Drop Features

In [None]:
# Drop the listed columns, rebinding `df` to the result, then show the new
# (rows, columns) shape.
df = drop_features(
    data=df,
    columns_to_drop=["sub_grade"],
)
df.shape

## 2. Modify Feature Data Types

In [None]:
# Cast columns in three passes: datetime, numeric, then category.
# Each call rebinds `df` to the helper's return value.
df = convert_features_dtype_to_datetime(
    data=df,
    datetime_columns=['dt_applied'],
)

df = convert_features_dtype_to_numeric(
    data=df,
    numeric_columns=[
        'annual_inc',
        'funded_amnt_inv',
        'loan_amnt',
        'revol_bal',
    ],
)

# Keys are column names; only 'grade' carries True.
# NOTE(review): the boolean presumably marks an ordered categorical -- confirm
# against the quickda documentation.
df = convert_features_dtype_to_category(
    data=df,
    dic={
        'add_state': False,
        'delinq_2yrs': False,
        'gender': False,
        'grade': True,
        'home_ownership': False,
        'inq_last_6mths': False,
        'loan_writeoff': False,
        'pub_rec': False,
        'purpose': False,
        'term': False,
        'verification_status': False,
    },
)

# Show the dtype of every column after the conversions.
df.dtypes

## 3. Handle Duplicate Data

In [None]:
# Remove duplicate rows via quickda, then show the resulting shape.
df = drop_duplicate_rows(
    data=df,
)
df.shape

## 4. Handle Missing Values

In [None]:
# Visualise missing values in the frame before any filling is applied.
visualize_missing_data(
    data=df,
)

In [None]:
# Fill missing values in the listed columns, then redraw the missing-data
# visualisation to check the effect.
# NOTE(review): 'gender' was cast to category in the dtype cell -- confirm that
# fill_na_rows handles categorical columns. Also verify that 'name',
# 'email_id' and 'university' actually exist in the loans data.
df = fill_na_rows(
    data=df,
    na_columns=['name', 'email_id', 'gender', 'university'],
)
visualize_missing_data(
    data=df,
)

# EDA - Numerical Features

## 1. Plot Outliers - Boxplots

In [None]:
# Box plots of the numerical features (used in this section to look for outliers).
boxplot_of_numerical_features(
    data=df,
)

## 2. Plot Distributions - Histograms

In [None]:
# Histograms of the numerical features, using 10 bins.
histogram_of_numerical_features(
    data=df,
    num_of_bins=10,
)

## 3. Plot Correlations - Heatmap

In [None]:
# Correlations between the numerical features (shown as a heatmap per the section heading).
get_correlation_between_numerical_features(
    data=df,
)

# EDA - Categorical Features

## 1. Plot Frequency - Bar Chart

In [None]:
# Bar chart for the single categorical column 'purpose'.
barchart_of_categorical_features(
    data=df,
    cat_column='purpose',
)

In [None]:
# Same bar chart of 'purpose', with 'gender' supplied as the second categorical column.
barchart_of_categorical_features(
    data=df,
    cat_column='purpose',
    cat_column_2='gender',
)

## 2. Summarize Frequency Table

In [None]:
# Frequency summary for the single categorical column 'purpose'.
get_summary_of_categorical_feature(
    data=df,
    cat_column='purpose',
)

In [None]:
# Frequency summary of 'purpose' with 'gender' as the second categorical column.
get_summary_of_categorical_feature(
    data=df,
    cat_column='purpose',
    cat_column_2='gender',
)

In [None]:
# Frequency summary with the two columns swapped: 'gender' first, 'purpose' second.
# The frame is passed as `data=` to match the other calls to this helper in the notebook.
get_summary_of_categorical_feature(
    data=df,
    cat_column='gender',
    cat_column_2='purpose',
)

# EDA - Numerical/Categorical Features

## 1. Plot Predictive Power Score (PPS) - Heatmap

In [None]:
# Predictive Power Score (PPS) across the frame's features, per the section heading.
find_predictive_power_score(
    data=df,
)

## 2. Plot Relationships - Scatterplot

In [None]:
# Scatterplot with 'installment' on x and 'loan_amnt' on y; 'grade' is passed
# as the categorical column.
scatterplot_between_categorical_features(
    data=df,
    x_num_column='installment',
    y_num_column='loan_amnt',
    cat_column='grade',
)

## 3. Plot Categorical Feature across Numerical Feature - Violin Plots

In [None]:
# Violin plot of the numerical column 'loan_amnt' across the categorical column 'grade'.
violinplot_of_categorical_with_numerical_feature(
    data=df,
    cat_column='grade',
    num_column='loan_amnt',
)

## 4. Pivot Views

In [None]:
# Pivot view: 'loan_amnt' aggregated with the median over 'grade' and 'gender'.
# No second set of categorical columns is supplied (cat_columns_2=None).
pivot_data(
    df,
    cat_columns_1=['grade', 'gender'],
    cat_columns_2=None,
    agg_num_columns=['loan_amnt'],
    agg_method='median',
)

# EDA - Time Series Data

In [None]:
# Time-series plot of 'annual_inc' against the 'dt_applied' datetime column.
plot_time_series_distribution(
    data=df,
    datetime_column='dt_applied',
    target_column='annual_inc',
)