In [26]:
### Importing Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import math


In [6]:
#reading our .csv file 
df = pd.read_csv('PortfolioData.csv')

In [8]:
#Looking at the types of our data 
df.dtypes

UniqueID                        int64
CurrentDate                    object
AccountActivity1                int64
AccountActivity2              float64
AccountDetail1                  int64
AccountDetail2                 object
AccountDetail3                  int64
AccountDetail4                  int64
AccountDetail5                 object
AccountDetail6                 object
AccountDetail7                  int64
AccountStatus1                 object
AccountStatus2                 object
PrevAccountDetail1             object
PrevAccountStatus1             object
PrevAccountStatus2             object
AccountActivity3              float64
AccountActivity4              float64
AccountActivity5              float64
AccountActivity6              float64
AccountActivity7              float64
Balance1                      float64
Balance2                      float64
Payment1                      float64
PrevBalance1                  float64
AccountDetail8                 object
HistoricalAc

In [10]:
#looking at the shape of our data 
df.shape 

(300000, 57)

In [12]:
#looking at the info of our data 
df.info

<bound method DataFrame.info of         UniqueID CurrentDate  AccountActivity1  AccountActivity2  \
0              1   11/1/2017                 0               0.0   
1              2   11/1/2017                 0               0.0   
2              3   11/1/2017                 0               0.0   
3              4   11/1/2017                 0               0.0   
4              5   11/1/2017                 0               0.0   
...          ...         ...               ...               ...   
299995    299996   11/1/2017                 0               0.0   
299996    299997   11/1/2017                 0               0.0   
299997    299998   11/1/2017                 0               0.0   
299998    299999   11/1/2017                 0               0.0   
299999    300000   11/1/2017                 0               0.0   

        AccountDetail1 AccountDetail2  AccountDetail3  AccountDetail4  \
0                    1       5/1/2016             750               0   
1    

In [14]:
#Finding the duplicates 
df.duplicated().sum()
#we have 0 duplicates yayyy!! :)

0

In [16]:
#Finding for missing Values in our data 
df.isnull().sum()

UniqueID                           0
CurrentDate                        0
AccountActivity1                   0
AccountActivity2                   0
AccountDetail1                     0
AccountDetail2                     0
AccountDetail3                     0
AccountDetail4                     0
AccountDetail5                 15574
AccountDetail6                142393
AccountDetail7                     0
AccountStatus1                200182
AccountStatus2                106697
PrevAccountDetail1            176709
PrevAccountStatus1            287437
PrevAccountStatus2             95016
AccountActivity3               14173
AccountActivity4               14173
AccountActivity5               14173
AccountActivity6               14173
AccountActivity7               14173
Balance1                       14173
Balance2                       14173
Payment1                       14173
PrevBalance1                   14173
AccountDetail8                  1468
HistoricalAccountActivity1      1149
H

In [18]:
for i in df.columns:
    print(i,df[i].unique())

UniqueID [     1      2      3 ... 299998 299999 300000]
CurrentDate ['11/1/2017' '6/1/2017' '9/1/2017' '2/1/2017' '10/1/2017' '3/1/2017'
 '7/1/2017' '12/1/2017' '1/1/2017' '8/1/2017' '5/1/2017' '4/1/2017']
AccountActivity1 [0 2 1]
AccountActivity2 [ 0.   75.   45.   ... 14.98 43.69 18.78]
AccountDetail1 [1 2 0 3 4 5]
AccountDetail2 ['5/1/2016' '4/1/2015' '8/1/2016' '7/1/2017' '7/1/2011' '8/1/2017'
 '5/1/2015' '9/1/2016' '5/1/2012' '11/1/2014' '1/1/2013' '10/1/2017'
 '4/1/2008' '5/1/2014' '1/1/2017' '10/1/2014' '5/1/2011' '6/1/2017'
 '8/1/2011' '10/1/2015' '2/1/2017' '6/1/2014' '6/1/2013' '7/1/2007'
 '5/1/2004' '4/1/2017' '4/1/2016' '7/1/2013' '6/1/2008' '9/1/2017'
 '7/1/2016' '2/1/2014' '2/1/2016' '12/1/2015' '3/1/2017' '3/1/2015'
 '9/1/2015' '11/1/2017' '5/1/2013' '4/1/2013' '7/1/2014' '7/1/2015'
 '11/1/2013' '11/1/2012' '10/1/2016' '8/1/2015' '11/1/1998' '12/1/2011'
 '12/1/2006' '6/1/2016' '4/1/2007' '8/1/2014' '12/1/2014' '1/1/2011'
 '8/1/2013' '12/1/2012' '11/1/2015' '1/1/2014' '9

In [20]:
# List of specific columns you want to check
columns = ['HistoricalAccountActivity7', 'HistoricalAccountStatus14', 'AccountDetail5','AccountDetail6', 'PrevAccountStatus1', 'PrevAccountStatus2','AccountStatus1'  ]

# Loop through the selected columns and print their unique values
for col in columns:
    print(f"Unique values in {col}: {df[col].unique()}")


Unique values in HistoricalAccountActivity7: [ 0. nan]
Unique values in HistoricalAccountStatus14: [ 0.  1. nan]
Unique values in AccountDetail5: [nan 'X']
Unique values in AccountDetail6: [nan 'X']
Unique values in PrevAccountStatus1: [nan 'C' 'E' 'A' 'I' 'Z' 'B' 'F']
Unique values in PrevAccountStatus2: [nan 'D' 'O' 'X' 'N']
Unique values in AccountStatus1: [nan 'O' 'N' 'D' 'X']


In [22]:
# Assuming your dataframe is named 'df'

# Checking the correlation between 'PrevBalance1' and 'Balance1'
correlation_value = df[['PrevBalance1', 'Balance1']].corr().iloc[0, 1]

# Print the correlation value
print(f"Correlation between 'PrevBalance1' and 'Balance1': {correlation_value}")



Correlation between 'PrevBalance1' and 'Balance1': 0.968929079376033


In [28]:
# Determine number of rows and columns for subplots
n_cols = 4  # Increase columns to reduce the size of individual plots
n_rows = math.ceil(len(columns_to_plot) * 2 / n_cols)

# Create a figure with subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 2.5))  # Adjusted figsize

# Flatten the axes array for easy indexing
axes = axes.flatten()

# Loop through columns and create plots
for i, col in enumerate(columns_to_plot):
    # Histogram
    axes[2 * i].hist(data[col].dropna(), bins=50)
    axes[2 * i].set_title(f'Histogram of {col}', fontsize=8)  # Reduce font size
    
    # Boxplot
    axes[2 * i + 1].boxplot(data[col].dropna(), vert=False)
    axes[2 * i + 1].set_title(f'Boxplot of {col}', fontsize=8)  # Reduce font size

    # Adjust tick parameters
    axes[2 * i].tick_params(axis='both', which='major', labelsize=7)
    axes[2 * i + 1].tick_params(axis='both', which='major', labelsize=7)

# Hide any unused subplots
for ax in axes[2 * len(columns_to_plot):]:
    ax.axis('off')

# Adjust layout for tighter spacing
plt.tight_layout()

# If necessary, reduce spacing further
fig.subplots_adjust(hspace=0.5, wspace=0.5)  # Adjust space between subplots

# Show the plot
plt.show()



NameError: name 'columns_to_plot' is not defined