In [18]:
import os
import pandas as pd
from pathlib import Path

# Directory the helper functions in this notebook read CSV files from.
# The path is relative, so it is resolved against the current working
# directory of the kernel.
PATH_TO_DATA = '../../Source/data/'

## Main functions to manipulate a dataframe

### Helpers for testing the data manipulation functions

In [47]:
def get_files_in_directory(directory: 'str | os.PathLike | None' = None) -> list:
    """
    Lists the CSV files found directly inside a data directory.

    Only regular files whose name ends with '.csv' are returned;
    sub-directories are skipped because they cannot be loaded as CSV.

    :param directory: directory to scan; defaults to PATH_TO_DATA
    :return a list of os.DirEntry objects (path-like; use ``.name`` for
        the bare file name), sorted by file name so the order is stable
    """
    if directory is None:
        directory = PATH_TO_DATA

    # The context manager closes the scandir iterator (and the OS handle
    # behind it) as soon as the listing has been read.
    with os.scandir(directory) as entries:
        csv_entries = [
            entry for entry in entries
            if entry.is_file() and entry.name.endswith('.csv')
        ]

    # os.scandir yields entries in arbitrary order; sort so that indexing
    # the result by position is reproducible between runs and platforms.
    return sorted(csv_entries, key=lambda entry: entry.name)

In [48]:
def create_dataframe(csv_file: 'str | os.PathLike',
                     data_dir: 'str | os.PathLike | None' = None) -> pd.DataFrame:
    """
    Creates a pandas dataframe from a CSV file.

    :param csv_file: either a file name relative to the data directory,
        or an os.DirEntry as produced by os.scandir (its ``path``
        already includes the directory that was scanned)
    :param data_dir: directory a plain file name is resolved against;
        defaults to PATH_TO_DATA
    :return the parsed contents of the file
    """
    if isinstance(csv_file, os.DirEntry):
        # A DirEntry's path already contains the directory it was found
        # in; joining it onto the data directory again would repeat that
        # prefix and only resolve correctly by coincidence.
        path_to_file = csv_file.path
    else:
        if data_dir is None:
            data_dir = PATH_TO_DATA
        path_to_file = os.path.join(data_dir, csv_file)

    return pd.read_csv(path_to_file)

In [49]:
def inspect_dataframe(dataframe: pd.DataFrame) -> None:
    """
    Prints a summary of the dataset followed by a preview of its data.

    Nothing is returned; both the ``DataFrame.info()`` report and the
    first rows (``DataFrame.head()``) are written to standard output.
    """
    # DataFrame.info() writes its report itself and returns None, so it
    # must not be wrapped in print() (that would emit a stray "None").
    dataframe.info()
    print()
    print(dataframe.head())

## Finding the longest value in each column of a given dataframe

In [11]:
def longest_value(dataframe: pd.DataFrame) -> None:
    """
    Prints, for every column, the length of its longest value.

    Each value is measured by the length of its string form, so numeric
    and text columns are handled the same way. Missing values are
    ignored; a column without any values reports a length of 0.
    """
    for col_name, column in dataframe.items():
        # Drop missing entries first so NaN/None are not measured as the
        # strings 'nan'/'None'.
        present = column.dropna()

        if present.empty:
            list_len = 0
        else:
            list_len = int(present.astype(str).str.len().max())

        print('Column:', col_name)
        print('Value length:', list_len, '\n')

## Testing outputs of functions

In [50]:
# Discover the CSV files in the data directory and show what was found.
csv_files = get_files_in_directory()
print(csv_files)

[<DirEntry 'loan-test.csv'>, <DirEntry 'loan-train.csv'>]


In [51]:
# Load every discovered CSV file and preview its first rows.
for csvfile in csv_files:
    df = create_dataframe(csvfile)
    # A bare expression inside a loop is discarded (only the last
    # expression of a cell is rendered), so show each preview explicitly.
    # `display` is the built-in provided by the IPython/Jupyter kernel.
    display(df.head())

In [54]:
# Load the first discovered file and preview it. In the recorded run,
# index 0 was 'loan-test.csv'.
# NOTE(review): this relies on the order in which the files were listed;
# os.scandir does not guarantee an order — confirm it is stable.
df1 = create_dataframe(csv_files[0])
df1.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [55]:
# Load the second discovered file and preview it. In the recorded run,
# index 1 was 'loan-train.csv'.
# NOTE(review): this relies on the order in which the files were listed;
# os.scandir does not guarantee an order — confirm it is stable.
df2 = create_dataframe(csv_files[1])
df2.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
