In [None]:
import pandas as pd
import numpy as np
import dateutil.parser
import datetime

In [None]:
### By default a cell only displays the value of its last expression.
### Setting ast_node_interactivity to "all" makes every top-level expression in a
### cell display its result, which saves writing repeated print/display statements.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
### Load the German credit data; the first CSV column is used as the row index.
### The two columns are renamed only to keep the data consistent with the
### author's local copy of the dataset.
df = (
    pd.read_csv("../input/german-credit-data-with-risk/german_credit_data.csv", index_col=0)
    .rename(columns={'Checking account': 'Credit History', 'Sex': 'Gender'})
)

### Preliminary data analysis

In [None]:
### preview the first rows of the data
df.head()

In [None]:
### summary statistics (by default only the numeric columns are described)
df.describe()

In [None]:
### (number of rows, number of columns)
df.shape

In [None]:
### column labels
df.columns

In [None]:
### data type of each column
df.dtypes

In [None]:
### count the missing values (NaN) in each column
df.isna().sum()

In [None]:
### frequency of each distinct value of the chosen feature
feature = 'Gender'
df[feature].value_counts()

In [None]:
### distinct values of the chosen feature
df[feature].unique()

In [None]:
### number of distinct values in each column
df.nunique()

In [None]:
### list the labels of all columns that have the object dtype
df.select_dtypes(include='object').columns

In [None]:
### Shallow vs deep copy of a dataframe.
### A deep copy gets its own copy of the data, so any change made to df_deep
### is not reflected in the original df (deep=True is the default of copy()).
### A shallow copy is created without copying the underlying data.
df_shallow = df.copy(deep=False)
df_deep = df.copy()

### datetime conversion

In [None]:
### Let's create a date column
### One daily timestamp is generated per row, starting at 2000-01-01. The range is
### sized with len(df) (1000 rows for this dataset) so that the assignment stays
### valid if the number of rows ever changes.
df['date'] = pd.date_range('1/1/2000', periods=len(df))

In [None]:
### dtype of the newly created date column
df['date'].dtype

In [None]:
### date stored at row label 0 (assumes the index read from the CSV starts at 0 -- confirm)
df['date'][0]

In [None]:
### As an example, create a date range at hourly frequency for 5 periods.
### The lowercase alias 'h' is used because the uppercase 'H' alias is deprecated
### in recent pandas releases.
pd.date_range('1/1/2000', freq='h', periods=5)

In [None]:
### pandas stores a timestamp in 64 bits, so only dates inside the range printed below
### can be represented, see
### https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-timestamp-limits
### The `errors` argument of pd.to_datetime decides what happens to a date outside
### this range (errors='ignore' returns the input date unchanged).
print(f"Minimum date: {pd.Timestamp.min}")
print(f"Maximum date: {pd.Timestamp.max}")

In [None]:
### An input date outside the timestamp limits is returned as it is.
### errors='ignore' is deprecated in recent pandas releases, so the same result is
### obtained by catching the out-of-bounds error explicitly and keeping the input.
out_of_range_date = '2265-04-11'
try:
    parsed_date = pd.to_datetime(out_of_range_date, format='%Y-%m-%d')
except ValueError:  # OutOfBoundsDatetime is a subclass of ValueError
    parsed_date = out_of_range_date
parsed_date

In [None]:
### With errors='coerce', an input date outside the timestamp limits is returned as NaT
pd.to_datetime('2265-04-11', format='%Y-%m-%d', errors = 'coerce')

In [None]:
### by default the string is read month-first (MM/DD/YYYY)
dateutil.parser.parse('1/1/2000')

In [None]:
### for a string like the one below that should be read day-first (DD/MM/YYYY),
### specify the parameter dayfirst as True
dateutil.parser.parse('10/1/2000', dayfirst=True)

In [None]:
### As against the default, which reads the same string month-first
dateutil.parser.parse('10/1/2000')

In [None]:
### But when the date is not ambiguous, as below (13 cannot be a month),
### dayfirst is ignored and 13 is taken as the day
dateutil.parser.parse('10/13/2000', dayfirst=True)

In [None]:
### convert a string to datetime using an explicit format (day-month-year)
datetime.datetime.strptime('18-1-2000','%d-%m-%Y')

In [None]:
### The format codes are listed at https://strftime.org/
today = datetime.datetime.now()
print(f"today's date: {today}")
### day, full month name and year with century
today.strftime("%d %B %Y")

In [None]:
### changing B (full month name) to b (abbreviated month name) and Y (year with century) to y (year without century)
today.strftime("%d %b %y")

### Key operations

In [None]:
### rows ordered by Age in ascending order; a new frame is returned, df is not modified
df.sort_values('Age')

In [None]:
### setting index
### The guard keeps the cell re-runnable: once 'date' has become the index it is
### no longer a column, and calling set_index again would raise a KeyError.
index_col = 'date'
if index_col in df.columns:
    df = df.set_index(index_col)

In [None]:
### display the dataframe, which is now indexed by date
df

In [None]:
### Now, let's shift the rows down by 1: the index stays as it is,
### so the first row is filled with NaN
df.shift(1)

In [None]:
### Observe the difference when the frequency is specified: the index itself is shifted
### instead of the data. Here, the row corresponding to 2000-01-01 gets eliminated and
### the data shifts by 1 day, creating the observation at 2002-09-27
df.shift(1, freq="D")

In [None]:
### a negative period shifts the rows up by 1 instead
df.shift(-1)

In [None]:
### filter based on index. Let's index the data by Age and Duration,
### which gives a two-level index (level 0 = Age, level 1 = Duration)
df_age_duration = df.set_index(['Age', 'Duration'])
df_age_duration.head()

In [None]:
### Level 0 of the index corresponds to Age; keep only the rows with Age > 30
age_level = df_age_duration.index.get_level_values('Age')
df_age_duration[age_level > 30]

In [None]:
### dropping rows and columns that contain NaNs
### how='any' drops a row or column that has at least one NaN; how='all' drops it only when all of its values are NaN

In [None]:
### a row is dropped if any of its values is NA
df.dropna(axis = 0, how='any')

In [None]:
### a column is dropped if any of its values is NA
df.dropna(axis = 1, how='any')

In [None]:
### column names in lowercase, uppercase and capitalized form
[name.lower() for name in df.columns]
[name.upper() for name in df.columns]
[name.capitalize() for name in df.columns]

### concatenating rows and columns

In [None]:
### row wise concatenation: stacking df on top of itself duplicates every row
df_dup = pd.concat([df, df], axis=0)
df_dup

In [None]:
### column wise concatenation (the result contains the Age column twice)
df_age = df[['Age']]
pd.concat([df, df_age], axis=1)

### Remove duplicates 

In [None]:
### keep only the first occurrence of each duplicated row
df_dup.drop_duplicates()

### Now, let's groupby and count the instances per 'Gender'

In [None]:
### number of rows in each Gender group
feature = ['Gender']
df.groupby('Gender').size()

In [None]:
### multiple functions: mean, transform, var, nth(-10), nlargest etc
### mean and std are only defined for numbers, so the numeric columns are selected
### first; aggregating the text columns (Housing, Purpose, ...) raises a TypeError
### in pandas >= 2.0.
numeric_cols = list(df.select_dtypes(include='number').columns)
df.groupby(feature)[numeric_cols].mean()
df.groupby(feature)[numeric_cols].std()

In [None]:
### first and last entry of every column within each group
df.groupby(feature).first()
df.groupby(feature).last()

In [None]:
### The 2 rows with the largest 'Credit amount' for each Gender, stacked into one frame
### with a fresh 0..n-1 index. The groups are processed one by one instead of through
### groupby.apply, because apply operating on the grouping column is deprecated in
### recent pandas releases.
pd.concat(
    [group.nlargest(2, 'Credit amount') for _, group in df.groupby('Gender')],
    ignore_index=True,
)

In [None]:
### apply function: np.size is called on the Risk values of each Gender group,
### giving the number of entries per group
df.groupby(['Gender'])['Risk'].apply(np.size)

In [None]:
### selecting, dropping, renaming columns

In [None]:
### One-column boolean frame flagging the object-typed columns, then the
### list of those column names, then the corresponding slice of df
df_dtypes = (df.dtypes == 'object').to_frame(name='obj_type')
selected_cols = df_dtypes.index[df_dtypes['obj_type'].to_numpy()].tolist()
df[selected_cols]

In [None]:
### column names before and after the renaming; rename returns a new frame,
### df itself is left unchanged
df.columns
df.rename(columns = {'Saving accounts': 'richness_quotient'}).columns

In [None]:
### drop returns a new frame without the listed columns; df itself is left unchanged
drop_cols = ['Saving accounts']
df.drop(columns = drop_cols)

In [None]:
### keep the rows with Age >= 30 and list, in sorted order, the distinct ages that remain
feature = 'Age'
df_subset = df.loc[df[feature] >= 30]
np.sort(df_subset[feature].unique())

In [None]:
### filter on equality: only the value 'own' is left in the column after filtering
feature = 'Housing'
df.loc[df[feature] == 'own', feature].unique()

In [None]:
### filter on membership in a list of desired values
feature = 'Purpose'
des_values = ['car', 'education']
df.loc[df[feature].isin(des_values), feature].unique()

In [None]:
### keep only the rows where the feature is not missing
feature = 'Saving accounts'
df[df[feature].notna()]

In [None]:
### np.where vs np.select
### np.where chooses between two values based on a single condition; when there are several conditions, each with its own value, np.select is used

In [None]:
### Bin Age into three groups: 1 for Age below 30, 2 for Age from 30 up to
### (not including) 45, 3 for Age of 45 and above
age = df['Age']
criteria = [age.lt(30), age.ge(30) & age.lt(45), age.ge(45)]
assign_vals = [1, 2, 3]
df['Age_binned'] = np.select(criteria, assign_vals)

In [None]:
### check the new Age_binned column
df.head()

In [None]:
### pivot_table: the default aggfunc is 'mean', so this gives the mean
### 'Credit amount' for each Housing value
df.pivot_table(columns='Housing', values='Credit amount')