# Problem 2: German Credit Dataset
## Preprocessing, Analysis, and Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Preprocessing (29 pts)

In [None]:
# Read the German Credit CSV (path is relative to the working directory)
# into the DataFrame that every later cell operates on.
csv_path = "GermanCredit.csv"
df = pd.read_csv(csv_path)

### Task 1: Drop the 3 columns with the most 'none' values (8 pts)

In [None]:
# Per column, count the cells that are exactly the string 'none'.
none_counts = df.eq('none').sum()
# Order the columns by that count, largest first, and keep the first three labels.
ranked_by_none = none_counts.sort_values(ascending=False)
cols_to_drop = ranked_by_none.index[:3]
# Remove those columns from df itself (in place).
df.drop(columns=cols_to_drop, inplace=True)
print(f"Dropped columns: {list(cols_to_drop)}")

### Task 2: Remove apostrophes (4 pts)

In [None]:
# Delete every apostrophe found inside string cells. regex=True makes the
# pattern match anywhere within a cell instead of requiring the whole cell
# to equal "'".
df = df.replace(to_replace="'", value="", regex=True)

### Task 3: Map checking_status (5 pts)

In [None]:
# Raw checking_status codes and the readable label each one is renamed to
# (paired by position).
raw_checking = ('no checking', '<0', '0<=X<200', '>=200')
checking_labels = ('No Checking', 'Low', 'Medium', 'High')
checking_map = dict(zip(raw_checking, checking_labels))

# Series.map with a dict: any value that is not one of the keys becomes NaN.
df['checking_status'] = df['checking_status'].map(checking_map)

### Task 4: Map savings_status (5 pts)

In [None]:
# Raw savings_status codes and the readable label each one is renamed to
# (paired by position). Both '500<=X<1000' and '>=1000' collapse into 'High'.
raw_savings = ('no known savings', '<100', '100<=X<500', '500<=X<1000', '>=1000')
savings_labels = ('No Savings', 'Low', 'Medium', 'High', 'High')
savings_map = dict(zip(raw_savings, savings_labels))

# Series.map with a dict: any value that is not one of the keys becomes NaN.
df['savings_status'] = df['savings_status'].map(savings_map)

### Task 5: Map class column (3 pts)

In [None]:
# Encode the target column numerically: 'good' -> 1, 'bad' -> 0.
# Any other value becomes NaN because Series.map only translates dict keys.
class_codes = {'good': 1, 'bad': 0}
df['class'] = df['class'].map(class_codes)

### Task 6: Map employment (4 pts)

In [None]:
def map_employment(x):
    """Translate a raw employment-duration code into a readable label.

    Args:
        x: A single cell value from the employment column.

    Returns:
        The label paired with ``x`` when ``x`` equals one of the known
        codes; otherwise ``x`` itself, unchanged.
    """
    # (raw code, label) pairs, checked in order with ``==``.
    code_to_label = (
        ('unemployed', 'Unemployed'),
        ('<1', 'Amateur'),
        ('1<=X<4', 'Professional'),
        ('4<=X<7', 'Experienced'),
        ('>=7', 'Expert'),
    )
    for code, label in code_to_label:
        if x == code:
            return label
    # Unknown values pass through untouched.
    return x

# Relabel every employment cell with map_employment, then write the result
# back over the original column.
employment_labels = df['employment'].apply(map_employment)
df['employment'] = employment_labels

## Analysis (17 pts)

### Task 1: Foreign Worker vs Class crosstab (3 pts)

In [None]:
# Contingency table: rows are foreign_worker values, columns are class values.
print("Foreign Worker vs Class:")
foreign_vs_class = pd.crosstab(index=df['foreign_worker'], columns=df['class'])
print(foreign_vs_class)

### Task 2: Employment vs Savings Status crosstab (2 pts)

In [None]:
# Contingency table: rows are employment values, columns are savings_status values.
print("\nEmployment vs Savings Status:")
employment_vs_savings = pd.crosstab(index=df['employment'], columns=df['savings_status'])
print(employment_vs_savings)

### Task 3: Average credit amount for single males employed 4 to under 7 years (4 pts)

In [None]:
# Rows whose personal_status is 'male single' and whose employment label is
# 'Experienced' (the label map_employment assigns to the '4<=X<7' code).
is_single_male = df['personal_status'] == 'male single'
is_experienced = df['employment'] == 'Experienced'
mask = is_single_male & is_experienced

# Mean credit_amount over just those rows.
avg_credit_amount = df.loc[mask, 'credit_amount'].mean()
print("\nAverage credit amount of single males with 4<=X<7 years employment:",
      avg_credit_amount)

### Task 4: Average credit duration for each job type (4 pts)

In [None]:
# Mean of the duration column within each distinct job value.
print("\nAverage credit duration for each job type:")
duration_by_job = df.groupby('job')['duration'].mean()
print(duration_by_job)

### Task 5: Most common checking and savings status for education purpose (4 pts)

In [None]:
# Keep only the rows whose purpose is 'education'.
is_education = df['purpose'] == 'education'
edu_df = df[is_education]

# mode() returns a Series of the most frequent value(s); [0] takes its first entry.
top_checking = edu_df['checking_status'].mode()[0]
print("\nMost common checking status:", top_checking)
top_savings = edu_df['savings_status'].mode()[0]
print("Most common savings status:", top_savings)

## Visualization (24 pts)

### Task 1: Bar charts - Savings/Checking Status vs Personal Status (9 pts)

In [None]:
# Two side-by-side count plots, each split by personal_status:
# savings_status on the left, checking_status on the right.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

count_panels = (
    ('savings_status', 'Savings Status vs Personal Status'),
    ('checking_status', 'Checking Status vs Personal Status'),
)
for ax, (column, title) in zip(axs, count_panels):
    sns.countplot(x=column, hue='personal_status', data=df, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

### Task 2: Bar graph - Property Magnitude vs Average Age (credit > 4000) (9 pts)

In [None]:
# Restrict to rows with credit_amount strictly above 4000.
is_high_credit = df['credit_amount'] > 4000
high_credit = df[is_high_credit]

# Mean age per property_magnitude value, drawn as one bar per value.
avg_age_by_property = high_credit.groupby('property_magnitude')['age'].mean()
avg_age_by_property.plot(kind='bar')
plt.ylabel('Average Age')
plt.title('Average Age by Property Magnitude (Credit > 4000)')
plt.tight_layout()
plt.show()

### Task 3: Pie charts - High Savings & Age > 40 (6 pts)

In [None]:
# Rows labelled 'High' in savings_status whose age is strictly above 40.
subset = df[(df['savings_status'] == 'High') & (df['age'] > 40)]

# One pie per column, left to right, showing the share of each value in the subset.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

pie_panels = (
    ('personal_status', 'Personal Status'),
    ('credit_history', 'Credit History'),
    ('job', 'Job'),
)
for ax, (column, title) in zip(axs, pie_panels):
    subset[column].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%')
    ax.set_title(title)
    # Clear the y-axis label that the pie plot would otherwise show.
    ax.set_ylabel('')

plt.tight_layout()
plt.show()