# Income Analysis

In this project I perform a basic analysis of incomes based on the input data.

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier


In [None]:
# Read the training and test CSV files and put them into dataframes

data, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/income-adult/adult_data.csv',
        '/kaggle/input/income-adult/adult_test.csv',
    )
)

df, df_test = pd.DataFrame(data), pd.DataFrame(data_test)

In [None]:
# Preview the training dataframe
df

In [None]:
# The column names needed renaming so that they don't have a ' ' in front of
# each name. `new_col_names` is the cleaned list of column names reused by the
# later cells.

old_col_names = df.columns.to_list()
new_col_names = [name.strip() for name in old_col_names]
renaming = dict(zip(old_col_names, new_col_names))

df = df.rename(columns=renaming)

# Strip the test frame's own headers instead of reusing the training mapping:
# a dict mapper silently ignores any header it does not contain, which would
# leave a differently-padded test column un-renamed.
df_test = df_test.rename(columns=str.strip)

In [None]:
# Summarise the column dtypes and non-null counts of the training dataframe
df.info()

## Basic graphing of the data

In this section I graph the categorical and continuous data so that they are easier to visualise.

In [None]:
# Categorical data: one stacked bar chart per categorical column, showing how
# many rows of each category fall in the lower / upper salary class.
# The salary labels are compared with their leading space, as in the raw data.

for col in new_col_names:
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the
    # equivalent dtype check.
    if df[col].dtype != object:
        continue

    lower = df.loc[df['salary'] == ' <=50K', col].value_counts()
    upper = df.loc[df['salary'] == ' >50K', col].value_counts()

    # Each value_counts() result has its own ordering (sorted by its own
    # counts), so both series are re-indexed onto one shared category index.
    # This keeps every bar segment on its own label; a category that is
    # missing from one salary class gets a count of 0.
    categories = lower.index.append(upper.index.difference(lower.index))
    lower = lower.reindex(categories, fill_value=0)
    upper = upper.reindex(categories, fill_value=0)
    labels = list(categories)

    plt.bar(labels, lower.values, color='r', label='Lower Salary')
    plt.bar(labels, upper.values, bottom=lower.values, color='b', label='Upper Salary')
    plt.xticks(rotation=90)
    plt.legend()
    # Title with the column name (a separate loop variable is no longer
    # overwritten by the category labels).
    plt.title(col)
    plt.show()

In [None]:
# Continuous data: one 20-bin histogram per non-object column
for col in new_col_names:
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the
    # equivalent dtype check.
    if df[col].dtype == object:
        continue
    plt.hist(df[col], bins=20)
    plt.title(col)
    plt.show()

In [None]:
# Check the dtypes again: the object columns are the ones that get encoded next
df.info()

In [None]:
# Encode the categorical data as integer codes so that a correlation heatmap
# can be computed.
# All category -> code mappings are stored in `codes` (keyed by column name)
# so that the same encoding can be applied to the test data.
# NOTE: this cell replaces the object columns of `df` in place; re-running it
# resets `codes` to an empty dict because no object columns are left.

codes = {}
for col in new_col_names:
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the
    # equivalent dtype check.
    if df[col].dtype == object:
        as_category = df[col].astype('category')
        # Position in `cat.categories` is exactly the integer in `cat.codes`.
        codes[col] = {label: code for code, label in enumerate(as_category.cat.categories)}
        df[col] = as_category.cat.codes

In [None]:
# Correlation matrix of the dataframe, drawn as a heatmap
corr_matrix = df.corr()
sns.heatmap(corr_matrix)

## Model

For now I will build a Random Forest model and test it with the test data provided.

In [None]:
# Split the training frame into features (X) and target (y), then fit a
# shallow random forest with a fixed seed.
X = df.drop(columns=['salary'])
y = df['salary']

clf = RandomForestClassifier(random_state=0, max_depth=2)
clf.fit(X, y)

In [None]:
# All of the test data salaries have a '.' on the end, so it has to be removed
# to match the training labels. Only a trailing '.' is stripped (rather than
# blindly dropping the last character), so re-running this cell does not eat
# into the label itself.

df_test['salary'] = df_test['salary'].str.rstrip('.')

In [None]:
# Encode the test data with the category codes stored in `codes`, so that it
# matches the encoding of the training data.
for col, mapping in codes.items():
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the
    # equivalent dtype check. The check also keeps the cell safe to re-run:
    # columns that were already mapped are no longer object dtype.
    if df_test[col].dtype == object:
        # Values that do not appear in the mapping become NaN.
        df_test[col] = df_test[col].map(mapping)

In [None]:
# Split the test frame into features and target, then predict with the
# fitted classifier
X_test = df_test.drop(columns=['salary'])
y_test = df_test['salary']

pred = clf.predict(X_test)

In [None]:
# Find the score of the fitted classifier on the test data
# (for a scikit-learn classifier, `score` reports the mean accuracy)
clf.score(X_test,y_test)

While this model gives a fairly high score, it can easily be seen that this is a product of the higher proportion of lower salaries to higher salaries.

I hope to improve this score with new models and perhaps by adjusting the training proportions between lower and upper salaries.

In [None]:
# Side-by-side histograms: predicted labels on the left, test labels on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
pred_ax, test_ax = ax

pred_ax.hist(pred);
pred_ax.set_title('Predictions')

test_ax.hist(y_test);
test_ax.set_title('Test values')