In [3]:
import pandas, numpy as np
from bokeh.plotting import figure, show, output_file
from bokeh.layouts import column




census = pandas.read_csv("test_data.csv")


In [5]:
census.head(10)

Unnamed: 0,Age,Class,Fnlwgt,Education,Education_num,Married,Occupation,Relationship,Race,Sex,Asset_gain,Asset_loss,Hours_per_week,Native_country,>50k
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K.
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K.
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K.
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K.


In [4]:
# get rid of spaces in the dataset

categories = [category for category in census.columns if type(census[category][0]) == str]
for category in categories:
    for i in range(len(census[category])):
        census[category][i] = census[category][i].replace(" ", "")

        
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  census[category][i] = census[category][i].replace(" ", "")


In [5]:
sex_to_binary = {"Male" : 0, "Female" : 1}
census["Sex"] = census["Sex"].map(sex_to_binary)
census.head()

Unnamed: 0,Age,Class,Fnlwgt,Education,Education_num,Married,Occupation,Relationship,Race,Sex,Asset_gain,Asset_loss,Hours_per_week,Native_country,>50k
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,0,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,0,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,0,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,0,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,1,0,0,30,United-States,<=50K.


In [6]:
income_to_binary = {"<=50K." : 0, ">50K." : 1}

census[">50k"] = census[">50k"].map(income_to_binary)

In [259]:
census.head()

Unnamed: 0,Age,Class,Fnlwgt,Education,Education_num,Married,Occupation,Relationship,Race,Sex,Asset_gain,Asset_loss,Hours_per_week,Native_country,>50k
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,0,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,0,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,0,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,0,7688,0,40,United-States,1
4,18,,103497,Some-college,10,Never-married,,Own-child,White,1,0,0,30,United-States,0


In [249]:
# data exploration for age

def numerical_plot(x, y):
    plot = figure()
    plot.circle(x, y)
    show(plot)
    
def categorical_plot(x, y):
    plot = figure(x_range = x, y_range = [0, 1])
    plot.vbar(x = x, top = y, width=0.9)
    show(plot)

def combine(dataset, lower, higher):
    combined = 0
    for i in range(lower, higher + 1):
        combined += dataset[i]
        
    return round(combined/(higher-lower), 3)


oldest = max(census["Age"])
youngest = min(census["Age"])
people_per_age = {}

for i in range(oldest):
    people_of_age_i = len(census[census["Age"] == i])
    people_per_age[i] = people_of_age_i

    


x = [num for num in people_per_age.keys()]
y = [num for num in people_per_age.values()]

numerical_plot(x, y)





percentage_survived = {}

for i in range(youngest, oldest):
    total_people = len(census[census["Age"] == i])
    greater_income = len(census[(census["Age"] == i) & (census[">50k"] == 1)])
    try:
        percentage_survived[i] = greater_income/total_people
    except ZeroDivisionError:
        percentage_survived[i] = 0

        
percentage_survived_by_age_group = {}

for i in range (youngest, oldest - 10, 10):
    lower = i
    higher = i + 10
    percentage_survived_by_age_group["{}-{}".format(lower, higher)] = combine(percentage_survived, lower, higher)





x = [item for item in percentage_survived_by_age_group.keys()]
y = [item for item in percentage_survived_by_age_group.values()]

categorical_plot(x,y)


# conclusion:
# age is a useful factor in determining whether or not somebody's income is greater than 50k
    

In [7]:
# replace '?' with NaN so we can look for whether data is missing at random, or there is some pattern (using .isnull()

with_missing_values = []
for category in census.columns:
    total_missing = len(census[census[category] == "?"])
    print(total_missing)
    if total_missing  > 0:
        with_missing_values.append(category)



for category in with_missing_values:
    for i in range(len(census[category])):

        if census[category][i] == "?":
            census[category][i] = np.nan



0
963
0
0
0
0
966
0
0
0
0
0
0
274
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  census[category][i] = np.nan


In [8]:
for category in with_missing_values:
    print(census.groupby(census[category].isnull()).mean()[">50k"])
census["Class_ind"] = np.where(census["Class"].isnull(), 0, 1)
census["Occupation_ind"] = np.where(census["Occupation"].isnull(), 0, 1)
census.head(10)

Class
False    0.246246
True     0.076843
Name: >50k, dtype: float64
Occupation
False    0.246294
True     0.076605
Name: >50k, dtype: float64
Native_country
False    0.235647
True     0.270073
Name: >50k, dtype: float64


Unnamed: 0,Age,Class,Fnlwgt,Education,Education_num,Married,Occupation,Relationship,Race,Sex,Asset_gain,Asset_loss,Hours_per_week,Native_country,>50k,Class_ind,Occupation_ind
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,0,0,0,40,United-States,0,1,1
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,0,0,0,50,United-States,0,1,1
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,0,0,0,40,United-States,1,1,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,0,7688,0,40,United-States,1,1,1
4,18,,103497,Some-college,10,Never-married,,Own-child,White,1,0,0,30,United-States,0,0,0
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,0,0,0,30,United-States,0,1,1
6,29,,227026,HS-grad,9,Never-married,,Unmarried,Black,0,0,0,40,United-States,0,0,0
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,0,3103,0,32,United-States,1,1,1
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,1,0,0,40,United-States,0,1,1
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,0,0,0,10,United-States,0,1,1


In [9]:
class Category:
    def __init__(self, category, is_numerical = False):
        self.category = category
        self.dataset = census[category]
        self.distribution = self.get_distribution()
        self.counts = self.get_counts()
        self.is_numerical = is_numerical
    def get_distribution(self):
        x_to_y = {}
        all_values = set()
        
        for value in self.dataset:
            if type(value) != float: # if value is NaN
                all_values.add(value)
            
        all_values = list(all_values)
        
        for value in all_values: 
        
            total_people = len(census[self.dataset == value])
            higher_income = len(census[(self.dataset == value) & (census[">50k"] == 1)])
            
            if total_people == 0:
                continue
                
            percentage_higher = higher_income/total_people
                
            x_to_y[value] = round(percentage_higher, 3)
        
        return x_to_y
    
    def get_counts(self):
        x_to_y = {}
        all_values = set()
        
        for value in self.dataset:
            if type(value) != float: # if value is NaN
                all_values.add(value)
            
        all_values = list(all_values)
        
        for value in all_values:
            people = len(census[self.dataset == value])
            x_to_y[value] = people
        
        return x_to_y
    
    def plot(self):
        x_range = None
        
        countsX = [val for val in self.counts.keys()]
        countsY = [val for val in self.counts.values()]
        
        distrX = [val for val in self.distribution.keys()]
        distrY = [val for val in self.distribution.values()]
        
        if self.is_numerical == False:
            x_range = countsX
            
        distribution_plot = figure(width = 1500, title = "Distribution of {}".format(self.category), y_range = [0,1], x_range = x_range)
        counts_plot = figure(width = 1500, title = "Counts of {}".format(self.category), x_range = x_range)
        
        
        
        distribution_plot.vbar(x = distrX, top = distrY, width = 0.9)
        
        
        
        counts_plot.vbar(x = countsX, top = countsY, width = 0.9)
        
        (distribution_plot, counts_plot)
        show(column(distribution_plot, counts_plot))
    
    
age = Category("Age", True)
native_country = Category("Native_country")
education = Category("Education")
education_num = Category("Education_num", True)
married = Category("Married")
occupation = Category("Occupation")
relationship = Category("Relationship")
race = Category("Race")
sex = Category("Sex", True)
hours_per_week = Category("Hours_per_week", True)

occupation.plot()
married.plot()
education_num.plot()
age.plot()
native_country.plot()
education.plot()
relationship.plot()
race.plot()
sex.plot()
hours_per_week.plot()
        
# sex, education_num, race, hours_per_week, age, married

In [2]:
race = Category("Race")
race.plot()

NameError: name 'census' is not defined

In [270]:
census.drop(["Fnlwgt", "Education", "Occupation", "Relationship", "Asset_gain", "Asset_loss", "Native_country"], axis = 1, inplace = True)
census.head(10)

Unnamed: 0,Age,Class,Education_num,Married,Race,Sex,Hours_per_week,>50k,Class_ind,Occupation_ind
0,25,Private,7,Never-married,Black,0,40,0,1,1
1,38,Private,9,Married-civ-spouse,White,0,50,0,1,1
2,28,Local-gov,12,Married-civ-spouse,White,0,40,1,1,1
3,44,Private,10,Married-civ-spouse,Black,0,40,1,1,1
4,18,,10,Never-married,White,1,30,0,0,0
5,34,Private,6,Never-married,White,0,30,0,1,1
6,29,,9,Never-married,Black,0,40,0,0,0
7,63,Self-emp-not-inc,15,Married-civ-spouse,White,0,32,1,1,1
8,24,Private,10,Never-married,White,1,40,0,1,1
9,55,Private,4,Married-civ-spouse,White,0,10,0,1,1


In [276]:
census.to_csv("census_data_cleaned.csv", index = False)

In [277]:
census.head(10)

Unnamed: 0,Age,Education_num,Race,Sex,Hours_per_week,>50k,Class_ind,Occupation_ind
0,25,7,Black,0,40,0,1,1
1,38,9,White,0,50,0,1,1
2,28,12,White,0,40,1,1,1
3,44,10,Black,0,40,1,1,1
4,18,10,White,1,30,0,0,0
5,34,6,White,0,30,0,1,1
6,29,9,Black,0,40,0,0,0
7,63,15,White,0,32,1,1,1
8,24,10,White,1,40,0,1,1
9,55,4,White,0,10,0,1,1
