In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the CSV at ../data/census-us.csv into the DataFrame `ds`, which the
# later cells read and extend in place with encoded columns.
ds = pd.read_csv("../data/census-us.csv")

In [None]:
# Summary statistics for the freshly loaded dataset.
ds.describe()

In [None]:
# Column dtypes as inferred by read_csv.
ds.dtypes

In [None]:
# Distinct values present in the "workclass" column.
ds["workclass"].unique()

In [None]:
# Prototype of the ordinal encoding: each distinct "workclass" value gets an
# integer code equal to its position in unique() (order of first appearance),
# written to the scratch column "xxx_encoded_workclass".
workclass_column = ds["workclass"]  # looked up once; it does not change inside the loop
values = workclass_column.unique()
for encoded_value, value in enumerate(values):
    ds.loc[workclass_column == value, "xxx_encoded_workclass"] = encoded_value

In [None]:
# Re-check the dtypes now that the "xxx_encoded_workclass" column has been added.
ds.dtypes

In [None]:
# Distinct codes stored in the prototype "xxx_encoded_workclass" column.
ds["xxx_encoded_workclass"].unique()

In [None]:
# Summary statistics after the prototype encoding. Left as the cell's last
# expression so the notebook renders the table with its rich display instead
# of the plain-text dump that print() produces.
ds.describe()

In [None]:
# Column dtypes of `ds` at this point in the notebook.
ds.dtypes

In [None]:
def get_encoded_variable_name(variable_name):
    """Return the column name that holds the encoded form of ``variable_name``.

    The result is the original name prefixed with ``"___encoded_"``.
    """
    prefix = "___" + "encoded" + "_"
    return prefix + variable_name

def encode_variable(variable_name, dataset=None):
    """Add an ordinal-encoded copy of a nominal column to a DataFrame, in place.

    Each distinct value of ``variable_name`` receives an integer code equal to
    its position in ``Series.unique()`` (order of first appearance). The codes
    are written to the column named by
    ``get_encoded_variable_name(variable_name)``.

    Parameters
    ----------
    variable_name : str
        Name of the nominal column to encode.
    dataset : pandas.DataFrame, optional
        Frame to read from and write to. When omitted, the notebook-level
        ``ds`` is used, which keeps the original one-argument call working.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    Rows holding a missing value match no ``== value`` comparison, so they are
    left unset (NaN) in the encoded column.
    """
    if dataset is None:
        dataset = ds

    encoded_variable_name = get_encoded_variable_name(variable_name)

    # The source column is not modified by the loop, so look it up once.
    column = dataset[variable_name]

    # Uses the "ordinal" of the variable values as the encoded form for the nominal variable
    for encoded_value, value in enumerate(column.unique()):
        dataset.loc[column == value, encoded_variable_name] = encoded_value

In [None]:
# Columns treated as nominal; each one is ordinal-encoded before modelling.
nominal_variables = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [None]:
# Add an ordinal-encoded "___encoded_<name>" column to `ds` for every nominal variable.
for nominal_variable in nominal_variables:
    encode_variable(nominal_variable)

In [None]:
# Summary statistics again, after the encoded columns were added to `ds`.
ds.describe()

In [None]:
# Columns passed to the model as-is, without encoding.
numerical_variables = [
    "age",
    "fnlwgt",
    "education-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

In [None]:
# Name of the column the model is fitted against and scored on.
target_variable = "class"

In [None]:
# Full feature list: the numerical columns followed by the encoded counterpart
# of every nominal column.
all_variables = numerical_variables + [
    get_encoded_variable_name(nominal_variable)
    for nominal_variable in nominal_variables
]

In [None]:
# One shuffled estimation/validation split over all rows of `ds`, holding out
# 33% for validation; the fixed random_state makes the split repeatable.
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20.
# On newer versions the equivalent is
# `sklearn.model_selection.ShuffleSplit(n_splits=1, ...)` iterated via `.split(ds)`.
split = cross_validation.ShuffleSplit(
    len(ds),
    n_iter=1,
    test_size=.33,
    random_state=1,
)

In [None]:
# Print the estimation and validation index sets yielded by the splitter.
for estimation, validation in split:
    print("Estimation: {}".format(estimation))
    print("Validation: {}".format(validation))

In [None]:
# Linear regression model; the evaluation cell thresholds its continuous
# predictions at 0.5 to obtain 0/1 labels.
algorithm = LinearRegression()

In [None]:
# Debug view: for each split, print the feature rows and target values selected
# (by position) for estimation and for validation.
for estimation_indices, validation_indices in split:
    print("=============================================================================")
    print("Estimation variable: {}".format(ds[all_variables].iloc[estimation_indices,:]))
    print("Estimation target: {}".format(ds[target_variable].iloc[estimation_indices]))
    print("=============================================================================")
    print("Validation variables: {}".format(ds[all_variables].iloc[validation_indices,:]))
    print("Validation target: {}".format(ds[target_variable].iloc[validation_indices]))
    print("=============================================================================")
    

In [None]:
    # For every split: fit the model on the estimation rows, predict the
    # validation rows, turn the predictions into 0/1 labels and print the accuracy.
    for estimation_indices, validation_indices in split:
        all_records = ds[all_variables]
        all_targets = ds[target_variable]

        # Select the estimation rows by position and fit the model on them.
        estimation_records = all_records.iloc[estimation_indices, :]
        estimation_target = all_targets.iloc[estimation_indices]
        algorithm.fit(estimation_records, estimation_target)

        # Apply the fitted model to the held-out validation rows.
        validation_records = all_records.iloc[validation_indices, :]
        validation_predictions = algorithm.predict(validation_records)

        # Binarise in place: values above 0.5 become 1, values at or below 0.5
        # become 0. Both masks are taken before any value is overwritten.
        is_above_threshold = validation_predictions > .5
        is_at_or_below_threshold = validation_predictions <= .5
        validation_predictions[is_above_threshold] = 1
        validation_predictions[is_at_or_below_threshold] = 0

        validation_target = all_targets.iloc[validation_indices]
        acc = accuracy_score(validation_target, validation_predictions)
        print("Accuracy : {}".format(acc))
