In [5]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn import linear_model
import matplotlib.pyplot as plt

%matplotlib inline

In [6]:
df = pd.read_csv("census-income.data.csv", sep=",", names=['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status','occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'])

In [7]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [8]:
df['in'].value_counts()

KeyError: 'in'

In [None]:
df.isnull().sum()

In [None]:
df.replace(' ?', np.nan, inplace=True)
df.replace(' <=50K', '<=50K', inplace=True)
df.replace(' >50K', '>50K', inplace=True)


In [None]:
100* df.isnull().sum() / len(df)

In [None]:
len(df)

In [None]:
df = df[df['workclass'].notnull()]
df = df[df['occupation'].notnull()]
df = df[df['native-country'].notnull()]

100* df.isnull().sum() / len(df)

In [None]:
below_50 = df[df['income'] == '<=50K']
above_50 = df[df['income'] == '>50K']

In [None]:
len(below_50)

In [None]:
frames = [below_50.head(10000), above_50]

In [None]:
df_actual = pd.concat(frames)

In [None]:
gender_split = pd.get_dummies(df_actual['gender'])

In [None]:
occupation_split = pd.get_dummies(df_actual['occupation'])

In [None]:
rel_split = pd.get_dummies(df_actual['relationship'])


In [None]:
race_split = pd.get_dummies(df_actual['race'])

In [None]:
work_split = pd.get_dummies(df_actual['workclass'])

In [None]:
country_split = pd.get_dummies(df_actual['native-country'])

In [None]:
df_dummies = pd.concat([df_actual,gender_split, occupation_split, rel_split,race_split, work_split, country_split], axis=1)

In [None]:
df_dummies.columns

In [None]:
col_names = list(df_dummies.columns)

In [None]:
col_names_strip = [a.strip() for a in col_names]
df_dummies.columns = col_names_strip

In [None]:
df_dummies['income'].value_counts()

In [None]:
df_dummies['income'] = df_dummies['income'].map({'<=50K':0, '>50K':1})

In [None]:
df_dummies['income'].value_counts()

In [None]:
df_dummies = df_dummies.select_dtypes(['number'])

In [None]:
df_dummies.head()

In [None]:
df_dummies['fnlwgt'] = df_dummies['fnlwgt'].pipe(zscore)
df_dummies['age'] = df_dummies['age'].pipe(zscore)
df_dummies['hours-per-week'] = df_dummies['hours-per-week'].pipe(zscore)

In [None]:
df_dummies.head(20)

In [None]:
y = df_dummies['income']

In [None]:
cols = list(df_dummies.columns)
cols.remove('income')

X = df_dummies[cols]

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)


In [None]:
regression_model = linear_model.LinearRegression()
regression_model.fit(X_train, y_train)

In [None]:
regression_model.intercept_


In [None]:
regression_model.score(X_test, y_test)
