# Task for Today  

***

## Household Income Prediction  

Given *data about households in Korea*, let's try to predict the **income** of a given household.  
  
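We will compare several regression models (ordinary least squares, ridge, lasso, Huber, a linear support vector regressor, and a decision tree) to make our predictions.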

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read the CSV at this relative path into a DataFrame; every later cell works from `data`.
data = pd.read_csv('../input/korea-income-and-welfare/Korea Income and Welfare.csv')

In [None]:
# Bare expression: shows the loaded DataFrame as the notebook cell's output.
data

In [None]:
# Print pandas' summary of the DataFrame (columns, dtypes, non-null counts).
data.info()

# Preprocessing

In [None]:
# Show the raw DataFrame again for reference before the preprocessing helpers are defined.
data

In [None]:
def onehot_encode(df, column, prefix):
    """Return a new DataFrame with `column` replaced by indicator columns.

    One indicator column is produced per distinct value of `df[column]`
    (via `pd.get_dummies`), named with the given `prefix`. The indicator
    columns are appended after the existing columns and the source column
    is removed. The input DataFrame is left unmodified.
    """
    indicator_cols = pd.get_dummies(df[column], prefix=prefix)
    # concat builds a fresh frame, so the caller's `df` is never touched.
    widened = pd.concat([df, indicator_cols], axis=1)
    return widened.drop(columns=column)

In [None]:
def preprocess_inputs(df):
    """Clean, encode, split, and scale the raw household table.

    Steps, in order:
      1. Drop the ``id`` column.
      2. Replace single-space cells with NaN.
      3. One-hot encode the nominal columns (``region``, ``marriage``,
         ``religion``, ``occupation``, ``reason_none_worker``).
      4. Fill missing ``company_size`` values with 0.
      5. Separate the ``income`` target from the features.
      6. Make a shuffled 70/30 train/test split with ``random_state=1``.
      7. Standardize the features with a scaler fitted on the training
         split only.

    Args:
        df: DataFrame containing the columns named above plus ``income``.
            It is not modified.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature frames
        are scaled DataFrames that keep the row index of their split, so
        they stay label-aligned with ``y_train`` / ``y_test``.
    """
    df = df.copy()

    # Drop id column
    df = df.drop('id', axis=1)

    # Encode missing values properly.
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    df = df.replace(' ', np.nan)

    # One-hot encode categorical variables
    nominal_features = [
        ('region', "reg"),
        ('marriage', "mar"),
        ('religion', "rel"),
        ('occupation', "occ"),
        ('reason_none_worker', "rsn"),
    ]
    for column, prefix in nominal_features:
        df = onehot_encode(df, column=column, prefix=prefix)

    # Fill company_size missing values with 0
    df['company_size'] = df['company_size'].fillna(0)

    # Split df into X and y
    y = df['income']
    X = df.drop('income', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    # Scale X; fit on the training split only so test statistics do not leak in.
    scaler = StandardScaler()
    scaler.fit(X_train)

    # scaler.transform returns a bare array, so re-wrap it with the split's own
    # index; otherwise the features get a fresh 0..n-1 index that no longer
    # matches the shuffled index carried by y_train / y_test.
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Run the full preprocessing pipeline on the raw data and keep the four resulting splits.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Bare expression: shows the scaled training features as the cell's output.
X_train

In [None]:
# Bare expression: shows the training target (income) as the cell's output.
y_train

# Training

In [None]:
# Candidate regressors keyed by display name. The names are left-padded with
# spaces to a common width so the printed report lines up in a column.
# Every estimator uses its library default settings.
models = {
    "                   Linear Regression": LinearRegression(),
    "    L2-Regularized Linear Regression": Ridge(),
    "    L1-Regularized Linear Regression": Lasso(),
    "           Huber (Robust) Regression": HuberRegressor(),
    "Linear Kernel Support Vector Machine": LinearSVR(),
    "                       Decision Tree": DecisionTreeRegressor(),
}

# Fit each estimator in place on the training split, reporting progress as we go.
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    print(f"{label} trained.")

# Results

In [None]:
# Report each fitted model's score on the held-out test split.
print("Model R^2 Scores (Test Set):")
for label, fitted in models.items():
    test_r2 = fitted.score(X_test, y_test)
    print(f"{label}: {test_r2:.4f}")

# Optimizing Regularization Strength of L2 and L1 Regression Models

In [None]:
# Ridge (L2) regression with a hand-picked regularization strength;
# edit `l2_reg_strength` and re-run the cell to try other values.
l2_reg_strength = 1.0

# fit() returns the estimator itself, so construction and fitting can be chained.
l2_model = Ridge(alpha=l2_reg_strength).fit(X_train, y_train)

print(f"Ridge Regression Test R^2 (alpha={l2_reg_strength}): {l2_model.score(X_test, y_test):.5f}")

In [None]:
# Lasso (L1) regression with a hand-picked regularization strength;
# edit `l1_reg_strength` and re-run the cell to try other values.
l1_reg_strength = 0.01

# fit() returns the estimator itself, so construction and fitting can be chained.
l1_model = Lasso(alpha=l1_reg_strength).fit(X_train, y_train)

print(f"Lasso Regression Test R^2 (alpha={l1_reg_strength}): {l1_model.score(X_test, y_test):.5f}")

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/2JwDkvWEVlM