In [11]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
import pickle
import json

In [53]:
def make_subset(filename):
    """Load an Excel sheet and keep only the model's feature and label columns.

    Every space character is removed from the column headers, the columns
    listed in the mapping below are selected, and they are renamed to their
    English identifiers. Columns from the mapping that are absent in the
    workbook are silently skipped; columns not in the mapping are dropped.

    Parameters
    ----------
    filename : str or path-like
        Location of the workbook, passed straight to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected columns under their English names,
        in the order in which they appear in the workbook.
    """
    # Source header -> English name. This single mapping drives both the
    # column selection and the rename, so the two can never drift apart.
    renames = {
        '전용면적': 'private_size',
        'D-3월_매출액': 'd-3mth_sales',
        '유동인구반경300m': 'movers_300m',
        '직장인구성격_30대': 'workers_30s',
        'D-5월_매출액': 'd-5mth_sales',
        '직장인구': 'workers',
        '주거인구성격_30대': 'residents_30s',
        '유동인구성격순위_1순위': 'movers_1st',
        '직장인구성격_50대': 'workers_50s',
        'D-1월_매출액': 'd-1mth_sales',
        '지역매출': 'local_sales',
        '직장인구성격_20대': 'workers_20s',
        '직장인구성격순위_1순위': 'workers_1st',
        '유동인구성격_50대': 'movers_50s',
        'D-2월_매출액': 'd-2mth_sales',
        '주거인구성격_50대': 'residents_50s',
        'D-4월_매출액': 'd-4mth_sales',
        'D월_매출액': 'd-mth_sales',
        '환산보증금': 'rent_price',
    }

    df = pd.read_excel(filename)

    # Remove spaces so the headers line up with the mapping keys above.
    # regex=False: the pattern is a literal space, not a regular expression.
    df.columns = df.columns.str.replace(" ", "", regex=False)

    # intersection() tolerates workbooks that lack some of the columns.
    wanted = df.columns.intersection(list(renames))

    # rename() without inplace returns a fresh frame instead of mutating
    # the selection in place.
    return df[wanted].rename(columns=renames)

In [54]:
# Read the workbook and keep only the renamed feature/label columns
# selected by make_subset.
df = make_subset('data_trail_v2.xlsx')

In [55]:
# Preview the first rows of the selected, renamed columns.
df.head()

Unnamed: 0,workers,movers_300m,movers_50s,residents_30s,residents_50s,workers_20s,workers_30s,workers_50s,d-5mth_sales,d-4mth_sales,d-3mth_sales,d-2mth_sales,d-1mth_sales,d-mth_sales,local_sales,private_size,movers_1st,workers_1st,rent_price
0,2621,29457.0,0.201,0.179,0.138,0.178,0.248,0.209,1473.0,1570.0,1536.0,1634.0,1566,1587,1587,761.7,40,40,33000
1,0,12917.0,0.167,0.106,0.162,0.0,0.0,0.0,1748.0,2033.0,2152.0,2368.0,2322,2504,2504,34.0,30,20,35000
2,0,12917.0,0.167,0.106,0.162,0.0,0.0,0.0,1748.0,2033.0,2152.0,2368.0,2322,2504,2504,28.0,30,20,30000
3,198,11654.0,0.229,0.091,0.205,0.182,0.248,0.182,2411.0,3706.0,4964.0,3629.0,2632,3584,3584,52.0,40,30,90000
4,198,11654.0,0.229,0.091,0.205,0.182,0.248,0.182,2411.0,3706.0,4964.0,3629.0,2632,3584,3584,67.0,40,30,40000


In [56]:
def preprocessing(df, z_thresh=3):
    """Fill missing values with column medians and drop outlier rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    z_thresh : float, default 3
        A row is discarded when the absolute z-score of any numeric column
        reaches this value.

    Returns
    -------
    pandas.DataFrame
        A new frame with NaNs in numeric columns replaced by that column's
        median and outlier rows removed. Surviving rows keep their original
        index labels.
    """
    # numeric_only=True: non-numeric columns have no median, and recent
    # pandas versions raise instead of silently skipping them.
    df = df.fillna(df.median(numeric_only=True))

    df_num = df.select_dtypes(include='number')
    if df_num.empty:
        # No numeric columns or no rows: nothing to score.
        return df

    # A zero-variance (or all-NaN) column yields NaN z-scores; silence the
    # floating-point warnings that the division and comparison may raise.
    with np.errstate(divide='ignore', invalid='ignore'):
        z = np.abs(stats.zscore(df_num.to_numpy(dtype=float)))
        # NaN >= z_thresh is False, so an unscorable column never marks a
        # row as an outlier (it used to wipe out the whole frame).
        is_outlier = (z >= z_thresh).any(axis=1)

    # Positional boolean mask rather than index labels, so repeated index
    # labels cannot duplicate rows.
    return df.loc[~is_outlier]

In [57]:
# Fill missing values and drop outlier rows.
# NOTE: this rebinds `df`, so re-running the cell recomputes z-scores on the
# already-filtered frame and may remove further rows.
df = preprocessing(df)

In [58]:
# Preview the frame after preprocessing; surviving rows keep their original
# index labels (the index is not reset).
df.head()

Unnamed: 0,workers,movers_300m,movers_50s,residents_30s,residents_50s,workers_20s,workers_30s,workers_50s,d-5mth_sales,d-4mth_sales,d-3mth_sales,d-2mth_sales,d-1mth_sales,d-mth_sales,local_sales,private_size,movers_1st,workers_1st,rent_price
3,198,11654.0,0.229,0.091,0.205,0.182,0.248,0.182,2411.0,3706.0,4964.0,3629.0,2632,3584,3584,52.0,40,30,90000
4,198,11654.0,0.229,0.091,0.205,0.182,0.248,0.182,2411.0,3706.0,4964.0,3629.0,2632,3584,3584,67.0,40,30,40000
6,1454,28452.0,0.242,0.147,0.164,0.074,0.144,0.307,1755.0,1866.0,1872.0,1830.0,1935,1675,1675,72.0,50,50,16000
8,100,8132.0,0.175,0.168,0.154,0.04,0.1,0.24,2919.0,3354.0,3841.0,3502.0,3540,3620,3620,16.0,30,40,18000
9,3441,15281.0,0.211,0.176,0.176,0.179,0.283,0.182,2845.0,2866.0,3257.0,3498.0,3483,3435,3435,20.0,40,40,17000


In [66]:
def feature_scaler(df, label_col='rent_price'):
    """Standardise every column of ``df`` except the label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the label column. It is
        not modified.
    label_col : str, default 'rent_price'
        Name of the column to leave out of scaling. A ``KeyError`` is raised
        by ``DataFrame.drop`` if it is missing.

    Returns
    -------
    X
        The scaled feature matrix as returned by
        ``StandardScaler.fit_transform``.
    scaler : StandardScaler
        The fitted scaler, so the same transformation can be applied to new
        data later.
    """
    # drop() already returns a new frame, so no extra copy is needed.
    # Cast to float64 up front: StandardScaler works in floating point, and
    # older scikit-learn releases emit a DataConversionWarning when handed
    # integer columns.
    features = df.drop(columns=[label_col]).astype('float64')

    scaler = StandardScaler()
    X = scaler.fit_transform(features)

    return X, scaler

In [67]:
# Standardise the feature columns; the fitted scaler is kept so it can be
# exported alongside the model.
X, scaler = feature_scaler(df)
# Regression target (left unscaled).
y = df['rent_price']

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [68]:
# Fit a Lasso (L1-penalised linear regression) on the scaled features;
# alpha=10 sets the regularisation strength.
model = Lasso(alpha=10).fit(X, y)

In [69]:
# Export the fitted model. The `with` block guarantees the file handle is
# flushed and closed even if pickling fails.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these artifacts from a trusted location.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Export the fitted scaler so new inputs can be transformed the same way
# as the training data.
scalerfile = 'scaler.sav'
with open(scalerfile, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)