In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import urllib.request
import tarfile
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

print('-------------------')
print('|     lab1         |')
print('-------------------')

url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"
urllib.request.urlretrieve(url, "housing.tgz")  # save in a file
import tarfile

tar = tarfile.open("housing.tgz")
tar.extractall()
tar.close()
!head -10 housing.csv
housing = pd.read_csv("housing.csv")
print(housing.describe())
print(housing.info())
print(housing['median_income'])

-------------------
|     lab1         |
-------------------
longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.6

In [4]:
# Display the freshly loaded frame (the rendered output elides the middle rows).
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [5]:
# Bucket median_income into five labelled bins; the resulting column is the
# stratification key handed to the splitter.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=list(range(1, 6)),
)

# 80/20 stratified shuffle split with a fixed seed.
# NOTE(review): n_splits is not passed, so the library default applies
# (scikit-learn documents it as 10) -- confirm that more than one split is
# actually wanted.
split = StratifiedShuffleSplit(random_state=42, test_size=0.2)

In [6]:
# split.split() yields *positional* row indices, so the rows are selected with
# .iloc. Label-based .loc only returns the same rows while the frame still has
# its default 0..n-1 index.
# Each iteration overwrites the previous pair, so only the last split produced
# by `split` is kept.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [7]:
# Remove the helper stratification column from both splits, in place.
# errors="ignore" keeps the cell safe to re-run: without it a second run
# raises KeyError because the column has already been dropped.
for set_ in (strat_train_set, strat_test_set):
    set_.drop(columns="income_cat", inplace=True, errors="ignore")

In [8]:
# Separate the training split into the target series and the predictor frame.
# Note that `housing` is rebound here: from this cell on it holds the training
# predictors only, not the full dataset.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns=["median_house_value"])

In [12]:
# Look up the positional index of each count column in the predictor frame.
col_names = ["total_rooms", "total_bedrooms", "population", "households"]
rooms_ix, bedrooms_ix, population_ix, households_ix = map(
    housing.columns.get_loc, col_names
)

In [15]:
from sklearn.preprocessing import LabelEncoder

class CombinedAttributesAdder:
    """Ad-hoc preprocessing steps for the housing feature frame.

    ``fit_and_transform`` runs, in order: null handling, feature dropping,
    standard scaling of selected numeric columns, and integer encoding of
    ``ocean_proximity``.

    Every step assigns into the DataFrame it receives and returns that same
    object, so the caller's frame is modified. Pass ``df.copy()`` if the
    original values must be kept.

    NOTE(review): the scaler and the encoder are created inside the static
    methods and are not stored anywhere, so the fitted statistics cannot be
    re-applied to another frame (for example a test split).
    """

    def fit_and_transform(self, df):
        """Run the full preprocessing chain on ``df`` and return it."""
        return self.DataHandler(df)

    @staticmethod
    def NullHandler(df: pd.DataFrame):
        """Fill missing values in numeric columns with the column median.

        The original implementation returned ``df`` untouched on the
        assumption that the data has no nulls. This version is still a no-op
        for such data, but no longer lets NaNs flow into later steps when the
        assumption does not hold. Non-numeric columns are left as they are.

        Returns the same DataFrame object that was passed in.
        """
        for column in df.select_dtypes(include="number").columns:
            if df[column].isna().any():
                df[column] = df[column].fillna(df[column].median())
        return df

    @staticmethod
    def Dropfeatures(df: pd.DataFrame):
        """Placeholder step: no feature needs to be dropped, ``df`` is returned as is."""
        return df

    @staticmethod
    def Encoder(df: pd.DataFrame):
        """Replace ``ocean_proximity`` with integer codes from a ``LabelEncoder``.

        NOTE(review): scikit-learn documents ``LabelEncoder`` as an encoder
        for target labels; the integer codes impose an order on the
        categories. Switching to one-hot encoding would change the output
        columns, so the original behaviour is kept here.
        """
        feature = 'ocean_proximity'
        le = LabelEncoder()
        df[feature] = le.fit_transform(df[feature])
        return df

    @staticmethod
    def Norm(df: pd.DataFrame):
        """Standard-scale the listed numeric columns, each column on its own statistics.

        ``median_income`` is not in the list and is therefore left on its
        original scale.
        """
        numeric_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                            'total_bedrooms', 'population', 'households']
        # StandardScaler standardises every column independently, so a single
        # fit over all listed columns replaces the one-scaler-per-column loop.
        df[numeric_features] = StandardScaler().fit_transform(df[numeric_features])
        return df

    def DataHandler(self, df: pd.DataFrame):
        """Apply the steps in order: nulls, drops, scaling, category encoding."""
        df = self.NullHandler(df)
        df = self.Dropfeatures(df)
        df = self.Norm(df)
        df = self.Encoder(df)
        return df
    

In [17]:
# Create the preprocessing helper; nothing is applied to the data in this cell.
FeatureEng = CombinedAttributesAdder()
# Display the training predictors as they are before any preprocessing call.
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
19712,-121.64,39.15,15.0,2659.0,396.0,1159.0,407.0,5.2340,INLAND
7414,-118.23,33.95,43.0,1683.0,520.0,2190.0,494.0,2.2391,<1H OCEAN
3110,-117.70,35.60,16.0,2678.0,483.0,1473.0,487.0,3.8580,INLAND
20210,-119.23,34.30,18.0,1713.0,244.0,690.0,239.0,6.9483,NEAR OCEAN
4766,-118.36,34.03,43.0,1690.0,379.0,1017.0,359.0,2.1078,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
19039,-121.99,38.36,33.0,146.0,31.0,75.0,31.0,3.5179,INLAND
10454,-117.66,33.48,22.0,809.0,180.0,334.0,157.0,2.3846,<1H OCEAN
3353,-121.03,40.35,52.0,5486.0,1044.0,1977.0,754.0,2.1833,INLAND
2797,-118.18,36.63,23.0,2311.0,487.0,1019.0,384.0,2.2574,INLAND
