In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mserr

In [3]:
# Read train and test data into data frame
trainx_df = pd.read_csv("train.csv", index_col="Id")
trainy_df = trainx_df['SalePrice']
trainx_df.drop('SalePrice', axis=1, inplace=True)
testx_df = pd.read_csv("test.csv")

In [4]:
print(trainx_df.shape)
print(testx_df.shape)
print(trainy_df.shape)

(1460, 79)
(1459, 80)
(1460,)


In [5]:
sample_size = len(trainx_df)

In [6]:
# get all columns with null values, below command gives avg of null values in that column

columns_with_null_values = [[col, float(trainx_df[col].isnull().sum()) / float(sample_size)] for col in trainx_df.columns if trainx_df[col].isnull().sum()]
print(columns_with_null_values)

[['LotFrontage', 0.1773972602739726], ['Alley', 0.9376712328767123], ['MasVnrType', 0.005479452054794521], ['MasVnrArea', 0.005479452054794521], ['BsmtQual', 0.025342465753424658], ['BsmtCond', 0.025342465753424658], ['BsmtExposure', 0.026027397260273973], ['BsmtFinType1', 0.025342465753424658], ['BsmtFinType2', 0.026027397260273973], ['Electrical', 0.0006849315068493151], ['FireplaceQu', 0.4726027397260274], ['GarageType', 0.05547945205479452], ['GarageYrBlt', 0.05547945205479452], ['GarageFinish', 0.05547945205479452], ['GarageQual', 0.05547945205479452], ['GarageCond', 0.05547945205479452], ['PoolQC', 0.9952054794520548], ['Fence', 0.8075342465753425], ['MiscFeature', 0.963013698630137]]


In [7]:
# get all columns to drop where the average is > .3

columns_to_drop = [x for (x,y) in columns_with_null_values if y > .3]
print(columns_to_drop)

['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']


In [8]:
# drop columns from trainx_df and testx_df

trainx_df.drop(columns_to_drop, axis=1, inplace=True)
testx_df.drop(columns_to_drop, axis=1, inplace=True)
print(trainx_df.shape, testx_df.shape)

(1460, 74) (1459, 75)


In [9]:
# seperate categorical columns and non categorical columns(ordinal)
categorical_columns = [col for col in trainx_df.columns if trainx_df[col].dtype == object]
# categorical_columns.append("MSSubClass")
print(len(categorical_columns))
ordinal_columns = [col for col in trainx_df.columns if col not in categorical_columns]
print(len(ordinal_columns))

38
36


In [10]:
dummy_row = list()
for col in trainx_df.columns:
    if(col in categorical_columns):
        dummy_row.append("dummy")
    else:
        dummy_row.append("")
print(dummy_row)

['', 'dummy', '', '', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', '', '', '', '', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', '', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', 'dummy', '', 'dummy', '', '', '', 'dummy', 'dummy', 'dummy', 'dummy', '', '', '', '', '', '', '', '', '', '', 'dummy', '', 'dummy', '', 'dummy', '', 'dummy', '', '', 'dummy', 'dummy', 'dummy', '', '', '', '', '', '', '', '', '', 'dummy', 'dummy']


In [11]:
new_row = pd.DataFrame([dummy_row], columns=trainx_df.columns)
trainx_df = pd.concat([trainx_df, new_row], axis=0, ignore_index=True)
testx_df = pd.concat([testx_df], axis=0, ignore_index=True)

In [12]:
for col in categorical_columns:
    trainx_df[col].fillna(value="dummy", inplace=True)
    testx_df[col].fillna(value="dummy", inplace=True)

In [13]:
enc = OneHotEncoder(drop="first",  sparse=False)

In [14]:
enc.fit(trainx_df[categorical_columns])
trainx_enc = pd.DataFrame(enc.transform(trainx_df[categorical_columns]))
testx_enc = pd.DataFrame(enc.transform(testx_df[categorical_columns]))

trainx_enc.columns = enc.get_feature_names(categorical_columns)
testx_enc.columns = enc.get_feature_names(categorical_columns)
print(trainx_enc)

      MSZoning_FV  MSZoning_RH  MSZoning_RL  MSZoning_RM  MSZoning_dummy  \
0             0.0          0.0          1.0          0.0             0.0   
1             0.0          0.0          1.0          0.0             0.0   
2             0.0          0.0          1.0          0.0             0.0   
3             0.0          0.0          1.0          0.0             0.0   
4             0.0          0.0          1.0          0.0             0.0   
...           ...          ...          ...          ...             ...   
1456          0.0          0.0          1.0          0.0             0.0   
1457          0.0          0.0          1.0          0.0             0.0   
1458          0.0          0.0          1.0          0.0             0.0   
1459          0.0          0.0          1.0          0.0             0.0   
1460          0.0          0.0          0.0          0.0             1.0   

      Street_Pave  Street_dummy  LotShape_IR2  LotShape_IR3  LotShape_Reg  \
0         

In [15]:
trainx_enc.to_csv("encoded.csv")

In [16]:
trainx_df = pd.concat([trainx_df[ordinal_columns], trainx_enc], axis=1, ignore_index=True)
testx_df = pd.concat([testx_df[ordinal_columns], testx_enc], axis=1, ignore_index=True)

In [17]:
trainx_df.drop(trainx_df.tail(1).index, inplace=True)

In [27]:
for col in trainx_df.columns:
    print(col, trainx_df[col].dtype)

0 object
1 object
2 object
3 object
4 object
5 object
6 object
7 object
8 object
9 object
10 object
11 object
12 object
13 object
14 object
15 object
16 object
17 object
18 object
19 object
20 object
21 object
22 object
23 object
24 object
25 object
26 object
27 object
28 object
29 object
30 object
31 object
32 object
33 object
34 object
35 object
36 float64
37 float64
38 float64
39 float64
40 float64
41 float64
42 float64
43 float64
44 float64
45 float64
46 float64
47 float64
48 float64
49 float64
50 float64
51 float64
52 float64
53 float64
54 float64
55 float64
56 float64
57 float64
58 float64
59 float64
60 float64
61 float64
62 float64
63 float64
64 float64
65 float64
66 float64
67 float64
68 float64
69 float64
70 float64
71 float64
72 float64
73 float64
74 float64
75 float64
76 float64
77 float64
78 float64
79 float64
80 float64
81 float64
82 float64
83 float64
84 float64
85 float64
86 float64
87 float64
88 float64
89 float64
90 float64
91 float64
92 float64
93 float64
94 float64
9

In [18]:
imputer = KNNImputer(n_neighbors=2)
imputer.fit(trainx_df)
print(trainx_df)

     0   1      2   3   4     5     6    7    8     9    ...  260  261  262  \
0     60  65   8450   7   5  2003  2003  196  706     0  ...  0.0  0.0  1.0   
1     20  80   9600   6   8  1976  1976    0  978     0  ...  0.0  0.0  1.0   
2     60  68  11250   7   5  2001  2002  162  486     0  ...  0.0  0.0  1.0   
3     70  60   9550   7   5  1915  1970    0  216     0  ...  0.0  0.0  1.0   
4     60  84  14260   8   5  2000  2000  350  655     0  ...  0.0  0.0  1.0   
...   ..  ..    ...  ..  ..   ...   ...  ...  ...   ...  ...  ...  ...  ...   
1455  60  62   7917   6   5  1999  2000    0    0     0  ...  0.0  0.0  1.0   
1456  20  85  13175   6   6  1978  1988  119  790   163  ...  0.0  0.0  1.0   
1457  70  66   9042   7   9  1941  2006    0  275     0  ...  0.0  0.0  1.0   
1458  20  68   9717   5   6  1950  1996    0   49  1029  ...  0.0  0.0  1.0   
1459  20  75   9937   5   6  1965  1965    0  830   290  ...  0.0  0.0  1.0   

      263  264  265  266  267  268  269  
0     0.0

In [19]:
trainx_df_filled = imputer.transform(trainx_df)
trainx_df_filled = pd.DataFrame(trainx_df_filled, columns=trainx_df.columns)
print(trainx_df_filled)

       0     1        2    3    4       5       6      7      8       9    \
0     60.0  65.0   8450.0  7.0  5.0  2003.0  2003.0  196.0  706.0     0.0   
1     20.0  80.0   9600.0  6.0  8.0  1976.0  1976.0    0.0  978.0     0.0   
2     60.0  68.0  11250.0  7.0  5.0  2001.0  2002.0  162.0  486.0     0.0   
3     70.0  60.0   9550.0  7.0  5.0  1915.0  1970.0    0.0  216.0     0.0   
4     60.0  84.0  14260.0  8.0  5.0  2000.0  2000.0  350.0  655.0     0.0   
...    ...   ...      ...  ...  ...     ...     ...    ...    ...     ...   
1455  60.0  62.0   7917.0  6.0  5.0  1999.0  2000.0    0.0    0.0     0.0   
1456  20.0  85.0  13175.0  6.0  6.0  1978.0  1988.0  119.0  790.0   163.0   
1457  70.0  66.0   9042.0  7.0  9.0  1941.0  2006.0    0.0  275.0     0.0   
1458  20.0  68.0   9717.0  5.0  6.0  1950.0  1996.0    0.0   49.0  1029.0   
1459  20.0  75.0   9937.0  5.0  6.0  1965.0  1965.0    0.0  830.0   290.0   

      ...  260  261  262  263  264  265  266  267  268  269  
0     ...  0.

In [20]:
testx_df_filled = imputer.transform(testx_df)
testx_df_filled = pd.DataFrame(testx_df_filled, columns=testx_df.columns)

In [21]:
testx_df_filled.reset_index(drop=True, inplace=True)

In [22]:
print(trainx_df_filled.isnull().sum())

0      0
1      0
2      0
3      0
4      0
      ..
265    0
266    0
267    0
268    0
269    0
Length: 270, dtype: int64


In [23]:
scalar = preprocessing.StandardScaler().fit(trainx_df_filled)
trainx_df_filled = scalar.transform(trainx_df_filled)
testx_df_filled = scalar.transform(testx_df_filled)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(trainx_df_filled, trainy_df.values.ravel(), test_size = 0.3, random_state = 42)

In [25]:
reg = LinearRegression().fit(X_train, y_train)
print(reg.score(X_train, y_train))
print(reg.score(X_test, y_test))

0.9337689448119499
-5.945245940267123e+25
