In [55]:
# Imports

import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Let pandas print up to 80 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 80)

In [56]:
# Data: read the training and test tables from the project data directory,
# using the 'Id' column of each file as the row index.

data_path = Path('../data/project')
df, X_test = (
    pd.read_csv(data_path / csv_name, index_col='Id')
    for csv_name in ('train.csv', 'test.csv')
)

In [57]:
# Separate the target column from the features, then group the feature
# columns by dtype: 'object' columns are treated as categorical, every
# other dtype as numeric.
target = 'SalePrice'
y = df[target]
X = df.drop(columns=target)

cat_features = X.columns[(X.dtypes == 'object').to_numpy()]
num_features = X.columns[(X.dtypes != 'object').to_numpy()]
# Number of distinct values in each categorical column.
cardinality = X.loc[:, cat_features].nunique()

print(f'Numeric features: {len(num_features)}, categorical features: {len(cat_features)}, total features: {X.shape[1]}')

Numeric features: 36, categorical features: 43, total features: 79


In [58]:
# Applying a log transform to the target variable.
# The result is derived from the raw column in `df` rather than from `y`
# itself, so re-running this cell cannot apply the logarithm twice.
# np.log is vectorised, so no per-element Python lambda is needed.
y = np.log(df[target])

In [59]:
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

# Defining our main preprocessing function:
def preprocess(X, encoder=None):
    """Fill missing values and ordinally encode the categorical columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding every column named in the notebook-level
        ``num_features`` and ``cat_features`` indexes. It is not modified.
    encoder : category_encoders.OrdinalEncoder, optional
        An already fitted encoder. When supplied it is only used to
        ``transform`` the categorical columns, so several frames can share
        one category-to-integer mapping. When omitted, a new encoder is
        fitted on ``X`` itself.

    Returns
    -------
    pandas.DataFrame
        The numeric columns followed by the encoded categorical columns.
    """
    # Changing all the NaN values to zeros (fillna returns a new frame,
    # so the caller's data is left untouched)
    X = X.fillna(0)

    # Encoding categorical features ordinally
    X_cat = X[cat_features]
    if encoder is None:
        X_cat = ce.OrdinalEncoder().fit_transform(X_cat)
    else:
        X_cat = encoder.transform(X_cat)

    # Merging our numeric and categorical features back together
    return pd.concat([X[num_features], X_cat], axis=1)


# Fit ONE encoder on the training data and reuse it for the test data.
# Fitting a separate encoder per frame would assign integer codes
# independently, so the same category could get different codes in the
# train and test outputs.
ordinal_encoder = ce.OrdinalEncoder().fit(X[cat_features].fillna(0))

X_preprocessed = preprocess(X, encoder=ordinal_encoder)
X_test_pp = preprocess(X_test, encoder=ordinal_encoder)



In [60]:
# Exporting our new data into csv files
path = '../data/project/'

# Training features and target are written first, then the test features.
for _csv_name, _frame in (
    ('X_train_preprocessed.csv', X_preprocessed),
    ('y_train_preprocessed.csv', y),
    ('X_test_preprocessed.csv', X_test_pp),
):
    _frame.to_csv(path + _csv_name)
