In [101]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr


%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

In [102]:
train = pd.read_csv("train.csv") # read train data
test = pd.read_csv("test.csv") # read test data

In [103]:
#log transform the features
train["SalePrice"] = np.log1p(train["SalePrice"])

def logTransform (train,test):
    
    numerical_features=test.select_dtypes(include=["float","int","bool"]).columns.values
    dataframes = [train,test]
    
    for df in dataframes:
        
        skewed_feats = df[numerical_features].apply(lambda x: skew(x.dropna())) #compute skewness
        skewed_feats = skewed_feats[skewed_feats > 0.75]
        skewed_feats = skewed_feats.index
        df[skewed_feats] = np.log1p(df[skewed_feats])
        
    return train,test


In [104]:
def get_features(train, test):
    trainval = list(train.columns.values) # list train features
    testval = list(test.columns.values) # list test features
    output = list(set(trainval) & set(testval)) # check wich features are in common (remove the outcome column)
    output.remove('Id') # remove non-usefull id column

    return output

def process_features(train,test):
    tables=[test,train]
    print ("Handling missing values...")
    total_missing=train.isnull().sum()
    to_delete=total_missing[total_missing>(1460/3.)] # select features with more than 1/3 missing values
    for table in tables:
        table.drop(to_delete.index.tolist(),axis=1, inplace=True)
            
    print ("Filling Nan...")
    numerical_features=test.select_dtypes(include=["float","int","bool"]).columns.values
    categorical_features=train.select_dtypes(include=["object"]).columns.values
    for table in tables: 
        for feature in numerical_features: 
            table[feature].fillna(train[feature].median(), inplace = True) # replace by median value
        for feature in categorical_features: 
            table[feature].fillna(train[feature].value_counts().idxmax(), inplace = True) # replace by most frequent value

    #print ("Handling categorical features...")
    #for feature in categorical_features: # Encode categorical features
    #   le = preprocessing.LabelEncoder()
    #   le.fit(train[feature])
    #   for table in tables: 
    #       table[feature]=le.transform(table[feature])
    
    print ("Getting features...")
    features = get_features(train,test)
    
    return train,test,features



In [105]:
def getDummies(df):
    
    le = preprocessing.LabelEncoder()
    df_str = df.select_dtypes(include=['object']) 
    df.drop(df_str.columns.values,axis=1,inplace=True)  
    df_str_dum = pd.get_dummies(df_str)
    
    return df_str_dum

def labelEncoding(train,test):
    
    train_str_dum = getDummies(train)
    test_str_dum  = getDummies(test)
    columns_dum = list(set(train_str_dum) & set(test_str_dum))

    train_str_dum = train_str_dum[columns_dum]
    test_str_dum = test_str_dum[columns_dum]

    #New train and New test
    train_flo = train.select_dtypes(exclude=['object'])
    test_flo = test.select_dtypes(exclude=['object']) 

    new_train = pd.merge(train_flo,train_str_dum,left_index=True,right_index=True)
    new_test = pd.merge(test_flo,test_str_dum,left_index=True,right_index=True)
    
    return new_train , new_test
    
    

In [106]:
def DimensionalityReduction(new_train,new_test):
    
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD, SparsePCA
    
    train_clf = new_train.copy()
    train_clf.drop('Id',axis=1,inplace=True)
    index = pd.DataFrame(test.Id,columns = ['Id'])
    test_clf = new_test.drop('Id',axis=1)
    train_clf1 = train_clf.copy()
    train_clf1['tot_sf'] = train_clf1['TotalBsmtSF'] + train_clf1['GrLivArea']
    train_clf1['ratio_fl'] = train_clf1['2ndFlrSF'] / train_clf1['1stFlrSF'] 
    train_clf1['garage_ex'] = (train_clf1['GarageQual_Gd'] + train_clf1['GarageQual_TA'] + train_clf1['GarageQual_Fa'] + train_clf1['GarageQual_Po']) * (train_clf1['GarageCond_Ex'])
  
    clus = KernelPCA(n_components = 25)
    train_clf2_pca = clus.fit_transform(train_clf1)
    train_clf2_pca = pd.DataFrame(train_clf2_pca)
    print(train_clf2_pca.shape)
    print(train_clf2_pca.columns)
    print(train_clf1.columns)
    train_clf2 = pd.merge(train_clf1,train_clf2_pca,left_index=True,right_index=True)

    test_clf2 = test_clf.drop(['LotFrontage','MasVnrArea','GarageYrBlt'],axis=1)
    test_clf2 = pd.merge(test_clf2,test[['LotFrontage','MasVnrArea','GarageYrBlt']],left_index=True,right_index=True)
    test_clf2['tot_sf'] = test_clf2['TotalBsmtSF'] + test_clf2['GrLivArea']
    test_clf2['ratio_fl'] = test_clf2['2ndFlrSF']  / test_clf2['1stFlrSF']
    test_clf2['garage_ex'] = (test_clf2['GarageQual_Gd'] + test_clf2['GarageQual_TA'] + test_clf2['GarageQual_Fa'] + test_clf2['GarageQual_Po']) * (test_clf2['GarageCond_Ex'])

    test_clf3 = pd.merge(test_clf2,pd.DataFrame(clus.transform(test_clf2)),left_index=True,right_index=True)

    index = pd.DataFrame(test.Id,columns = ['Id'])
    
    return train_clf3,test_clf3
    

In [107]:
train,test,features = process_features(train,test)
new_train,_new_test = logTransform(train,test)
print(new_train.shape, new_test.shape)
new_train , new_test = labelEncoding(train,test)
print(new_train.shape, new_test.shape)

Handling missing values...
Filling Nan...
Getting features...
(1460, 76) (1459, 255)
(1460, 256) (1459, 255)


In [109]:
#new_train , new_test = DimensionalityReduction(new_train,new_test)

In [None]:
# Write to new files

new_train.to_csv('train_cleaned.csv',index=False)
new_test.to_csv('test_cleaned.csv',index=False)