In [None]:
# Core data-handling and plotting libraries used throughout the notebook.
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
# Silence every warning raised from this point on.
# NOTE(review): this also hides deprecation warnings from the libraries above.
warnings.filterwarnings('ignore')

# Show all columns when a DataFrame is displayed instead of truncating them.
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt

In [None]:
# Data loading: read the training CSV, report its dimensions and preview it.
train_csv = '../input/house-prices-advanced-regression-techniques/train.csv'
train = pd.read_csv(train_csv)
print('train_dim:', train.shape)
train.head()

In [None]:
# Missing values: NaN count per column, largest first; columns without NaN are omitted.
nan_counts = train.isna().sum()
nan_counts[nan_counts > 0].sort_values(ascending=False)

In [None]:
# Percentage of missing values per feature.
# Nan_features lists every column that holds at least one NaN.
Nan_features = [features for features in train.columns if train[features].isna().any()]

for feature in Nan_features:
    # isna().mean() is a fraction in [0, 1]; scale it by 100 so the number
    # agrees with the '%' label printed next to it.
    print(feature, np.round(train[feature].isna().mean() * 100, 2), '% missing values')

In [None]:
# Does "being missing" relate to the sale price?
# For every feature with NaNs, compare the median SalePrice of the rows where
# the value is missing (1) with the rows where it is present (0).
for feature in Nan_features:
    # 0/1 indicator named after the feature so the grouped index keeps that name.
    is_missing = train[feature].isna().astype(int).rename(feature)
    train['SalePrice'].groupby(is_missing).median().plot.bar()
    plt.title(feature)
    plt.show()

In [None]:
# Numerical variables: every column whose dtype is not object.

numerical_features = [col for col, col_dtype in train.dtypes.items() if col_dtype != 'O']
print("number of numerical features: ", len(numerical_features))
train[numerical_features].head()

In [None]:
# Temporal (date-like) variables: column names containing 'Yr' or 'Year'.
year_markers = ('Yr', 'Year')
Yr_features = [col for col in train.columns if any(marker in col for marker in year_markers)]
print('number of Temporal Variables:', len(Yr_features))
train[Yr_features].head()

In [None]:
# Median sale price per calendar year, one line plot per temporal feature.
for feature in Yr_features:
    # groupby only reads from `train`, so no defensive copy is needed.
    train.groupby(feature)['SalePrice'].median().plot()
    plt.title('House Price vs ' + feature)  # title previously read "Prize"
    plt.xlabel(feature)
    plt.ylabel('median of Saleprice')
    plt.show()
    

In [None]:
## Median sale price against the number of years between each year feature
## and the year of sale (YrSold itself is skipped).

for feature in Yr_features:
    if feature == 'YrSold':
        continue
    # Named after the feature so the grouped index carries the same label.
    years_before_sale = (train['YrSold'] - train[feature]).rename(feature)
    train['SalePrice'].groupby(years_before_sale).median().plot()
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('median of sale price')
    plt.show()


In [None]:
## Same year differences as above, drawn as a scatter plot against SalePrice.

for feature in Yr_features:
    if feature == 'YrSold':
        continue
    years_before_sale = train['YrSold'] - train[feature]
    plt.scatter(years_before_sale, train['SalePrice'])
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()
        


In [None]:
# Number of distinct (non-null) values in every column.
for feature, n_unique in train.nunique().items():
    print(feature,'has', n_unique, 'unique values')

In [None]:
## Numerical variables come in two flavours: continuous and discrete.
## Here "discrete" means fewer than 50 distinct values, excluding the temporal
## columns and the 'Id' column.

# Built once instead of concatenating the lists for every candidate column.
not_discrete = set(Yr_features) | {'Id'}
discrete_feature = [
    features for features in numerical_features
    if train[features].nunique() < 50 and features not in not_discrete
]
print('number of discrete variables:', len(discrete_feature))
train[discrete_feature].head()

In [None]:
# Median sale price for each level of every discrete feature.

for feature in discrete_feature:
    median_by_level = train.groupby(feature)['SalePrice'].median()
    median_by_level.plot.bar()
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('median of sale price')
    plt.show()
    

In [None]:
# Continuous variables: numerical columns that are neither discrete, temporal nor 'Id'.
not_continuous = set(discrete_feature) | set(Yr_features) | {'Id'}
continuous_features = [col for col in numerical_features if col not in not_continuous]
print("number of continuous variables:", len(continuous_features))
train[continuous_features].head()


In [None]:
# Distribution (histogram) of every continuous feature.
for feature in continuous_features:
    # Drop NaNs explicitly so the bin range is always computed from finite values.
    plt.hist(train[feature].dropna())
    # Label each figure; without this the histograms cannot be told apart.
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('count')
    plt.show()

In [None]:
# Skewness of every numeric column.

from scipy.stats import skew

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = [col for col in train.columns if train[col].dtype in numeric_dtypes]
print(len(numerics2))

# Rank columns from most to least skewed and keep those above 0.5.
# NOTE(review): columns that contain NaN presumably yield a NaN skew here and
# therefore never pass the > 0.5 filter - confirm that this is intended.
skew_features = train[numerics2].apply(skew).sort_values(ascending=False)
high_skewed = skew_features[skew_features > 0.5]
print(high_skewed.shape)
skew_index = high_skewed.index
high_skewed

In [None]:
# Distribution (histogram + density estimate) of every continuous feature.
for feature in continuous_features:
    plt.figure(figsize=(10,5))
    # NOTE(review): distplot appears to be deprecated in recent seaborn
    # releases - confirm against the installed version before upgrading.
    sns.distplot(train[feature])
    # Title each figure so the plots can be told apart when skimming.
    plt.title(feature)
    plt.show()

### Log transform

In [None]:
# Distribution of every continuous feature after a natural-log transform.
# `data` ends up as a copy of `train` in which every strictly positive
# continuous feature has been replaced by its log; the following cell reads it.
data = train.copy()
for feature in continuous_features:
    # np.log is only finite for strictly positive values. Skip other features
    # explicitly instead of hiding the resulting failure behind a bare `except`.
    if (data[feature] <= 0).any():
        print('skipping {}: contains zero or negative values'.format(feature))
        continue
    data[feature] = np.log(data[feature])
    sns.distplot(data[feature])
    plt.title('log({})'.format(feature))
    plt.show()

In [None]:
# Skewness of the continuous features after a natural-log transform.
# The logs are computed here directly from `train` rather than read from a
# variable left behind by a plotting loop, so every reported value really is
# the skew of a log-transformed column.
log_ready = [f for f in continuous_features if not (train[f] <= 0).any()]
skew_log = pd.Series(
    # NaNs are dropped per column so they do not turn the statistic into NaN.
    {f: skew(np.log(train[f].dropna())) for f in log_ready},
    dtype=float,
).sort_values(ascending=False)
skew_log

### Box-Cox 

In [None]:
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Box-Cox transform every highly skewed column, overwriting it in `train`.
# NOTE(review): `train` is mutated in place, so re-running this cell transforms
# already-transformed values, and all later cells see the transformed columns.
for col in skew_index:
    # The lambda is estimated on the shifted (x + 1) values.
    best_lambda = boxcox_normmax(train[col] + 1)
    train[col] = boxcox1p(train[col], best_lambda)

In [None]:
# Skewness of the same columns now that Box-Cox has been applied.
skew_boxcox = train[skew_index].apply(skew).sort_values(ascending=False)
skew_boxcox

In [None]:
# distplot after box-cox
# for feature in skew_index:
#     data = train.copy()
#     plt.figure(figsize=(10,5))
#     sns.distplot(data[feature])
# #     plt.title(feature)
# #     plt.xlabel(feature)
# #     plt.ylabel('median of SalePrice')
#     plt.show()

In [None]:
# for feature in continuous_features:
#     data = train.copy()
#     if 0 in data[feature].unique():
#         pass
#     else:
#         plt.scatter(data[feature], data['SalePrice'])
#         plt.title(feature)
#         plt.xlabel(feature)
#         plt.ylabel('SalePrice')
#         plt.show()

In [None]:
# log(feature) vs log(SalePrice) scatter plots for the continuous features.
# The target is logged once, outside the loop.
log_sale_price = np.log(train['SalePrice'])
for feature in continuous_features:
    if feature == 'SalePrice':
        # Plotting the target against itself carries no information, and
        # logging it as both feature and target would apply the log twice.
        continue
    if (train[feature] <= 0).any():
        # np.log is not finite for zero or negative values.
        continue
    plt.scatter(np.log(train[feature]), log_sale_price)
    plt.title(feature)
    plt.xlabel('log({})'.format(feature))
    plt.ylabel('log(SalePrice)')
    plt.show()

In [None]:
# Outliers: box plot of each log-transformed continuous feature.
# Features that contain a zero are skipped.

for features in continuous_features:
    if 0 in train[features].unique():
        continue
    logged = np.log(train[[features]])
    logged.boxplot(column = features)
    plt.show()

In [None]:
# Categorical variables: columns stored with object dtype.
cat_features = [col for col, col_dtype in train.dtypes.items() if col_dtype == 'O']
print('number of categorical variables:', len(cat_features))
train[cat_features].head()

In [None]:
# Number of distinct categories in each categorical column.
category_counts = train[cat_features].nunique()
for feature, n_categories in category_counts.items():
    print('{} has {} unique categories'.format(feature, n_categories))

In [None]:
# Median SalePrice for each category of every categorical feature.
for feature in cat_features:
    median_by_category = train.groupby(feature)['SalePrice'].median()
    median_by_category.plot.bar()
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('median of Sale price')
    plt.show()

In [None]:
# Categorical (object dtype) features that contain missing values.
# The threshold is "> 0": a column with exactly one NaN must be included too,
# otherwise it would never be filled with the replacement label.
features_cat_nan = [features for features in train.columns if train[features].isnull().sum()>0 and train[features].dtypes=='O']

for features in features_cat_nan:
    # isnull().mean() is a fraction; scale by 100 to match the '%' label.
    print('{}: {} % of missing values'.format(features, np.round(train[features].isnull().mean()*100, 2)))

In [None]:
## Give missing categorical values their own explicit label, then verify
## that no NaN is left in those columns.
filled_categoricals = train[features_cat_nan].fillna('Missing')
train[features_cat_nan] = filled_categoricals
train[features_cat_nan].isnull().sum()

In [None]:
# Numerical (non-object) features that contain missing values.
features_nue_nan = [features for features in train.columns if train[features].isna().any() and train[features].dtypes != 'O']

for features in features_nue_nan:
    # isnull().mean() is a fraction; scale by 100 to match the '%' label.
    print('{}: {} % of missing values'.format(features, np.round(train[features].isnull().mean()*100, 2)))

In [None]:
# Impute missing numerical values with the column median and record, in a
# companion '<feature>Nan' column, which rows were originally missing.
for features in features_nue_nan:
    column = train[features]
    flag_name = features+'Nan'
    # Only create the indicator once: on a re-run the NaNs are already filled
    # and recomputing it would overwrite the flags with zeros.
    if flag_name not in train.columns:
        train[flag_name] = np.where(column.isna(),1,0)
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column; in-place filling of a selection may not reach `train`.
    train[features] = column.fillna(column.median())

train[features_nue_nan].isna().sum()

In [None]:
# Report the frame's current dimensions and preview its first rows.
print(train.shape)
train.head()

In [None]:
## Temporal variables (date/time): columns whose name mentions 'Yr' or 'Year'.

temporal_features = [name for name in train.columns if any(tag in name for tag in ('Yr', 'Year'))]
temporal_features

In [None]:
# Replace the build / remodel / garage years with "years before the sale".
# NOTE(review): the columns are overwritten in place, so re-running this cell
# subtracts from the already-converted values.
temporal_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']

year_sold = train['YrSold']
for col in temporal_features:
    train[col] = year_sold - train[col]

train[temporal_features].head()

In [None]:
# The listed numerical variables are skewed, so replace each with its natural log.
# NOTE(review): some of these columns may already have been transformed in
# place by an earlier cell - confirm that stacking the transforms is intended.
num_features=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']
train[num_features] = np.log(train[num_features])

In [None]:
# Handling rare categorical labels.
# Collect the object-dtype columns whose labels are examined in the next cell.

categorical_features = [name for name, kind in train.dtypes.items() if kind == 'O']
print(categorical_features)

In [None]:
# Collapse infrequent labels: any label whose share of rows is not above 1%
# is replaced by the single bucket 'Rare_var'.
for feature in categorical_features:
    label_share = train.groupby(feature)['SalePrice'].count() / len(train)
    frequent_labels = label_share.index[label_share > 0.01]
    keep = train[feature].isin(frequent_labels)
    train[feature] = train[feature].where(keep, 'Rare_var')

In [None]:
# Share of rows per LotShape label (SalePrice count per label / total rows).
train.groupby('LotShape')['SalePrice'].count()/len(train)


In [None]:
# Preview the first rows of the processed frame.
train.head()