# This is the initial EDA of the dataset
## Note: This notebook is still under development; feature engineering and model building are yet to be added

## Please like if you found it helpful, and feel free to suggest any changes

# Imports and reading data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install pandas-flavor

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)



## Create custom methods for pandas dataframe for ease of use

In [None]:
from pandas_flavor import register_dataframe_method

@register_dataframe_method
def missing(df):
    """Summarise the columns of ``df`` that contain missing values.

    Returns a list of ``(column name, dtype as string, percent missing)`` tuples,
    one per column holding at least one NaN, ordered from the highest percentage
    to the lowest. Percentages are rounded to two decimals.
    """
    rows = []
    for col in df.columns:
        n_missing = df[col].isna().sum()
        if n_missing != 0:
            pct_missing = np.round(n_missing / len(df) * 100, 2)
            rows.append((col, str(df[col].dtypes), pct_missing))
    # Stable sort: columns with equal percentages keep their original order.
    rows.sort(key=lambda row: row[2], reverse=True)
    return rows
    
@register_dataframe_method
def get_numeric_df(df):
    """Return the numeric-dtype columns of ``df`` as a DataFrame."""
    numeric_part = df.select_dtypes(include=np.number)
    return numeric_part

@register_dataframe_method
def get_numeric_col_names(df):
    """Return the names of the numeric-dtype columns of ``df`` as a list."""
    numeric_columns = df.select_dtypes(include=np.number).columns
    return list(numeric_columns)

@register_dataframe_method
def discrete_features(df,thresold):
    """Return the columns of ``df`` with fewer than ``thresold`` distinct values.

    ``thresold`` is a count of unique values. NaN is counted as a value because
    the count comes from ``Series.unique()``.
    """
    selected = []
    for feature in df.columns:
        n_unique = len(df[feature].unique())
        if n_unique < thresold:
            selected.append(feature)
    return selected

@register_dataframe_method
def continious_features(df,thresold):
    """Return the columns of ``df`` with at least ``thresold`` distinct values.

    ``thresold`` is a count of unique values. NaN is counted as a value because
    the count comes from ``Series.unique()``.
    """
    selected = []
    for feature in df.columns:
        n_unique = len(df[feature].unique())
        if n_unique >= thresold:
            selected.append(feature)
    return selected

@register_dataframe_method
def about(df):
    """Print an overview of ``df``.

    Shows, in order: the shape, the number of columns per dtype, the
    missing-value summary produced by ``missing``, and the transposed
    ``describe()`` table. Returns None.
    """
    divider = "==" * 50

    print('shape of data')
    print(df.shape)
    print(divider)

    dtype_counts = df.dtypes.value_counts(ascending=False)
    print(f"Datatypes in data\n{dtype_counts}")
    print(divider)

    print("Percentage of missing values")
    missing_rows = missing(df)
    # One (column, dtype, percent) tuple per line.
    print(*missing_rows, sep="\n")
    print(divider)

    print('data description \n')
    summary = df.describe().T
    display(summary)
    print(divider)

In [None]:
# Load the training data and drop the Id column.
train_path = "../input/house-prices-advanced-regression-techniques/train.csv"
df = pd.read_csv(train_path).drop(columns=['Id'])

In [None]:
# Print the overview of df: shape, dtype counts, missing percentages and describe table.
df.about()

# Let's drop features with a high percentage of missing values

In [None]:
# (column, dtype, percent missing) tuples, sorted by percent missing in descending order.
missing_val = df.missing()
missing_val

In [None]:
# Names of the columns where more than 15% of the values are missing.
drop_me = [name for name, _dtype, pct_missing in missing_val if pct_missing > 15]
drop_me

In [None]:
# Remove the sparsely populated columns, then preview the remaining frame.
df = df.drop(columns=drop_me)
df.head()

In [None]:
# Columns with fewer than 50 distinct values are treated as discrete.
discrete = df.discrete_features(thresold=50)
print(discrete)

In [None]:
# Number of discrete features.
len(discrete)

In [None]:
# Columns with at least 50 distinct values are treated as continuous.
continious = df.continious_features(thresold=50)
continious

In [None]:
# Number of continuous features.
len(continious)

In [None]:
# Inspect the distinct values of YrSold.
df["YrSold"].unique()

# Analyse continuous features
**initial guesses of which features matter**

**P = proportional**
**IP = Inversely proportional**
**SP = SalePrice**

- **LotArea:** Lot size in sqft, probably the area of the house. *P to SP*. May also depend on the location of the plot
- **YearBuilt:** Year of construction. *Ideally the age of the house is IP to SP* (newer houses sell for more), though some old mansions may be sold at a higher price
- **YearRemodAdd:** Remodel date. *P to SP*, since remodelling adds extra cost
- **TotalBsmtSF** *P to SP*
- **1stFlrSF** and **2ndFlrSF** area in sqft of 1st and 2nd floor *P to SP*


In [None]:
# Distribution of the raw SalePrice with a KDE overlay.
sns.displot(data=df, x='SalePrice', kde=True, aspect=3)

**log transform to reduce skew**

In [None]:
# NOTE: SalePrice is overwritten in place, so re-running this cell applies the log
# a second time; reload df before re-running.
log_price = np.log(df['SalePrice'])
df['SalePrice'] = log_price
sns.displot(log_price, kde=True, aspect=3)

In [None]:
# Plotting numerical features with polynomial order to detect outliers.

# https://www.kaggle.com/datafan07/beginner-eda-with-feature-eng-and-blending-models/notebook

from matplotlib.ticker import MaxNLocator

def srt_reg(y, df,x_size=30,y_size=30):
    """Draw a grid of cubic regression plots of each column of ``df`` against ``y``.

    Parameters
    ----------
    y : str
        Name of the target column. It must be present in ``df``; it is used as
        the y-axis of every panel and is not plotted against itself.
    df : pandas.DataFrame
        Frame holding the feature columns together with ``y``.
    x_size, y_size : number
        Width and height of the figure.

    Returns
    -------
    None
        The figure is drawn as a side effect. Nothing is drawn when ``df`` has
        no column other than ``y``.
    """
    ncols = 3
    # Plot features only: regressing the target on itself adds no information,
    # and the sibling helpers (violin, bar_with_variable) skip it as well.
    features = [col for col in df.columns if col != y]
    if not features:
        return

    nrows = int(np.ceil(len(features) / ncols))
    # squeeze=False keeps `axes` a 2-D array for every grid size.
    fig, axes = plt.subplots(nrows, ncols, figsize=(x_size, y_size), squeeze=False)
    axes = axes.flatten()

    for feature, ax in zip(features, axes):
        sns.regplot(x=feature,
                    y=y,
                    data=df,
                    ax=ax,
                    order=3,
                    ci=None,
                    color='#e74c3c',
                    line_kws={'color': 'black'},
                    scatter_kws={'alpha': 0.4})
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=10))

    # Hide panels left over when the feature count is not a multiple of ncols.
    for ax in axes[len(features):]:
        ax.set_visible(False)

    # Lay the grid out once, after every panel is drawn, instead of on each iteration.
    fig.tight_layout()

In [None]:

def srt_box(y, df, x_size=30, y_size=30):
    """Draw one box plot of ``y`` per column of ``df``, categories sorted by median.

    Parameters
    ----------
    y : str
        Name of the target column. It must be present in ``df`` and is not
        plotted against itself.
    df : pandas.DataFrame
        Frame holding the grouping columns together with ``y``.
    x_size, y_size : number
        Width and height of the figure (default 30 x 30, the size that was
        previously hard-coded).

    Returns
    -------
    None
        The figure is drawn as a side effect. Nothing is drawn when ``df`` has
        no column other than ``y``.
    """
    ncols = 3
    features = [col for col in df.columns if col != y]
    if not features:
        return

    # Size the grid from the data instead of a fixed 19 x 3 layout, so no column
    # is silently dropped and few columns do not leave dozens of blank panels.
    nrows = int(np.ceil(len(features) / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(x_size, y_size), squeeze=False)
    axes = axes.flatten()

    for feature, ax in zip(features, axes):
        # Order the categories by descending median of y.
        medians = df.groupby(feature)[y].median().sort_values(ascending=False)
        sns.boxplot(x=feature,
                    y=y,
                    data=df,
                    palette='plasma',
                    order=medians.index,
                    ax=ax)
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=18))

    # Hide panels left over when the feature count is not a multiple of ncols.
    for ax in axes[len(features):]:
        ax.set_visible(False)

    # Lay the grid out once, after every panel is drawn.
    fig.tight_layout()



In [None]:
# Cubic regression plot of each continuous feature against SalePrice.
srt_reg('SalePrice',df[continious])

## Apply standard scaling on certain continuous features

In [None]:
# Work on a copy so that the scaling applied to df_tmp does not modify df.
df_tmp = df.copy()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
cols = ["LotArea","MasVnrArea","BsmtFinSF1","TotalBsmtSF","1stFlrSF","2ndFlrSF","GrLivArea"]

# Standardise all selected columns in a single call. StandardScaler scales each
# column independently, so the values match a column-by-column fit, while `scaler`
# now keeps the statistics of every column instead of only the last one fitted.
scaler = StandardScaler()
df_tmp[cols] = scaler.fit_transform(df_tmp[cols])

In [None]:
# Same regression grid, drawn on the standardised columns of df_tmp.
srt_reg('SalePrice',df_tmp[cols + ["SalePrice"]],x_size=30,y_size=15)

## Observing the trend

- GrLivArea
- LotArea
- BsmtFinSF1
- TotalBsmtSF
- 1stFlrSF
- 2ndFlrSF
- GrLivArea

have clear trends, which is as guessed before

## Let's handle year / temporal features

It makes sense to look at **year sold - year built** and **year sold - year remodelled**

In [None]:
# Year-like columns among the continuous features.
year_feature = [name for name in continious if 'Yr' in name or 'Year' in name]
print(year_feature)

# Express each year column as an age relative to the year of sale.
sale_year = df["YrSold"]
df["House_Age"] = sale_year - df["YearBuilt"]
df["House_Remod_Age"] = sale_year - df["YearRemodAdd"]
df["House_Garage_Age"] = sale_year - df["GarageYrBlt"]

# Remove the raw year columns (and MoSold) now that the ages are derived.
drop_temporal = ["YrSold","YearBuilt","YearRemodAdd","GarageYrBlt","MoSold"]
df = df.drop(columns=drop_temporal)


In [None]:
# Regression plots of the three derived age features against SalePrice.
features = ["House_Age", "House_Remod_Age","House_Garage_Age"]
srt_reg("SalePrice", df[features + ["SalePrice"]],x_size=40,y_size=15)

## Temporal features conclusion
The trends are exactly as anticipated

## Exploring discrete features

In [None]:
# Remove the dropped temporal columns from the discrete feature list;
# names that are not in the list are simply skipped.
for name in drop_temporal:
    if name in discrete:
        discrete.remove(name)
print(discrete)




### Distribution of discrete features

In [None]:
def histplot(df,x_size=30,y_size=60):
    """Draw a histogram of every column of ``df`` on a three-column grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one panel per column.
    x_size, y_size : number
        Width and height of the figure.

    Returns
    -------
    None
        The figure is drawn as a side effect. Nothing is drawn when ``df`` has
        no columns.
    """
    ncols = 3
    n_panels = df.shape[1]
    if n_panels == 0:
        # A grid with zero rows cannot be created; there is nothing to plot.
        return

    nrows = int(np.ceil(n_panels / ncols))
    # squeeze=False keeps `axes` a 2-D array for every grid size.
    fig, axes = plt.subplots(nrows, ncols, figsize=(x_size, y_size), squeeze=False)
    axes = axes.flatten()

    for column, ax in zip(df.columns, axes):
        sns.histplot(df[column], ax=ax)
        ax.tick_params(labelrotation=45)

    # Hide panels left over when the column count is not a multiple of ncols.
    for ax in axes[n_panels:]:
        ax.set_visible(False)

    # Lay the grid out once, after every panel is drawn, instead of on each iteration.
    fig.tight_layout()
    
def bar_with_variable(df,y,x_size=30,y_size=30):
    """Draw, for each column of ``df``, a bar chart of the median of ``y`` per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the grouping columns together with ``y``.
    y : str
        Name of the target column; it is not plotted against itself.
    x_size, y_size : number
        Width and height of the figure.

    Returns
    -------
    None
        The figure is drawn as a side effect. Nothing is drawn when ``df`` has
        no column other than ``y``.
    """
    ncols = 3
    # Filter the target out up front so it neither inflates the grid nor leaves
    # a blank panel wherever it sits in the column order.
    features = [col for col in df.columns if col != y]
    if not features:
        return

    nrows = int(np.ceil(len(features) / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(x_size, y_size), squeeze=False)
    axes = axes.flatten()

    for feature, ax in zip(features, axes):
        # One row per category holding the median of y.
        medians = df.groupby(feature)[y].median().reset_index()
        sns.barplot(data=medians, x=feature, y=y, ax=ax)
        ax.tick_params(labelrotation=45)

    # Hide panels left over when the feature count is not a multiple of ncols.
    for ax in axes[len(features):]:
        ax.set_visible(False)

    # Lay the grid out once, after every panel is drawn.
    fig.tight_layout()
    
def violin(df,y,x_size=30,y_size=30):
    """Draw one violin plot of ``y`` per column of ``df`` on a three-column grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the grouping columns together with ``y``.
    y : str
        Name of the target column; it is not plotted against itself.
    x_size, y_size : number
        Width and height of the figure.

    Returns
    -------
    None
        The figure is drawn as a side effect. Nothing is drawn when ``df`` has
        no column other than ``y``.
    """
    ncols = 3
    # Filter the target out up front so it neither inflates the grid nor leaves
    # a blank panel wherever it sits in the column order.
    features = [col for col in df.columns if col != y]
    if not features:
        return

    nrows = int(np.ceil(len(features) / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(x_size, y_size), squeeze=False)
    axes = axes.flatten()

    for feature, ax in zip(features, axes):
        sns.violinplot(data=df, x=feature, y=y, ax=ax)

    # Hide panels left over when the feature count is not a multiple of ncols.
    for ax in axes[len(features):]:
        ax.set_visible(False)

    # Lay the grid out once, after every panel is drawn.
    fig.tight_layout()
    
    


In [None]:
# Violin plot of SalePrice for every discrete feature.
violin(df[discrete+["SalePrice"]],y="SalePrice",y_size=90)

In [None]:
# Histogram of every discrete feature.
histplot(df[discrete],y_size=100)

# Closing Thoughts
Looking at the graphs, the features which look important are
(note we need to verify these with feature selection)


**Continuous Features**

- LotArea (needs outlier handling)
- BsmtFinSF1
- TotalBsmtSF
- 1stFlrSF
- 2ndFlrSF
- GrLivArea
- GarageYear
- House_Age
- House_Remod_Age
- House_Garage_Age

**Discrete Features**
(some features distributions are not uniform, hence need to reduce cardinality)
- MSSubClass(sine looking pattern)
- HouseStyle
- OverallQual
- TotRmsAbvGrd
- Fireplaces(maybe?)
- GarageCars


