In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Imports and notebook-wide configuration for the analysis below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn import feature_selection
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas and scikit-learn.
warnings.filterwarnings('ignore')
# Shared random seed for any stochastic step that accepts one.
SEED = 42

In [None]:
# Load the test split from the competition's input directory.
test_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# Load the training split (includes the SalePrice target used below).
train_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

**Data Visualization**

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the first rows of the test data.
test_data.head()

In [None]:
# (rows, columns) of the training data.
train_data.shape

In [None]:
# (rows, columns) of the test data.
test_data.shape

In [None]:
# Column dtypes and non-null counts for the training data.
train_data.info()

In [None]:
# For each split, print only the columns that contain at least one missing value,
# together with how many values are missing.
for frame in (train_data, test_data):
    missing_per_column = frame.isnull().sum()
    has_missing = missing_per_column != 0
    print(missing_per_column[has_missing], '\n')

In [None]:
# Column dtypes and non-null counts for the test data.
test_data.info()

In [None]:
# Store log(1 + SalePrice) on the training frame, plot its distribution,
# and end the cell with its skewness so the value is displayed.
log_sale_price = np.log1p(train_data['SalePrice'])
train_data['LT_Salesprice'] = log_sale_price
plt.hist(log_sale_price, color='black')
plt.show()
log_sale_price.skew()

In [None]:
# Explore the pairwise correlations between the numeric columns of the training set.
# numeric_only=True is needed because train_data still contains string-valued
# columns at this point, and DataFrame.corr in pandas >= 2.0 raises on them
# instead of silently skipping them.
plt.figure(figsize=(20, 20))
sns.heatmap(train_data.corr(numeric_only=True))
plt.show()

**Data Cleaning**

In [None]:
def _impute_missing(df):
    """Return ``df`` without the five dropped columns and with known gaps filled."""
    df = df.drop(columns=['Alley', 'FireplaceQu', 'PoolQC', 'MiscFeature', 'Fence'])

    # Assign the filled column back instead of ``df[col].fillna(..., inplace=True)``:
    # the chained in-place form operates on a temporary object and is not
    # guaranteed to write through to ``df``.
    df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())

    mode_filled = [
        'MSZoning',
        # basement
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'BsmtFinSF1', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath',
        'TotalBsmtSF',
        # garage
        'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
        'GarageCars', 'GarageArea',
        # everything else
        'SaleType', 'Functional', 'KitchenQual', 'Electrical', 'MasVnrType',
        'Exterior1st', 'Exterior2nd', 'Utilities', 'MasVnrArea',
    ]
    for column in mode_filled:
        modes = df[column].mode()
        if not modes.empty:  # an all-missing column has no mode to fill with
            df[column] = df[column].fillna(modes.iloc[0])
    return df


def _plot_diagnostics(df, numerical_features):
    """Draw the exploratory plots: price by sale year, log-log scatters, log boxplots."""
    year_features = [name for name in numerical_features
                     if 'Yr' in name or 'Year' in name]

    df.groupby('YrSold')['SalePrice'].median().plot()
    plt.show()

    # Numerical features are split into discrete (< 25 distinct values) and
    # continuous; year columns and the row identifier belong to neither group.
    excluded = set(year_features) | {'Id'}
    candidates = [name for name in numerical_features if name not in excluded]
    continuous = [name for name in candidates
                  if df[name].nunique(dropna=False) >= 25]

    # Features containing an exact zero are skipped because log(0) is undefined.
    plottable = [name for name in continuous if not (df[name] == 0).any()]

    log_price = np.log(df['SalePrice'])
    for feature in plottable:
        plt.scatter(np.log(df[feature]), log_price)
        plt.xlabel(feature)
        plt.ylabel('Salesprice')
        plt.show()

    # Boxplots on the log scale to eyeball outliers.
    for feature in plottable:
        np.log(df[feature]).to_frame().boxplot(column=feature)
        plt.ylabel(feature)
        plt.title(feature)
        plt.show()


def _encode_categoricals(df, categorical_features, rare_threshold):
    """Collapse infrequent levels into 'Rare_var', then one-hot encode the columns."""
    for feature in categorical_features:
        # Share of ALL rows per level. Counting a target column instead would
        # ignore every row whose target is missing and understate each share.
        shares = df[feature].value_counts() / len(df)
        frequent = shares[shares > rare_threshold].index
        df[feature] = df[feature].where(df[feature].isin(frequent), 'Rare_var')

    if categorical_features:
        # ``columns=`` prefixes every dummy with its source column, so levels
        # shared by several features (e.g. 'Rare_var') stay distinguishable.
        df = pd.get_dummies(df, columns=categorical_features)
    return df


def data_cleaning(df, show_plots=True, rare_threshold=0.01):
    """Impute, transform and one-hot encode a house-prices frame.

    The input frame is left untouched; a cleaned copy is returned.

    Steps, in order:
      1. Drop 'Alley', 'FireplaceQu', 'PoolQC', 'MiscFeature' and 'Fence'.
      2. Fill 'LotFrontage' with its mean and the remaining listed columns
         with their most frequent value.
      3. Print the number of numerical columns and, if ``show_plots`` is true,
         draw the exploratory plots.
      4. Replace 'YearBuilt', 'YearRemodAdd' and 'GarageYrBlt' with the number
         of years between that year and 'YrSold'.
      5. Replace categorical levels whose share of rows is not above
         ``rare_threshold`` with 'Rare_var'.
      6. One-hot encode the categorical columns as '<column>_<level>'.
      7. Drop the helper column 'LT_Salesprice' when it is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above plus 'YrSold'. 'SalePrice'
        is only required when ``show_plots`` is true.
    show_plots : bool, default True
        Whether to draw the matplotlib diagnostics.
    rare_threshold : float, default 0.01
        Minimum share of rows (exclusive) a categorical level needs to be kept.

    Returns
    -------
    pandas.DataFrame
        The cleaned, fully encoded frame with uniquely named columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    cleaned = _impute_missing(df.copy())

    numerical_features = [name for name in cleaned.columns
                          if pd.api.types.is_numeric_dtype(cleaned[name])]
    print('Number of numerical variables: ', len(numerical_features))

    if show_plots:
        _plot_diagnostics(cleaned, numerical_features)

    # Express the year columns as an age relative to the year of sale.
    for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
        cleaned[feature] = cleaned['YrSold'] - cleaned[feature]

    numerical = set(numerical_features)
    categorical_features = [name for name in cleaned.columns if name not in numerical]
    cleaned = _encode_categoricals(cleaned, categorical_features, rare_threshold)

    # The log-target helper column only exists if an earlier cell added it.
    return cleaned.drop(columns='LT_Salesprice', errors='ignore')

**Merging Train & Test Data**

In [None]:
# Stack train on top of test so both receive identical cleaning and encoding.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both inputs contribute their own 0-based labels and the index has duplicates.
Dataset = pd.concat([train_data, test_data], ignore_index=True)
clean_data = data_cleaning(Dataset)

**Splitting the merged data back into train and test sets**

In [None]:
# The combined frame was built as train rows followed by test rows, so every
# row after the training block is a test row. The boundary is taken from
# train_data instead of a hard-coded row count so it follows the loaded file.
clean_test = clean_data.iloc[len(train_data):, :]
clean_test.to_csv('CleanTestData.csv', index=False)

In [None]:
# The combined frame was built as train rows followed by test rows, so the
# first len(train_data) rows are the training rows. The boundary is taken from
# train_data instead of a hard-coded row count so it follows the loaded file.
clean_train = clean_data.iloc[:len(train_data), :]
clean_train.to_csv('CleanTrainData.csv', index=False)

In [None]:
# Separate the SalePrice target from the predictor columns of each split.
y_train = clean_train['SalePrice']
X_train = clean_train.drop(columns='SalePrice')
X_test = clean_test.drop(columns='SalePrice')

**RandomForestRegressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random_state=SEED makes the forest reproducible across Restart & Run All.
model = RandomForestRegressor(n_estimators=25, random_state=SEED)

# 'Id' is only a row identifier, so it is left out of the features the model
# sees. X_train / X_test themselves keep the column because the submission
# cell reads X_test['Id'].
model.fit(X_train.drop(columns='Id'), y_train)
y_test = model.predict(X_test.drop(columns='Id'))
y_test

**Exporting Predicted Values**

In [None]:
# Pair each test-row Id with its predicted SalePrice and write the submission file.
submission = pd.DataFrame({
    'Id': X_test['Id'],
    'SalePrice': y_test,
})
submission.to_csv('MySubmission.csv', index=False)

print("submission succesfull")