In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import neighbors
from math import sqrt

In [2]:
# Labelled training data; its SalePrice column is used as the regression target further down.
df1 = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
df1

In [3]:
# Second competition file; it is preprocessed later in the notebook and passed to
# mlr.predict to build the submission.
valid = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
valid

In [4]:
valid.info()

In [5]:
df1.head()

In [6]:
df1.describe()

In [7]:
df1.info()

In [8]:
df1.isnull().sum()

In [9]:
df1.isnull().mean()

In [10]:
def missing(df1):
    """Summarise how much data is missing in each column of a frame.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df1`` with two columns: ``Missing_Number``
        (count of null cells) and ``Missing_Percent`` (null cells as a
        percentage of the number of rows). Both inputs are sorted in
        descending order before being combined.
    """
    null_mask = df1.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask is the row count, so this is a percentage of rows.
    null_share = (null_counts / null_mask.count()) * 100
    summary = pd.concat(
        [
            null_counts.sort_values(ascending=False),
            null_share.sort_values(ascending=False),
        ],
        axis=1,
        keys=['Missing_Number', 'Missing_Percent'],
    )
    return summary

In [11]:
missing(df1)

In [13]:
# Remove every column in which more than 40% of the values are missing.
sparse_cols = [col for col in df1.columns if df1[col].isnull().mean() * 100 > 40]
df1.drop(columns=sparse_cols, inplace=True)

In [14]:
df1

In [18]:
df1.columns

In [19]:
# Bar chart of how many columns have each dtype.
# The values are passed as x= explicitly: recent seaborn releases treat the first
# positional argument as `data`, so a bare positional Series is no longer read as x.
sns.countplot(x=df1.dtypes.map(str))
plt.show()

In [20]:
df1.dtypes.value_counts()

In [21]:
def f(x):
    """Return the fill value for one column (or one group of a column).

    Numeric data -> median; anything else -> most frequent value.
    Returns NaN when there is no non-null value to take a mode from, instead of
    raising IndexError on an empty ``mode()`` result.
    """
    is_numeric = pd.api.types.is_numeric_dtype(x) and not pd.api.types.is_bool_dtype(x)
    if is_numeric:
        return x.median()
    modes = x.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# First pass: fill each gap with the statistic of the rows sold in the same year.
df1 = df1.fillna(df1.groupby('YrSold').transform(f))
# Second pass: a year whose values are all missing for a column leaves NaN behind;
# fall back to the whole-column statistic so no gaps reach the models.
df1 = df1.fillna(df1.apply(f).dropna())
df1
df1

In [22]:
# Print a block of descriptive statistics for every column that is not of object dtype.
for col in df1.columns:
    series = df1[col]
    if series.dtypes == object:
        continue

    q1 = series.quantile(0.25)
    q2 = series.quantile(0.50)
    q3 = series.quantile(0.75)
    IQR = q3 - q1
    # Lower / upper limits of the 1.5 * IQR rule.
    llp = q1 - 1.5 * IQR
    ulp = q3 + 1.5 * IQR

    report = [
        ('column name', col),
        ('q1', q1),
        ('q2', q2),
        ('q3', q3),
        ('IQR', IQR),
        ('llp', llp),
        ('ulp', ulp),
        ('mean:', series.mean()),
        ('median:', series.median()),
        ('mode', series.mode()[0]),
        ('skewness:', series.skew()),
        ('kurtosis:', series.kurtosis()),
        ('std', series.std()),
        ('max', series.max()),
        ('min', series.min()),
        ('null_value count:', series.isnull().sum()),
    ]
    for label, value in report:
        print(label, value)
    print('\n')

In [23]:
df1.dtypes

In [24]:
df1['MSZoning'].unique()

In [25]:
df1['RoofMatl'].unique()

In [26]:
# Count, per column, the values outside the 1.5 * IQR fences.
# df1 still contains object (string) columns at this point, and quantiles and
# ordering comparisons are undefined for them: DataFrame.quantile raises on
# non-numeric columns when numeric_only is False. Work on the numeric columns only.
numeric_df = df1.select_dtypes(include=np.number)
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
print('outliers count of each columns')
((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).sum()

In [27]:
# Distribution (histogram + KDE) of every non-object column, one panel per column.
numeric_cols = [c for c in df1.columns if df1[c].dtypes != 'object']
n_grid_cols = 7
# Six rows reproduce the original 6x7 layout; add rows instead of failing in
# plt.subplot when there are more than 42 columns to draw.
n_grid_rows = max(6, -(-len(numeric_cols) // n_grid_cols))

# plt.figure rather than plt.subplots: the latter also creates one full-size Axes
# that would sit behind the grid built with plt.subplot below.
plt.figure(figsize=(30, 25))
for count, i in enumerate(numeric_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, count)
    # histplot(kde=True, stat='density') is the replacement for the deprecated distplot.
    sns.histplot(df1[i], kde=True, stat='density')

plt.show()

In [28]:
# Box plot of every non-object column, one panel per column.
numeric_cols = [c for c in df1.columns if df1[c].dtypes != 'object']
n_grid_cols = 7
# Six rows reproduce the original 6x7 layout; add rows instead of failing in
# plt.subplot when there are more than 42 columns to draw.
n_grid_rows = max(6, -(-len(numeric_cols) // n_grid_cols))

# plt.figure rather than plt.subplots: the latter also creates one full-size Axes
# that would sit behind the grid built with plt.subplot below.
plt.figure(figsize=(30, 25))
for count, i in enumerate(numeric_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, count)
    # x= is explicit because recent seaborn releases read a bare positional
    # argument as `data` rather than as the variable to plot.
    sns.boxplot(x=df1[i])

plt.show()

In [29]:
df1.dtypes

In [31]:
# Requires the third-party autoviz package: pip install autoviz
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
# AutoViz is pointed at the raw training CSV on disk, not at df1, so the column drops
# and imputation applied to df1 above are not reflected in what it reports.
# NOTE(review): the type of the returned object (df_av) is not visible here - confirm
# against the autoviz documentation before using it downstream.
df_av = AV.AutoViz('../input/house-prices-advanced-regression-techniques/train.csv')

In [32]:
# Replace every object column of df1 with integer codes.
# One fitted encoder is kept per column: a single shared encoder is overwritten by
# each fit_transform, which loses every mapping and makes it impossible to apply the
# same codes to other data or to map codes back to labels.
label_encoders = {}
for col in df1.columns:
    if df1[col].dtypes == object:
        le = LabelEncoder()
        df1[col] = le.fit_transform(df1[col])
        label_encoders[col] = le

In [33]:
# Target vector and predictor matrix (every remaining column of df1).
y = df1['SalePrice']
X = df1.drop(columns=['SalePrice'])

In [34]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [35]:
def train_models(X_train, y_train, X_test=None, y_test=None):
    """Fit several regressors and print their hold-out metrics.

    Models fitted, in order: decision tree, random forest, SVR (rbf kernel),
    SVR (linear kernel) and k-nearest neighbours. For each one the R2 score,
    mean absolute error, mean squared error and root mean squared error on the
    evaluation data are printed.

    Parameters
    ----------
    X_train, y_train
        Training predictors and target.
    X_test, y_test : optional
        Evaluation predictors and target. When omitted, the notebook-level
        variables ``X_test`` and ``y_test`` are used, so existing
        ``train_models(X_train, y_train)`` calls keep working.

    Returns
    -------
    None
        Results are only printed.

    Raises
    ------
    KeyError
        If ``X_test`` / ``y_test`` are omitted and no notebook-level variable of
        that name exists.
    """
    # Backward-compatible fallback to the notebook-level hold-out split.
    if X_test is None:
        X_test = globals()['X_test']
    if y_test is None:
        y_test = globals()['y_test']

    # max_features cannot meaningfully exceed the number of predictors, so cap the
    # original constant of 75 at the width of the training data.
    max_features = min(75, X_train.shape[1])

    models = [
        ('[1]Decision Tree',
         DecisionTreeRegressor(max_features=max_features, max_depth=4, random_state=0)),
        ('[2]RandomForestRegressor',
         RandomForestRegressor(n_estimators=100, max_features=max_features, random_state=0)),
        ('[3]SupportvectorRegression(rbf)', SVR(kernel='rbf')),
        ('[4]SupportvectorRegression(linear)', SVR(kernel='linear')),
        ('[5]knn', neighbors.KNeighborsRegressor()),
    ]

    for label, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mean_sq_err = metrics.mean_squared_error(y_test, y_pred)

        # The score is R2 on the evaluation data, not a training accuracy.
        print(label + ' Test R2 score: ', r2_score(y_test, y_pred))
        print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
        print('Mean Square Error:', mean_sq_err)
        print('Root Mean Square Error:', np.sqrt(mean_sq_err))
        print('\t')

In [36]:
# Fit and report every candidate regressor. Only the training split is passed in;
# evaluation inside train_models uses the notebook-level X_test / y_test.
train_models(X_train, y_train)

In [37]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted on the training split; fit() returns the estimator itself.
mlr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows, displayed as the cell output.
y_pred_mlr = mlr.predict(X_test)
y_pred_mlr

In [38]:
# R2 of the linear model on the held-out rows, printed as a percentage.
r2_mlr = r2_score(y_true=y_test, y_pred=y_pred_mlr)
print('r2_score:', 100 * r2_mlr)

In [39]:
valid

In [40]:
missing(valid)

In [41]:
# Keep exactly the predictor columns the models were fitted on, in the same order.
# Choosing columns from the test file's own missing-value rates (as for df1) can
# give a different column set than X whenever a column falls on opposite sides of
# the 40% threshold in the two files, and the fitted model cannot score such a frame.
# A KeyError here means the test file lacks a column that was used for training.
valid = valid[X.columns.tolist()]

In [42]:
valid

In [43]:
def f(x):
    """Return the fill value for one column (or one group of a column).

    Numeric data -> median; anything else -> most frequent value.
    Returns NaN when there is no non-null value to take a mode from, instead of
    raising IndexError on an empty ``mode()`` result.
    """
    is_numeric = pd.api.types.is_numeric_dtype(x) and not pd.api.types.is_bool_dtype(x)
    if is_numeric:
        return x.median()
    modes = x.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# First pass: fill each gap with the statistic of the rows sold in the same year.
valid = valid.fillna(valid.groupby('YrSold').transform(f))
# Second pass: a year whose values are all missing for a column leaves NaN behind;
# fall back to the whole-column statistic so no gaps reach mlr.predict.
valid = valid.fillna(valid.apply(f).dropna())
valid
valid

In [44]:
valid.columns

In [47]:
# Encode the object columns of valid with the SAME label -> integer mapping that the
# training data received. Fitting a fresh encoder on the test values assigns codes
# from the test file's own sorted labels, so whenever a training category is absent
# from the test column the remaining labels shift to different integers than the
# ones the model was fitted on.
# df1 already holds integer codes at this point, so the training labels are read
# back from the raw training file; LabelEncoder sorts the distinct labels, which
# reproduces the training codes.
train_labels = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
le = LabelEncoder()
for col in valid.columns:
    if valid[col].dtypes == 'object':
        le.fit(train_labels[col].dropna())
        codes = {label: code for code, label in enumerate(le.classes_)}
        # Labels never seen in training have no code; mark them with -1 rather than fail.
        valid[col] = valid[col].map(codes).fillna(-1).astype(int)

In [48]:
valid['MSZoning'].value_counts()

In [49]:
valid

In [50]:
# Predict with the fitted linear model for every row of the preprocessed test frame.
# NOTE(review): valid must carry the same columns, in the same order, as the X that
# mlr was fitted on - confirm after any change to the preprocessing cells.
y_valid = mlr.predict(valid)

In [51]:
y_valid

In [52]:
# Two-column submission frame: the Id from the test frame next to its predicted SalePrice.
output = valid[['Id']].assign(SalePrice=y_valid)
output

In [53]:
# Write the submission without the index column, then preview its first rows.
output.to_csv(path_or_buf="submission1.csv", index=False)
output.head(n=10)