In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [115]:
data = pd.read_csv('C:/Users/PrathameshKulkarni/OneDrive - TheMathCompany Private Limited/train.csv')

# Pre-Processing

In [116]:
"""Preprocessing stuff"""

#Checking nulls
#print(data.shape)
#print(data.isnull().sum())

def preprocess(data,dataset):

    #Dropping columns where >90% values are null and are not super important
    data.drop(columns = ['Id','Alley','PoolQC','Fence','MiscFeature'],inplace= True)

    #Filling categorical variables with mode values
    list_cat = ['MasVnrType','MasVnrArea','Electrical','FireplaceQu','BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'BsmtFinSF2','GarageQual','GarageCond','GarageType','GarageYrBlt', 'GarageFinish']

    data[list_cat] = data[list_cat].fillna(data[list_cat].mode().iloc[0])

    #Filling numerical variables with mean value

    data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].mean())


    #Treating outliers with mean

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    numeric_data = data.select_dtypes(include=numerics)

    num_cols = numeric_data.columns

    for col in num_cols:
        Q1 = np.percentile(data[col],25)
        Q3 = np.percentile(data[col],75)

        iqr = Q3-Q1

        upper = np.where((data[col] > Q3+1.5*iqr))
        lower = np.where((data[col] < Q1-1.5*iqr))

        data[col] = np.where(data[col].isin(upper[0]),np.nan,data[col])

    for col in num_cols:
        data[col] = data[col].fillna(data[col].mean())

    numerical_data = data[[cols for cols in data.columns if cols in numeric_data]]    
    category_data = data[[cols for cols in data.columns if cols not in numeric_data]]

    
    #Filling categorical variables with mode values
    list_cat = category_data.columns

    category_data[list_cat] = category_data[list_cat].fillna(category_data[list_cat].mode().iloc[0])
   
    
    
    from sklearn.preprocessing import OneHotEncoder

    data_encode_train = OneHotEncoder()
    data_encoded_arr = pd.DataFrame(data_encode_train.fit_transform(category_data).toarray())

    data_encoded_arr.columns = data_encode_train.get_feature_names(category_data.columns)


    # # Save this onehot encoding object for reuse purpose
    # with open('data_encoded_arr.pkl', 'wb') as f:
    #     pickle.dump(data_encode, f)

    # # Load it before using it & same thing we do we when use in production/testing environment
    # with open('data_encoded_arr.pkl', 'rb') as f:
    #     data_encode = pickle.load(f)

    #combine with numerical data
    data_final = pd.concat([numerical_data,data_encoded_arr],axis= 1)
    data = data_final.copy(deep = True)
    


    data_scaling = data[[cols for cols in data.columns if cols not in 'SalePrice']]
    
    from sklearn.preprocessing import MinMaxScaler

    # define min max scaler
    scaler = MinMaxScaler()

    # transform data
    scaled = scaler.fit_transform(data_scaling)
    scaled = pd.DataFrame(scaled)
    scaled.columns = data_scaling.columns
    

    # Applying PCA

    from sklearn.decomposition import PCA
    data_indep = scaled.copy(deep = True)

    # Set the n_components=3
    principal=PCA(n_components=100)
    principal.fit(data_indep)
    principal_df =principal.transform(data_indep)
    print(principal.explained_variance_ratio_.cumsum()[-1])
    
    principal_df = pd.DataFrame(principal_df)
    
    if dataset == 'Train':
        if principal_df.shape[0] == scaled.shape[0]:

            final_df = pd.concat([principal_df,data[['SalePrice']]],axis = 1)

        else:
            print('check PCA, some error')

    else:
        final_df = principal_df
    return final_df
    

In [117]:
scaled = preprocess(data,'Train')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


[0.15690834 0.21814093 0.25941794 0.29264924 0.32334994 0.35071544
 0.37533484 0.39784779 0.41927991 0.43993659 0.45973158 0.47867403
 0.49643168 0.51339581 0.52942555 0.544966   0.56015105 0.57460738
 0.58821061 0.60125279 0.61358541 0.62536703 0.63686015 0.64754814
 0.6581747  0.66858151 0.67837776 0.68780509 0.69680134 0.70544115
 0.71385893 0.72200616 0.72969135 0.73736281 0.74473274 0.75183048
 0.75886297 0.76547765 0.77186424 0.77801497 0.78410739 0.78980319
 0.79541835 0.80095907 0.80636466 0.81148098 0.81645327 0.82126016
 0.82590699 0.8304347  0.83466218 0.83877208 0.84278167 0.84669477
 0.85050084 0.85418904 0.85784191 0.86125528 0.86459231 0.8678538
 0.8709781  0.87401155 0.87701632 0.87989756 0.8827178  0.88547975
 0.88820254 0.89083802 0.8934388  0.89599254 0.89847804 0.9008646
 0.9031923  0.90546436 0.9076869  0.90989805 0.91207684 0.91424229
 0.91633376 0.91838252 0.92040975 0.92235622 0.92423154 0.92606842
 0.92785138 0.92962672 0.93134297 0.93302657 0.93468926 0.936345

In [64]:
scaled.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,SalePrice
0,2.190748,0.38116,-0.382495,1.081318,-0.508007,-0.584043,-0.526311,0.260526,0.38875,-0.350667,0.582016,-0.249304,0.074309,-0.227947,-0.427271,-0.244957,-0.318487,-0.198228,-0.504521,0.087609,0.360355,0.021709,-0.045506,-0.016161,0.110294,-0.02535,0.359878,0.160695,0.039357,0.237135,0.088392,-0.201674,-0.007592,0.016398,0.143908,-0.14432,0.222592,-0.101013,-0.045645,0.223296,-0.222749,-0.181719,-0.004601,0.043013,-0.139323,-0.00582,0.133511,-0.339842,0.009342,-0.092323,0.178721,0.192431,0.072866,0.244992,0.012132,208500.0
1,-0.298165,-1.029539,-0.227701,0.116329,-0.714163,0.329115,-0.156688,-0.246096,0.011037,0.953668,-0.412021,1.001506,1.193924,0.541961,-0.695512,0.801086,0.159445,0.435364,0.167911,-0.409328,0.020568,0.066166,-0.463199,0.377534,0.337566,-0.858031,-0.030205,-0.371798,-0.81509,0.111603,-0.425042,0.054644,-0.024929,-0.389991,0.329174,-0.264401,-0.543398,-0.571253,0.060521,0.207947,0.083378,-0.200971,0.530308,-0.070066,-0.041507,0.072365,-0.378043,0.0592,0.368633,-0.000902,-0.063293,-0.070507,-0.016512,-0.022562,0.291189,181500.0


In [128]:
from sklearn.model_selection import train_test_split

features = scaled[[cols for cols in scaled.columns if cols not in ['SalePrice']]]
X_train, X_test, y_train, y_test = train_test_split(
features, scaled['SalePrice'], train_size=0.8, random_state=42
)



In [135]:
# Fitting Random Forest Regression to the dataset
# import the regressor
from sklearn.ensemble import RandomForestRegressor

# create regressor object
regressor = RandomForestRegressor(n_estimators = 190, random_state = 0,max_features = 'auto',max_depth= 8,bootstrap = True)

# fit the regressor with x and y data
regressor.fit(X_train, y_train)

Y_pred = regressor.predict(X_test) # test the output by changing values


In [136]:
from sklearn import metrics
# print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, Y_pred)))

Root Mean Squared Error: 44303.09766659936


In [134]:
from sklearn.model_selection import GridSearchCV
from numpy import arange

# define grid
grid = dict()
grid['n_estimators'] = arange(10, 200, 20)

param_grid = {'bootstrap': [True], 'max_depth': [4,5,6,7,8], 'max_features': ['auto', 'sqrt', 'log2'], 'n_estimators': grid['n_estimators']}

search = GridSearchCV(regressor, param_grid,cv=2, scoring='neg_mean_absolute_error', n_jobs=-1)

# perform the search
results = search.fit(X_train, y_train)
# summarize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -28601.096
Config: {'bootstrap': True, 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 190}


In [119]:
import xgboost as xg
from sklearn.metrics import mean_squared_error as MSE

xgb_r = xg.XGBRegressor(learning_rate = 0.1, max_depth = 4, n_estimators = 190, objective = 'reg:squarederror', seed = 123)

# Fitting the model
xgb_r.fit(X_train, y_train)


# Predict the model
pred = xgb_r.predict(X_test)



# RMSE Computation
rmse = np.sqrt(MSE(y_test, pred))
print("RMSE : % f" %(rmse))

RMSE :  38569.529651


In [111]:
# define grid
grid = dict()
grid['n_estimators'] = arange(5, 300, 20)

parameters = {
    
    'objective':['reg:squarederror','reg:logistic','binary:logistic','reg:pseudohubererror','binary:hinge','reg:tweedie','rank:map'],
    'max_depth': range (2, 10, 1),
    'n_estimators': range(150, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}
search = GridSearchCV(xgb_r, parameters, scoring='neg_mean_absolute_error', n_jobs=-1)

# perform the search
results = search.fit(X_train, y_train)
# summarize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -24358.975
Config: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 190, 'objective': 'reg:squarederror'}


# Test

In [67]:
X_train.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54
254,-1.585493,-0.752397,0.667502,0.488535,-0.775137,-0.227187,-0.313622,0.619504,0.077331,1.234571,0.262116,0.170349,0.710186,-0.386299,-0.419448,0.158578,0.174189,-0.219738,-0.408607,-0.053736,0.208486,0.09074,1.191224,-0.341389,0.450517,-2e-06,-0.378267,0.210166,0.327298,-0.185363,-0.541703,-0.119071,-0.258968,0.072579,0.230226,0.261815,0.024111,0.243678,-0.260415,-0.127548,-0.205794,0.163178,0.17011,-0.115866,-0.270906,0.200525,0.02511,0.247247,0.298899,-0.096887,-0.002944,-0.134183,0.04148,0.009181,-0.087135


In [71]:
test_df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54
0,-1.429829,-0.409193,0.131162,-1.240364,0.133155,-0.126554,0.453674,-0.614384,-0.668151,-0.540065,-0.171861,-0.258724,-0.060287,-0.503802,-0.26259,-0.167068,0.125541,0.488339,0.114787,-0.2082,-0.834189,-0.388603,0.424569,1.185537,0.043624,0.465912,-0.673355,-0.102133,0.329083,0.244695,0.026167,-0.108143,-0.103936,0.105511,-0.040516,0.065657,0.000472,-0.318401,-0.088518,-0.150075,-0.671612,-0.172065,-0.489823,-0.04665,-0.231432,0.06357,0.093831,0.057888,-0.310119,-0.221433,0.102581,-0.154561,-0.084714,-0.118634,-0.0959


In [120]:
data_test = pd.read_csv('C:/Users/PrathameshKulkarni/OneDrive - TheMathCompany Private Limited/test.csv')

data_test_final = data_test.copy(deep = True)

test_df = preprocess(data_test_final,'Test')

test_df.columns = X_train.columns

Y_pred = xgb_r.predict(test_df)

# Y_pred.shape
submission = pd.read_csv('C:/Users/PrathameshKulkarni/OneDrive - TheMathCompany Private Limited/sample_submission.csv')
submission_final = pd.concat([submission[['Id']],pd.DataFrame(Y_pred)],axis = 1)
submission_final.rename(columns= {0:'SalePrice'},inplace = True)
submission_final.head(2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


[0.15906678 0.22322427 0.26356145 0.29623505 0.32647292 0.35395016
 0.37851176 0.40171153 0.42251574 0.44290417 0.46255821 0.48085355
 0.49871765 0.51576706 0.5312167  0.54649359 0.56160341 0.57653202
 0.59069529 0.60416978 0.61667226 0.62864794 0.64035103 0.65118801
 0.6619229  0.67186963 0.68143281 0.69076786 0.6997652  0.70837556
 0.71673373 0.72472737 0.73266377 0.74024056 0.74740772 0.75449357
 0.76129254 0.76791642 0.77441454 0.78065737 0.78671725 0.79268788
 0.79844063 0.80410365 0.80958628 0.81459441 0.81948866 0.82434264
 0.82905026 0.83353542 0.83784297 0.84200891 0.84597191 0.84989251
 0.85370777 0.85745592 0.86112744 0.86465621 0.8681217  0.8714446
 0.87470837 0.8779384  0.88101492 0.88403578 0.88696535 0.88985332
 0.89261579 0.89529007 0.897921   0.90050012 0.9030282  0.90547515
 0.90789312 0.91020654 0.91243163 0.91463943 0.91679635 0.91886387
 0.92089238 0.92288226 0.92484575 0.92678247 0.92866973 0.93051991
 0.93230505 0.93404629 0.93577507 0.93748436 0.9390684  0.94064

Unnamed: 0,Id,SalePrice
0,1461,115332.882812
1,1462,166458.75


In [121]:
submission_final.to_csv('D:/Kaggle submissions/Submission_house_prediction_v2.csv',index= False)

In [None]:
data_test = pd.read_csv('C:/Users/PrathameshKulkarni/OneDrive - TheMathCompany Private Limited/test.csv')

data_test.head(2)

In [None]:
# import lazypredict
# from lazypredict.Supervised import LazyRegressor

# reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
# models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# #print(models)


In [None]:
# models.sort_values(by = 'RMSE')