In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,PolynomialFeatures,StandardScaler
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import mean_squared_error
from math import sqrt

In [11]:
df = pd.read_csv('development.csv',skipinitialspace=True)
df_evaluation = pd.read_csv('evaluation.csv',skipinitialspace=True)

In [12]:
df.isna().sum()

id                                   0
name                                13
host_id                              0
host_name                           19
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       8041
reviews_per_month                 8041
calculated_host_listings_count       0
availability_365                     0
dtype: int64

In [13]:
df_evaluation.isna().sum()

id                                   0
host_id                              0
host_name                           31
neighbourhood                     3415
neighbourhood_group_cleansed      9987
latitude                             0
longitude                            0
room_type                            0
minimum_nights                       0
availability_365                     0
number_of_reviews                    0
calculated_host_listings_count       0
reviews_per_month                 1275
last_review                       1275
dtype: int64

In [14]:
column = 'price'
Q1 = df[column].quantile(0.25)
Q3 = df[column].quantile(0.75)
IQR = Q3 - Q1
mask=( (df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)) )
df = df[~mask]

In [15]:
df['reviews_per_month'].fillna(0,inplace=True)
df_evaluation['reviews_per_month'].fillna(0,inplace=True)
drop_columns = ['latitude','longitude','id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'price','last_review']
drop_columns_evaluation = ['latitude','longitude','id', 'host_id', 'host_name', 'neighbourhood',
       'neighbourhood_group_cleansed',  'last_review']
X = df.drop(drop_columns, axis=1)
X_evaluation=df_evaluation.drop(drop_columns_evaluation,axis=1)
y = df['price']

In [16]:
ss = StandardScaler()
le = LabelEncoder()
# Transform all categorical columns in Dependant Varaible X
for i in X.columns:
    if X[i].dtype not in ['int64','float64'] :
        le.fit(X[i].append(X_evaluation[i]))
        X[[i]]=le.transform(X[[i]])
        X_evaluation[[i]] = le.transform(X_evaluation[[i]])
    else:
        X[[i]]=ss.fit_transform(X[[i]])
        X_evaluation[[i]] = ss.fit_transform(X_evaluation[[i]])

X = X.values
X_evaluation = X_evaluation.values
y = y.values.reshape(-1,1)


In [17]:
X.shape,y.shape,X_evaluation.shape

((36767, 6), (36767, 1), (9987, 6))

In [18]:
#Split the data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=40)

In [19]:
# Fit and evaluate linear Model
lr = LinearRegression()
lr.fit(X_train, y_train)

train_pred= lr.predict(X_train)

print("X_train")
print("RMSE:",np.sqrt(mean_squared_error(y_train,train_pred)))
print("r2 score:",r2_score(y_train, train_pred))

test_pred= lr.predict(X_test)
print("X_test")
print("RMSE:",np.sqrt(mean_squared_error(y_test,test_pred)))
print("r2 score:",r2_score(y_test, test_pred))

X_train
RMSE: 53.09433299045722
r2 score: 0.4019065035591147
X_test
RMSE: 52.844063803963515
r2 score: 0.3957917436130044


In [33]:
evaluate_pred_final= lr.predict(X_evaluation)
evaluate_pred_final=np.squeeze(evaluate_pred_final)
pd.DataFrame({'id': df_evaluation.id, 'price': evaluate_pred_final}).to_csv('submission0.csv', index=False)

(9987,)


In [35]:
# Fit best polynomial Linear model
best_poly_reg = PolynomialFeatures(degree=2)
X_poly = best_poly_reg.fit_transform(X_train)
best_pol_reg = LinearRegression()
best_pol_reg.fit(X_poly, y_train)

train_pred= best_pol_reg.predict(X_poly)
print("X_train")
print("RMSE :",np.sqrt(mean_squared_error(y_train,train_pred)))
print("R2 score :",r2_score(y_train, train_pred))

X_test_poly= best_poly_reg.transform(X_test)
test_pred= best_pol_reg.predict(X_test_poly)
print("X_test")
print("RMSE:",np.sqrt(mean_squared_error(y_test,test_pred)))
print("r2 score:",r2_score(y_test, test_pred))

X_train
RMSE : 52.59500254243814
R2 score : 0.41310325295885686
X_test
RMSE: 52.4789330687462
r2 score: 0.40411255808499236


In [36]:
# Compute results for polynomial regression
X_evaluate_poly= best_poly_reg.transform(X_evaluation)
evaluate_pred_final= best_pol_reg.predict(X_evaluate_poly)
evaluate_pred_final=np.squeeze(evaluate_pred_final)
pd.DataFrame({'id': df_evaluation.id, 'price': evaluate_pred_final}).to_csv('submission1.csv', index=False)

In [38]:
# Fit and test ridge regression
rr =Ridge(fit_intercept=False)
rr.fit(X_train, y_train)

train_pred= rr.predict(X_train)

print("RMSE:",np.sqrt(mean_squared_error(y_train,train_pred)))
print("r2 score:",r2_score(y_train, train_pred))

test_pred= rr.predict(X_test)
print("RMSE:",np.sqrt(mean_squared_error(y_test,test_pred)))
print("r2 score:",r2_score(y_test, test_pred))

RMSE: 126.56564277487406
r2 score: -2.3986340145923
RMSE: 124.80104006552592
r2 score: -2.3700058004004783


In [39]:
evaluate_pred_final= rr.predict(X_evaluation)
evaluate_pred_final=np.squeeze(evaluate_pred_final)
pd.DataFrame({'id': df_evaluation.id, 'price': evaluate_pred_final}).to_csv('submission2.csv', index=False)

In [40]:
# Fit and test Lasso Regression
lr = Lasso(fit_intercept=False)
lr.fit(X_train,y_train)

train_pred= lr.predict(X_train)
print("RMSE:",np.sqrt(mean_squared_error(y_train,train_pred)))
print("r2 score:",r2_score(y_train, train_pred))

test_pred= lr.predict(X_test)
print("RMSE:",np.sqrt(mean_squared_error(y_test,test_pred)))
print("r2 score:",r2_score(y_test, test_pred))

RMSE: 126.58335543325883
r2 score: -2.3995853478976286
RMSE: 124.86866663335701
r2 score: -2.3736590339458745


In [41]:
evaluate_pred_final= lr.predict(X_evaluation)
pd.DataFrame({'id': df_evaluation.id, 'price': evaluate_pred_final}).to_csv('submission3.csv', index=False)