In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, entry)
    for root, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the US Airbnb listings. low_memory=False makes pandas read the file in
# one pass rather than in chunks, so each column gets a single inferred dtype.
DATA_PATH = '/kaggle/input/us-airbnb-open-data/AB_US_2020.csv'
df = pd.read_csv(DATA_PATH, low_memory=False)

### EDA 

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Summary of the object-dtype (string) columns: count, unique values, top value and its frequency.
df.describe(include=['O'])

In [None]:
# List the column labels available in the raw frame.
df.columns

In [None]:
# Number of missing values per column; drives the imputation choices made below.
df.isnull().sum()

### Removing null values and unnecessary columns

In [None]:
# Clean the raw listings frame: impute missing values, parse the review date,
# and drop columns that are not used as model features.
#
# The filled Series are assigned back to the column instead of calling
# fillna(..., inplace=True) on a column selection. That chained in-place form
# is deprecated in pandas 2.x and does not modify `df` at all when
# copy-on-write is active, which would leave NaNs in place for later cells.
df['neighbourhood_group'] = df['neighbourhood_group'].fillna('Others')
df['reviews_per_month'] = df['reviews_per_month'].fillna(df['reviews_per_month'].mean())

# Parse the review dates (the deprecated `infer_datetime_format` flag is not
# needed), then give listings without a review the most frequent review date.
df['last_review'] = pd.to_datetime(df['last_review'])
df['last_review'] = df['last_review'].fillna(df['last_review'].mode().iloc[0])

# Free-text names and identifier columns are dropped in a single call.
df = df.drop(columns=['name', 'host_name', 'id', 'host_id'])

### label encoding categorical columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes. A single encoder is
# re-fitted per column, so after the loop it only holds the mapping for the
# last column ('city').
label_encoder = LabelEncoder()
for categorical_column in ('neighbourhood_group', 'neighbourhood', 'room_type', 'city'):
    df[categorical_column] = label_encoder.fit_transform(df[categorical_column])

### change date values to ordinal 

In [None]:
import datetime 
# Convert each review date to its proleptic Gregorian ordinal (an integer day
# count) so the column becomes a plain numeric feature.
# NOTE(review): this relies on the column containing no missing dates; the
# cleaning cell above fills them beforehand.
df['last_review'] = df['last_review'].map(datetime.datetime.toordinal)

### Remove outliers

In [None]:
from scipy import stats

# Keep only the rows whose value in EVERY column lies within 3 standard
# deviations of that column's mean.
# NOTE(review): the z-scores are computed over all columns of `df`, which at
# this point includes the label-encoded categoricals and the `price` target -
# confirm that filtering on those is intended.
z_scores = stats.zscore(df)
abs_z_scores = np.abs(z_scores)
within_three_sigma = abs_z_scores < 3
filtered_entries = within_three_sigma.all(axis=1)
new_df = df.loc[filtered_entries]

### Scaling Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of the outlier-filtered frame to zero mean and unit
# variance, then wrap the result back into a DataFrame that keeps the original
# row index and column labels.
# NOTE(review): the scaler is fitted on the full frame (including `price`)
# before the train/test split below, so test-set statistics influence the
# scaling and all reported errors are in standardised price units.
scaler = StandardScaler()
scaler.fit(new_df)
scaled_features = scaler.transform(new_df)
scaled_features_df = pd.DataFrame(
    scaled_features,
    index=new_df.index,
    columns=new_df.columns,
)

In [None]:
# Preview the first rows of the standardised frame.
scaled_features_df.head()

### Creating train and test data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target (`price`) from the predictors, then hold out 20% of the
# rows for testing with a fixed seed so the split is reproducible.
y = scaled_features_df['price']
X = scaled_features_df.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Annotated heatmap of the pairwise correlations between all standardised
# columns (predictors and target).
correlation_matrix = scaled_features_df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True)

### Check multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every predictor, computed on the training
# design matrix. Each VIF comes from regressing one column on all the others;
# large values indicate multicollinearity.
train_matrix = X_train.values
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(train_matrix, column_index)
        for column_index in range(X_train.shape[1])
    ],
})

print(vif_data)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split; the fitted
# estimator is left as the cell's displayed value.
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [None]:
# Predict the (standardised) price for the held-out test rows.
y_pred = regressor.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error,r2_score
import math

# Test-set error metrics for the baseline linear regression. Adjusted R^2
# penalises R^2 for the number of predictors: 1 - (1 - R^2)(n - 1)/(n - p - 1).
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = regressor.score(X_test, y_test)
n_samples = len(y_test)
n_predictors = X_test.shape[1]

print('MSE', test_mse)
print('RMSE', math.sqrt(test_mse))
print('R2 Score', r2_score(y_test, y_pred))
print('Adj R^2 value:', 1 - (1 - test_r2) * (n_samples - 1) / (n_samples - n_predictors - 1))

### Grid Search CV for Linear Regression

In [None]:
# Define the model evaluation method: 10-fold cross-validation repeated 3 times
# (30 train/validation splits per candidate), seeded for reproducibility.
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over LinearRegression options, scored with the repeated
# k-fold scheme in `cv`, followed by predictions from the best refitted model.
lin_model = LinearRegression()
# `normalize` is intentionally not part of the grid: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, so including it makes the search fail
# with an invalid-parameter error. The inputs were already standardised with
# StandardScaler earlier in the notebook, so the option would add nothing.
parameters = {'fit_intercept':[True,False], 'copy_X':[True, False]}
grid = GridSearchCV(lin_model,parameters, cv=cv)
grid.fit(X_train, y_train)
lin_pred = grid.predict(X_test)

In [None]:
# Test-set metrics for the grid-searched model. These are computed from
# `lin_pred` (the grid search's predictions); scoring `y_pred`/`regressor`
# here would merely repeat the baseline model's numbers.
grid_mse = mean_squared_error(y_test, lin_pred)
grid_r2 = r2_score(y_test, lin_pred)
print('MSE',grid_mse)
print('RMSE',math.sqrt(grid_mse))
print('R2 Score',grid_r2)
# Adjusted R^2 = 1 - (1 - R^2)(n - 1)/(n - p - 1), with p the number of predictors.
print('Adj R^2 value:',1 - (1-grid_r2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1))

In [None]:
import statsmodels.api as sm

# Fit a statsmodels OLS on all rows to obtain coefficient p-values.
# statsmodels does not add an intercept automatically, so a constant column is
# appended - but to a separate frame. Overwriting `X` would leak the `const`
# column into every later use of `X` (derived feature sets, train/test splits
# and the predictor count used for adjusted R^2).
X_with_const = sm.add_constant(X)
stats_model = sm.OLS(y,X_with_const)
results = stats_model.fit()
print ("r2/variance : ", results.rsquared)
print(results.summary())

In [None]:
### The p-values for the columns last_review and calculated_host_listings_count are higher than 0.05, so we remove these columns

### Remove columns and restart regression 

In [None]:
# Reduced feature set: drop the two predictors judged not significant in the OLS summary.
X1 = X.drop(['last_review','calculated_host_listings_count'],axis=1)

In [None]:
# Split the reduced feature set with the same test_size and random_state as the
# earlier split of the full feature set.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y , test_size = 0.2, random_state=42)

In [None]:
import statsmodels.api as sm

# Refit the statsmodels OLS on the reduced feature set (all rows) and print its
# R^2 and coefficient table. add_constant leaves an existing constant column
# untouched, so re-running this cell does not add a second one.
X1 = sm.add_constant(X1)
stats_model1 = sm.OLS(endog=y, exog=X1)
results = stats_model1.fit()
print ("r2/variance : ", results.rsquared)
print(results.summary())

In [None]:
from sklearn.linear_model import LinearRegression

# Refit the linear regression on the reduced feature set and predict the
# held-out rows. The target comes from the same split as the features
# (`y1_train`), not from the earlier split's `y_train`, which only lines up
# row-for-row because both splits happen to share a seed and index.
# NOTE: this rebinds `regressor` and `y_pred`, replacing the baseline model's
# objects for any cell run afterwards.
regressor = LinearRegression()
regressor.fit(X1_train, y1_train)
y_pred = regressor.predict(X1_test)

In [None]:
# Test-set error metrics for the regression refitted on the reduced feature
# set. Adjusted R^2 = 1 - (1 - R^2)(n - 1)/(n - p - 1).
reduced_mse = mean_squared_error(y1_test, y_pred)
reduced_r2 = regressor.score(X1_test, y1_test)
n_test_rows = len(y1_test)
n_test_columns = X1_test.shape[1]

print('MSE', reduced_mse)
print('RMSE', math.sqrt(reduced_mse))
print('R2 Score', r2_score(y1_test, y_pred))
print('Adj R^2 value:', 1 - (1 - reduced_r2) * (n_test_rows - 1) / (n_test_rows - n_test_columns - 1))

In [None]:
'''Even after removing the two columns we were still not able to improve the scores'''

In [None]:
###Final RMSE: 0.933
### Other Regression models will be implemented