In [None]:
"""Importing libraries"""
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Default figure size (width, height) and colour theme for every plot below.
plt.rcParams['figure.figsize'] = (20,10)
plt.style.use('dark_background')

import warnings
# NOTE(review): this hides *all* warnings, including deprecation notices from
# pandas / seaborn / scikit-learn - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
"""Importing Dataset"""
# The path points at a Kaggle input directory; adjust it when running elsewhere.
airbnb_raw = pd.read_csv('/kaggle/input/us-airbnb-open-data/AB_US_2020.csv')
airbnb_raw.head()

In [None]:
# Work on a copy so the freshly loaded airbnb_raw frame is left unmodified.
airbnb = airbnb_raw.copy()
airbnb.info()

## Data Cleaning and Exploration

#### Replacing Missing values with 0

In [None]:
# Replaces NaN with 0 in every column, not only the numeric ones.
# NOTE(review): text/date columns with gaps (e.g. neighbourhood_group, last_review)
# will then hold the integer 0 next to their string values - later cells must
# account for that sentinel.
airbnb = airbnb.replace(np.nan,0)

In [None]:
airbnb.info()

In [None]:
airbnb['neighbourhood_group'].unique()

In [None]:
# Pass the column as the `x` keyword: newer seaborn releases accept only `data`
# positionally, so a positional column name collides with `data=`.
sns.countplot(x='neighbourhood_group',data=airbnb)
plt.xticks(rotation=90)

A large number of listings have no Neighbourhood group recorded. For the rest, about a dozen cities make up the lot.

In [None]:
airbnb['neighbourhood'].describe()

Neighbourhood has both coded values and names.

In [None]:
airbnb['last_review'].describe()

15th March has more reviews than any other day. Interesting!

Changing date-time into numeric for further analysis.

In [None]:
# Missing review dates were replaced by 0 earlier in the notebook. Turn that
# sentinel back into a missing value first; otherwise it is parsed as the
# 1970-01-01 epoch and behaves like a huge outlier in the correlation.
# NOTE(review): confirm the date format used in the CSV and pass `format=` /
# `dayfirst=` explicitly if it is not ISO-like.
last_review_dates = pd.to_datetime(airbnb['last_review'].replace(0, np.nan))

# Nanoseconds since the epoch for real dates, NaN where no review exists
# (`corr` skips NaN pairs).
airbnb['last_review'] = pd.to_numeric(last_review_dates).where(last_review_dates.notna())
airbnb['price'].corr(airbnb['last_review'])
# last_review can be dropped too, since it has very little correlation to price

Encoding the data before going on with the correlation analysis.

In [None]:
airbnb.info()

Dropping id, name, host_id, host_name, latitude, longitude and last_review to simplify the analysis.

In [None]:
# Identifier, free-text, coordinate and review-date columns are not used in the
# analysis that follows.
airbnb = airbnb.drop(
    columns=[
        'id',
        'name',
        'host_id',
        'host_name',
        'latitude',
        'longitude',
        'last_review',
    ]
)

Encoding the data for Correlation analysis.

#### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is re-fitted for every column below, so at any time
# it only remembers the classes of the column it was fitted on last.
encoder = LabelEncoder()

In [None]:
# Give rows whose group is the 0 placeholder an explicit string label, so the
# column contains only strings before it is encoded.
airbnb['neighbourhood_group'] = airbnb['neighbourhood_group'].replace(0, 'null')

# Lookup table: each distinct group next to its integer code.
unique_groups = airbnb['neighbourhood_group'].unique()
code = encoder.fit_transform(unique_groups)
neighbourhood_group = DataFrame({
    'Neighbourhood_group': unique_groups,
    'Code': code,
})
neighbourhood_group

In [None]:
# Lookup table: each distinct neighbourhood next to its integer code.
unique_neighbourhoods = airbnb['neighbourhood'].unique()
neigh_code = encoder.fit_transform(unique_neighbourhoods)
neighbourhood = DataFrame({
    'Neighbourhood': unique_neighbourhoods,
    'Code': neigh_code,
})
neighbourhood

In [None]:
# Lookup table: each distinct room type next to its integer code.
unique_room_types = airbnb['room_type'].unique()
room_code = encoder.fit_transform(unique_room_types)
room_type = DataFrame({
    'Room type': unique_room_types,
    'Code': room_code,
})
room_type

In [None]:
# Lookup table: each distinct city next to its integer code.
unique_cities = airbnb['city'].unique()
city_code = encoder.fit_transform(unique_cities)
city = DataFrame({
    'City': unique_cities,
    'Code': city_code,
})
city

In [None]:
# Replace each categorical column by its integer label codes, re-fitting the
# shared encoder on one column at a time.
for column in ('neighbourhood_group', 'neighbourhood', 'room_type', 'city'):
    airbnb[column] = encoder.fit_transform(airbnb[column])

In [None]:
airbnb.info()

#### Normalising data

Normalising data to 0 to 100

In [None]:
def normalise(feature, nmn=0, nmx=100):
    """Min-max rescale ``feature`` to the interval ``[nmn, nmx]``.

    Parameters
    ----------
    feature : pandas.Series or pandas.DataFrame
        Values to rescale. A DataFrame is rescaled column by column because
        ``max``/``min`` are taken per column.
    nmn, nmx : number, optional
        Lower and upper bound of the target range (default 0 and 100).

    Returns
    -------
    Same type as ``feature``
        Rescaled values; the minimum maps to ``nmn`` and the maximum to ``nmx``.
        A constant column (max == min) maps to ``nmx`` instead of NaN.
    """
    mx = feature.max()
    mn = feature.min()

    # A constant column has zero span, which would turn the whole column into
    # NaN (0/0). Using a span of 1 leaves (feature - mx) == 0, i.e. nmx.
    span = mx - mn
    if np.ndim(span) == 0:
        if span == 0:
            span = 1
    else:
        span = span.where(span != 0, 1)

    return ((nmx-nmn) / span * (feature-mx) + nmx)

# Rescales every column of the frame, including the target `price`, so errors
# reported by the models below are in normalised units rather than in currency.
norairbnb = normalise(airbnb)

In [None]:
norairbnb.describe()

In [None]:
# Cast the two rescaled columns to integers.
# NOTE(review): astype(int) truncates the fractional part of the normalised
# values - confirm that this loss of resolution is intended.
for column in ('minimum_nights', 'reviews_per_month'):
    norairbnb[column] = norairbnb[column].astype(int)

In [None]:
sns.heatmap(norairbnb.corr(),annot=True)

Interestingly, none of the variables has much correlation with Price!

   # Model Selection

In [None]:
# Predictors are every column except the (normalised) price, which is the target.
X = norairbnb.drop(columns=['price'])
Y = norairbnb['price']

#### Linear Regression with Cross Validation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Score an ordinary least-squares model with 5-fold cross-validation.
# scikit-learn reports the MSE negated, so values closer to 0 are better.
lin_reg = LinearRegression()
MSEs = cross_val_score(
    estimator=lin_reg,
    X=X,
    y=Y,
    scoring='neg_mean_squared_error',
    cv=5,
)
mean_MSE = MSEs.mean()

print(f'Negative MSE which needs to be maximised is {mean_MSE}')

#### Ridge Regression with GridSearchCV

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Try each candidate alpha with 5-fold cross-validation and keep the one with
# the best (least negative) negated MSE.
ridge_reg = Ridge()
parameters = {'alpha': [1, 2, 4]}

MSEr = GridSearchCV(
    estimator=ridge_reg,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
MSEr.fit(X, Y)

print(f'Best Parameter is {MSEr.best_params_}')
print(f'Best Score/Neg MSE is {MSEr.best_score_}')

#### Lasso Regression tuned with GridSearchCV

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Try each candidate alpha with 5-fold cross-validation and keep the one with
# the best (least negative) negated MSE.
lasso_reg = Lasso()
parameters = {'alpha': [0.2, 0.4, 0.6, 0.8, 1, 2]}

MSEl = GridSearchCV(
    estimator=lasso_reg,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
MSEl.fit(X, Y)

print(f'Best Parameter is {MSEl.best_params_}')
print(f'Best Score/Neg MSE is {MSEl.best_score_}')

Lasso Regression has the lowest error, with an RMSE of 5.06.

# **Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
reg = LinearRegression()

In [None]:
reg.fit(X,Y)

##### Best Fit Line

In [None]:
# `intercept_` is only the constant term of the fitted line, so label it as
# such instead of calling it the best fit line.
print(f'Intercept of the best fit line is {reg.intercept_}')
print(f'Number of coeffcients are {len(reg.coef_)}')

##### Coefficients

In [None]:
coef_df = DataFrame({'Variable':X.columns,'Coeff':reg.coef_})

In [None]:
coef_df

In [None]:
sns.catplot(x='Variable',y='Coeff',data=coef_df,kind='point')
plt.xticks(rotation=90)

 room_type and last review have a negative impact on the target, the price; that is, a change in room_type might lower the price.
 Neighbourhood group, availability and city have a positive impact on price.

# Prediction

In [None]:
reg1 = LinearRegression()

In [None]:
"""Train-test split"""
from sklearn.model_selection import train_test_split

In [None]:
# Fix the seed so the split - and every metric and plot derived from it - is
# the same on each Restart & Run All.
x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state=42)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

In [None]:
reg1.fit(x_train,y_train)

In [None]:
y_pred = reg1.predict(x_test)

In [None]:
# RMSE = sqrt(mean(residual ** 2)). The residuals must be squared (`** 2`, not
# `* 2`) and the square root taken, otherwise positive and negative errors
# cancel and the value is not a root mean square error at all.
rms = np.sqrt(np.mean((y_pred - y_test) ** 2))
print(f'Root mean square error is {rms}')

In [None]:
pred_df = DataFrame({'Actual':y_test,'Predict':y_pred})
pred_df

In [None]:
sns.lmplot(x='Actual',y='Predict',data=pred_df)

In [None]:
sns.scatterplot(x=y_pred,y=(y_pred-y_test),data=pred_df)