In [None]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
for root, _, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

### Inspiration

Can we predict the price of each house in different regions?<br>
Can we describe a region using the names of listings in that region?<br>
What can we learn about different regions from the data?<br>
Based on different factors is it possible to recommend a title to the host for his/her listing?<br>
Can we estimate the popularity of a listing based on given features?

In [None]:
# Load the raw listings table; every later cell derives its data from `df`.
csv_path = '../input/us-airbnb-open-data/AB_US_2020.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows
df.head(n=5)

### Few Information from the dataset

In [None]:
# Quick overview of the dataset: its dimensions, how many distinct
# neighbourhoods and cities it has, and the distinct city / room-type values.
print("Shape of df : ", df.shape)

# Distinct-value counts
for label, column in (("Unique neighbourhood : ", 'neighbourhood'),
                      ("City : ", 'city')):
    print(label, df[column].nunique())

# The distinct values themselves
for label, column in (("\nUnique Cities :-  \n\n", 'city'),
                      ("\nRoom Type :-  \n\n", 'room_type')):
    print(label, df[column].unique())

## Average price - group by Neighbourhood and Room type of a city

In [None]:
# City to analyse. The value must match an entry of df['city'] exactly.
# NOTE(review): the spelling 'San Clara Country' is kept as written — confirm it
# against the unique-cities listing printed above before "correcting" it.
cityName = 'San Clara Country'    # Just input the city name

# Boolean indexing already returns a new frame, so `df` itself is never modified.
df2 = df[df['city'] == cityName]

# Select `price` BEFORE aggregating: calling .mean() on the whole grouped frame
# tries to average the text columns as well, which raises a TypeError on
# pandas >= 2.0 (and silently dropped them with a warning on older versions).
df2 = (
    df2.groupby(['neighbourhood', 'room_type'])['price']
    .mean()
    .reset_index()
    .round(2)
)

# One row per neighbourhood, one column per room type, cells = average price
df2 = df2.pivot(index='neighbourhood', columns='room_type', values='price')

df2

In [None]:
# Grouped bar chart of the pivot table built in the previous cell:
# one group of bars per neighbourhood, one bar per room type.
ax = df2.plot(kind='bar', figsize=(18, 6), alpha=0.75, rot=30)
ax.set_xlabel("Neighbourhoods of " + cityName)
ax.set_ylabel("Average Price")

### Percentage of Different Room Types in a City and their Total Price


In [None]:
# City to analyse. The value must match an entry of df['city'] exactly.
cityName = 'Clark County'    # Just input the city name

# Boolean indexing already returns a new frame, so `df` itself is never modified.
df2 = df[df['city'] == cityName]

# Select the column BEFORE aggregating. Aggregating the whole grouped frame and
# picking one column afterwards also counts/sums every other column, including
# the text ones, which is wasteful and can fail on string columns.
df_count = df2.groupby('room_type')['id'].count()        # listings per room type
df_sumPrice = df2.groupby('room_type')['price'].sum()    # summed price per room type

# Two pies side by side, each drawn on an explicitly passed axis
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

df_count.plot(kind='pie', ax=ax1, autopct='%1.1f%%', label='', pctdistance=0.7)
ax1.set_title('Percentage of Room types')
ax1.set_ylabel("")    # hide the default y-label pandas puts on pie charts

df_sumPrice.plot(kind='pie', ax=ax2, autopct='%1.1f%%', label='', pctdistance=0.7)
ax2.set_title('Total Price of different Room types')
ax2.set_ylabel("")


## Predict house price based on City, Neighbourhood, Room type and Reviews

In [None]:
# Columns used by the price model: three categorical predictors, one numeric
# predictor, and the target (`price`).
model_columns = ['city', 'neighbourhood', 'room_type', 'number_of_reviews', 'price']

# Leave out Washington D.C. rows (original note: problem with neighbourhood)
# and keep only the model columns; .loc returns a new frame, `df` is untouched.
keep_rows = df['city'] != 'Washington D.C.'
df3 = df.loc[keep_rows, model_columns]

# One-hot encode the categorical columns; numeric columns pass through unchanged.
df4 = pd.get_dummies(df3)



In [None]:
from sklearn.model_selection import train_test_split
from ast import literal_eval


# Separate the target (price) from the one-hot encoded feature matrix
y = df4['price']
X = df4.drop(columns='price')

# 90% training and 10% test. A fixed random_state makes the split — and every
# metric computed downstream — identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
# TODO: feature selection is not applied yet; the original left this stub:
#X_train, X_test, fs = select_features(X_train, y_train, X_test)

# Sanity check: row counts of X and y must agree within each split
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import metrics
import numpy as np


# Fit a plain linear regression on the training split; fit() returns the
# fitted estimator itself, so construction and fitting are chained.
# Alternative kept for reference: lm = Ridge(alpha=1.0)
lm = LinearRegression().fit(X_train, y_train)

# Predicted prices for the held-out test rows
predictions = lm.predict(X_test)

In [None]:
import seaborn as sns

# Compute each error metric once; RMSE is derived from the already-computed MSE
# instead of evaluating mean_squared_error a second time.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

# Built-in round() works for both NumPy scalars and plain Python floats.
# The previous `.round(2)` method call only exists on NumPy scalars and fails
# with AttributeError when the metric is returned as a Python float.
print('MAE:', round(mae, 2))
print('MSE:', round(mse, 2))
print('RMSE:', round(rmse, 2))

In [None]:
# Spot-check the model: starting at position 1, print every 1000th prediction
# next to the actual price at the same position in the test set.
sample_positions = range(1, len(predictions), 1000)
for pos in sample_positions:
    predicted_price = predictions[pos].round(2)
    actual_price = y_test.iloc[pos]
    print("Prediction : ", predicted_price, "\t; Actual : ", actual_price)
    

### I hope you find this kernel useful in your day to day work.
## Please do leave your comments and if you like this kernel greatly appreciate an UPVOTE