In [None]:
import geopandas
from shapely.geometry import Point, Polygon
import plotly.express as ex
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import metrics

import os
# Print the full path of every file available under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
# Load the raw listings CSV; low_memory=False makes pandas read the file in one
# pass instead of chunk-by-chunk.
LISTINGS_CSV = '/kaggle/input/us-airbnb-open-data/AB_US_2020.csv'
data = pd.read_csv(LISTINGS_CSV, low_memory=False)
bnb = pd.DataFrame(data)
print(bnb.shape)

In [None]:
# Preview the first rows of the freshly loaded listings table.
bnb.head()

Checking for missing values

In [None]:
# Pairwise scatter/histogram grid of the numeric columns.
# NOTE(review): this plots every row of bnb; if the table is large this cell can
# be very slow — consider plotting a sample instead.
sns.pairplot(bnb)

In [None]:
# Distribution of the listing price.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot replace it); switch once the seaborn version is confirmed.
sns.distplot(bnb['price'])

In [None]:
# For each column, report whether it contains at least one null value.
print(bnb.isnull().any())

In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
missing_share = bnb.isna().sum() / bnb.shape[0] * 100
miss = missing_share.to_frame(name='Precentage Of Missing Values')
miss

Some of the columns contain null values, in different percentages. So we can start by setting `id` as the index and filling the null values in every affected column; then, after some data visualization, we can decide which columns to remove and which to keep in the analysis. By the way, this is my first opinion on the variables:

- DROP : name, host_name, neighbourhood_group (more than 40% is already "other"), last_review

For the moment, let's just fill all the null values; we will decide what to drop afterwards.

In [None]:
#bnb = bnb.set_index('id')

In [None]:
# Replace nulls with an explicit placeholder, column by column.
placeholders = {
    'name': "Airbnb House",
    'host_name': 'Airbnb Host',
    'neighbourhood_group': 'Other',
    'reviews_per_month': 0,
    'last_review': '01/01/01',
}
for column, placeholder in placeholders.items():
    bnb[column] = bnb[column].fillna(placeholder)

# Convert the review date to a datetime column.
# NOTE(review): no explicit format is given, so pandas infers it; confirm the
# day/month order of the source column and how '01/01/01' should be read.
bnb['last_review'] = pd.to_datetime(bnb['last_review'])

In [None]:
# Re-run the per-column null check after the fill step.
print(bnb.isnull().any())

In [None]:
# Preview the first rows after the null values were filled.
bnb.head()

In [None]:
# Show the dtype of every column (last_review was converted with pd.to_datetime).
bnb.dtypes

### **Data Visualization**

I would like to focus first on the cities, to have a better understanding of the distribution of the houses all over the country.

In [None]:
# Coordinates of every listing, used later to build the point layer of the map.
# Explicit column selection fails loudly if a column is missing, whereas
# DataFrame.filter silently skips labels it cannot find.
df_coord = bnb[['city', 'latitude', 'longitude']].copy()

# Distinct city labels present in the data.
cities = list(set(bnb['city']))

# Number of listings per city. The previous `isin(cities)` mask selected every
# row (cities is built from this same column), so it is dropped.
bnb['city'].value_counts()

In [None]:
# Horizontal bar chart of listings per city, most frequent city first.
plt.figure(1, figsize=(50, 28))
city_order = bnb['city'].value_counts().index
ax = sns.countplot(y=bnb['city'], order=city_order)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=35)
# plt.rc changes the global default, so it also affects figures drawn later.
plt.rc('xtick', labelsize=30)
ax.set_xlabel('Count', fontsize=40)
ax.set_ylabel('US Cities', fontsize=40)
ax.set_title('Distribution of the Airbnb houses in the U.S. cities', fontsize=50)
plt.show()

In [None]:
# Turn each (longitude, latitude) pair into a point geometry and attach it to
# the coordinate table.
listing_points = geopandas.points_from_xy(df_coord['longitude'], df_coord['latitude'])
gdf = geopandas.GeoDataFrame(df_coord, geometry=listing_points)

In [None]:
# Load the state boundary shapefile and show the type of the loaded object.
STATES_SHAPEFILE = '/kaggle/input/american-geopandas-map/cb_2018_us_state_20m.shp'
states = geopandas.read_file(STATES_SHAPEFILE)
type(states)

In [None]:
# Remove Alaska and Puerto Rico from the basemap.
# Rows are selected by state name rather than by hard-coded row label, so the
# cell does not depend on the shapefile's row order and can be re-run safely
# (dropping an already-removed label raises KeyError).
EXCLUDED_STATES = ['Alaska', 'Puerto Rico']
states = states[~states['NAME'].isin(EXCLUDED_STATES)]

In [None]:
# Map of all listings (red dots) over the state boundaries.
fig = plt.figure(1, figsize=(50, 28))

ax = fig.add_subplot()
plt.title('Airbnb Houses Location in the U.S. country', fontsize=50)
states.boundary.plot(ax=ax, color='black', linewidth=.8)

# Label every state at its centroid. The label is passed positionally: the
# keyword used to be `s`, was renamed to `text`, and `s=` is rejected by recent
# Matplotlib releases; a positional argument works on every version.
for _, state in states.iterrows():
    ax.annotate(state['NAME'], xy=state.geometry.centroid.coords[0],
                ha='center', fontsize=12)

gdf.plot(ax=ax, color='red', marker='.', markersize=10)
# plt.rc changes the global defaults, so these sizes also apply to later figures.
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)
plt.show()

Let's create a dictionary that maps each city to its state, so that we can plot the listings by state.

In [None]:
# City label (as spelled in the dataset) -> two-letter state code.
# NOTE(review): 'Salem' is mapped to MA here; verify against the listing
# coordinates that this is the intended Salem.
states_dic = {
    'Asheville': 'NC', 'Austin': 'TX', 'Boston': 'MA', 'Broward County': 'FL',
    'Cambridge': 'MA', 'Chicago': 'IL', 'Clark County': 'NV', 'Columbus': 'OH',
    'Denver': 'CO', 'Hawaii': 'HI', 'Jersey City': 'NJ', 'Los Angeles': 'CA',
    'Nashville': 'TN', 'New Orleans': 'LA', 'New York City': 'NY', 'Oakland': 'CA',
    'Pacific Grove': 'CA', 'Portland': 'OR', 'Rhode Island': 'RI', 'Salem': 'MA',
    'San Clara Country': 'CA', 'Santa Cruz County': 'CA', 'San Diego': 'CA',
    'San Francisco': 'CA', 'San Mateo County': 'CA', 'Seattle': 'WA',
    'Twin Cities MSA': 'MN', 'Washington D.C.': 'DC',
}

# Look up the state of every listing; a city missing from the mapping raises KeyError.
bnb['state'] = [states_dic[city] for city in bnb['city']]

In [None]:
# Choropleth of the number of listings per state.
houses = bnb['state'].value_counts()
fig = ex.choropleth(
    locations=houses.index,
    color=houses.values,
    locationmode="USA-states",
    scope="usa",
    title='Distribution of houses by State',
    color_continuous_scale=ex.colors.diverging.Portland,
)
fig.show()

In [None]:
# Per-state mean of every numeric column, rounded to two decimals.
# numeric_only=True is required: bnb still holds text columns, and recent pandas
# versions raise instead of silently skipping them when averaging.
price_med = bnb.groupby(by='state').mean(numeric_only=True).round(2)

# Choropleth of the mean listing price per state.
fig = ex.choropleth(price_med,locations=price_med.index,color='price', locationmode="USA-states",
                    scope="usa",title='Average price of houses in each State', 
                    color_continuous_scale=ex.colors.diverging.Portland)
fig.show()

So most of the houses are located in California but the most expensive ones are in Minnesota.

In [None]:
# Set the default figure size before plotting: the setting is read when a figure
# is created, so assigning it after bnb.hist() did not resize this grid.
plt.rcParams["figure.figsize"] = [16,9]

# One histogram per plottable column of bnb.
bnb.hist()
plt.show()

In [None]:
# Correlation matrix of the numeric (and boolean) columns only. bnb still holds
# text and datetime columns, which recent pandas versions refuse to correlate;
# selecting the dtypes explicitly works on old and new pandas alike.
corrMatrix = bnb.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
# Modelling table: a separate frame without the free-text and review-date columns.
bnb_x = bnb.drop(columns=['name', 'host_name', 'last_review'])

In [None]:
# Replace each neighbourhood label with an integer code.
labelencoder = LabelEncoder()
neighbourhood_codes = labelencoder.fit_transform(bnb_x["neighbourhood"])
bnb_x['neighbourhood'] = neighbourhood_codes

In [None]:
# One-hot encode the categorical columns; the prefix identifies the source column.
dmm = pd.get_dummies(bnb_x['neighbourhood_group'], prefix='NG')
dmm1 = pd.get_dummies(bnb_x['room_type'], prefix='RT')
dmm2 = pd.get_dummies(bnb_x['city'], prefix='CY')
#dmm3 = pd.get_dummies(bnb_x.state, prefix= 'ST')

In [None]:
# Append the one-hot columns to the modelling table (index-aligned joins).
bnb_x = bnb_x.join(dmm).join(dmm1).join(dmm2)
#bnb_x = bnb_x.join(dmm3)

In [None]:
# Preview the modelling table with the one-hot columns joined in.
bnb_x.head()

In [None]:
# Remove the original categorical columns now that their encodings are in place.
bnb_x = bnb_x.drop(columns=['room_type', 'neighbourhood_group', 'city', 'state'])

In [None]:
# Target is the listing price; every remaining column is used as a feature.
# NOTE(review): set_index('id') is commented out earlier, so X appears to still
# contain the `id` column — confirm identifiers are meant to be features.
y = bnb_x['price']
X = bnb_x.drop(columns='price')

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

### Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model with scikit-learn's default settings.
lr = LinearRegression()

In [None]:
# Fit the model on the training split only.
lr.fit(X_train, y_train)

In [None]:
# Fitted intercept term of the regression.
print(lr.intercept_)

In [None]:
# Fitted coefficients, in the same order as the columns of X.
print(lr.coef_)

In [None]:
# Table of fitted coefficients, one row per feature name.
cdf = pd.DataFrame(data=lr.coef_, index=X.columns, columns=['Coeff'])

In [None]:
# Display the coefficient table.
cdf

## Predictions

In [None]:
# Predict prices for the held-out test rows.
predictions = lr.predict(X_test)

In [None]:
# Actual vs. predicted price on the test split. Axis labels and a title are
# added so the figure can be read on its own.
plt.scatter(y_test, predictions)
plt.xlabel('Actual price')
plt.ylabel('Predicted price')
plt.title('Linear regression: actual vs. predicted price')
plt.show()

In [None]:
# Distribution of the residuals (actual minus predicted price) on the test split.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot replace it); switch once the seaborn version is confirmed.
sns.distplot((y_test-predictions))

In [None]:
# Mean absolute error on the test split.
mae = metrics.mean_absolute_error(y_test, predictions)
print('MAE :', mae)

In [None]:
# Mean squared error on the test split.
mse = metrics.mean_squared_error(y_test, predictions)
print('MSE :', mse)

In [None]:
# Root mean squared error on the test split (square root of the MSE).
rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
print('RMSE :', rmse)