# Explore here

In [1]:
# Your code here
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Step 1: Data collection

In [None]:
import pandas as pd

# Location of the CSV in the tutorial repository; kept in one place so it is easy to swap.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/data-preprocessing-project-tutorial/main/AB_NYC_2019.csv"

# Load the raw table and preview its first rows.
data = pd.read_csv(DATA_URL)
data.head()

#### Step 2: Exploration and data cleaning

In [None]:
# Obtain dimensions as a (rows, columns) tuple
data.shape

In [None]:
# Report the size of the raw frame in plain words.
n_rows, n_cols = len(data), len(data.columns)
print(f"The DataFrame contains {n_rows} records (rows) and {n_cols} variables (columns).\n")

In [None]:
# Obtain information about data types and non-null values
# (columns whose non-null count is below the row count contain missing values)
data.info()

In [None]:
# Written observation on the missing values reported by data.info()
print("We see that we have some NaN records, the biggest ones being last_review and reviews_per_month columns.")

In [None]:
# Columns this analysis treats as categorical.
categorical_columns = [
    "id",
    "name",
    "host_id",
    "host_name",
    "neighbourhood_group",
    "neighbourhood",
    "room_type",
    "availability_365",
]
df_categorical_columns = data.loc[:, categorical_columns]
df_categorical_columns.head()

In [None]:
# Columns this analysis treats as numerical.
# NOTE(review): last_review is grouped with the numeric columns; it looks like a date field — confirm.
numerical_columns = [
    "price",
    "latitude",
    "longitude",
    "minimum_nights",
    "number_of_reviews",
    "last_review",
    "reviews_per_month",
    "calculated_host_listings_count",
]
df_numerical_columns = data.loc[:, numerical_columns]
df_numerical_columns.head()

In [None]:
# Summarise the categorical / numerical split and list the column names of each group.
print("Analysing the data we can divide the data in 8 categorical variables and 8 numerical variables.\n")
for group_label, group_frame in (
    ("Categorical", df_categorical_columns),
    ("Numerical", df_numerical_columns),
):
    print(f"{group_label} variables:{group_frame.columns}\n")

Eliminate missing values and duplicates

In [None]:
# Keep only the rows that have no missing value in any column.
data_dropped = data.dropna(axis=0, how="any")
data_dropped.head()

In [None]:
# (rows, columns) left after removing the rows with missing values
data_dropped.shape

In [None]:
# Quantify how many rows dropna() removed, in absolute terms and as a percentage.
rows_before, rows_after = len(data), len(data_dropped)
removed_pct = round((1 - (rows_after / rows_before)) * 100, 2)
print(f"I eliminated the rows with NaN values, reducing the DataFrame from ({rows_before}, {len(data.columns)}) to ({rows_after}, {len(data_dropped.columns)})")
print(f"Doing that I eliminated {rows_before - rows_after} rows, deleting {removed_pct}% of the data, which may be significant for the analysis.")

In [None]:
# Count repeated values in each of the three identifier-like columns of the raw frame.
duplicate_checks = {"Name": "name", "Host ID": "host_id", "ID": "id"}
for label, column in duplicate_checks.items():
    print(f"The number of duplicated {label} records is: {data[column].duplicated().sum()}")

In [None]:
# Interpretation of the duplicate counts for name, host_id and id
print("name has duplicated values, which is odd, but duplicates can exist, since people can put the same names")
print("host_id can have duplicates, because some homeowners have multiple Airbnbs registered.")
print("There are 0 duplicated id, which means it should be all unique records.")

Eliminate irrelevant information

In [None]:
# Rationale for the column removal performed in the following cells
print("Eliminating the columns with duplicate values is a better solution in order to redice the zise of our DataFrame")
print("I also remove the latitude and longitude columns to simplify reading the DataFrame")

In [None]:
# Work on a copy so the raw `data` frame is left untouched
data_c=data.copy()
data_c.head()

In [None]:
# Columns removed from the working frame: identifiers/free text, the two review columns
# and the coordinates.
columns_to_drop = ["id", "name", "host_name", "last_review", "reviews_per_month", "latitude", "longitude"]

# Build data_c from the raw frame instead of dropping in place: the in-place version raised
# KeyError when this cell was executed a second time, because the columns were already gone.
# drop() returns a new DataFrame, so `data` itself is not modified.
data_c = data.drop(columns=columns_to_drop)
data_c.head()

In [None]:
# Check the remaining columns, their dtypes and non-null counts
data_c.info()

#### Step 3: Analysis of univariate variables

Analysis of categorical variables

In [None]:
# Categorical columns that are still present in the reduced frame.
new_categorical_columns = [
    "host_id",
    "neighbourhood_group",
    "neighbourhood",
    "room_type",
    "availability_365",
]
df_new_categorical_columns = data_c.loc[:, new_categorical_columns]
df_new_categorical_columns.head()

In [None]:
# Count how many rows each host_id has (value_counts lists the most frequent first)
host_id=data_c["host_id"].value_counts()
print(f"{host_id}\n")

In [None]:
# Histogram of the host_id values.
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(data=data_c, x="host_id", ax=ax)
ax.set(xlabel="Host Id", ylabel="Count", title=None)
fig.tight_layout()
plt.show()

In [None]:
# Takeaway from the host_id counts
print("There are multiple Hosts with many rooms to rent.")

In [None]:
# Count rows per neighbourhood_group; the resulting Series is reused by the bar plot below
neighbourhood_group=data_c["neighbourhood_group"].value_counts()
print(f"{neighbourhood_group}\n")

In [None]:
# Bar chart of the per-group counts computed with value_counts().
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(x=neighbourhood_group.index, y=neighbourhood_group.values, ax=ax)
ax.set(xlabel="Neighbourhood Group", ylabel="Count", title=None)
fig.tight_layout()
plt.show()

In [None]:
# Takeaway from the neighbourhood_group counts
print("In our dataset, we only have 5 areas, Brooklyn, Manhattan, Queens, Staten Island, and the Bronx")
print("Most of the houses offered are in Manhattan follewed by Brooklyn and Queens")

In [None]:
# Count rows per neighbourhood (most frequent first)
neighbourhood=data_c["neighbourhood"].value_counts()
print(f"{neighbourhood}\n")

In [None]:
# Histogram of rows per neighbourhood, drawn with the x tick labels removed
plt.figure(figsize=(6,6))
# NOTE: fg3 receives the return value of set_xticks([]), not the Axes returned by histplot
fg3=sns.histplot(data = data_c, x = "neighbourhood").set_xticks([])
plt.tight_layout()
plt.show()

In [None]:
# Takeaway from the neighbourhood histogram
print("In the same way the offered houses are concetrated in some neighborhoods.")

In [None]:
# Count rows per room_type; the resulting Series is reused by the bar plot below
room_type=data_c["room_type"].value_counts()
print(f"{room_type}\n")

In [None]:
# Bar chart of the per-room-type counts computed with value_counts().
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(x=room_type.index, y=room_type.values, ax=ax)
ax.set(xlabel="Room Type", ylabel="Count", title=None)
fig.tight_layout()
plt.show()


In [None]:
# Takeaway from the room_type counts
print("There are more availability of entire Homes/Apts to rent than Private Rooms, and Shared Rooms is the least available type of room.")

In [None]:
# Count how often each availability_365 value occurs (most frequent first)
availability_365=data_c["availability_365"].value_counts()
print(f"{availability_365}\n")

In [None]:
# Histogram of the availability_365 values.
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(data=data_c, x="availability_365", ax=ax)
ax.set(xlabel="Availability 365", ylabel="Count", title=None)
fig.tight_layout()
plt.show()

In [None]:
# Takeaway from the availability_365 histogram
print("Most of the houses are not available during the year, however, there are plenty of them that are available all the year")

Analysis on numeric variables

In [None]:
# Numerical columns that are still present in the reduced frame.
new_numerical_columns = [
    "price",
    "minimum_nights",
    "number_of_reviews",
    "calculated_host_listings_count",
]
df_new_numerical_columns = data_c.loc[:, new_numerical_columns]
df_new_numerical_columns.head()

In [None]:
# Box plot (top) and histogram (bottom) of price on a shared x axis.
# figsize is passed to plt.subplots directly: the previous separate plt.figure(figsize=(6,6))
# call only opened an extra, empty figure and did not size the figure that holds the subplots.
f, (box, hist) = plt.subplots(2, sharex=True, figsize=(6, 6))
sns.histplot(x=data_c["price"], ax=hist)
sns.boxplot(x=data_c["price"], ax=box)

plt.tight_layout()
plt.show()

In [None]:
# Box plot (top) and histogram (bottom) of minimum_nights on a shared x axis.
# figsize is passed to plt.subplots directly: the previous separate plt.figure(figsize=(6,6))
# call only opened an extra, empty figure and did not size the figure that holds the subplots.
f, (box, hist) = plt.subplots(2, sharex=True, figsize=(6, 6))
sns.histplot(x=data_c["minimum_nights"], ax=hist)
sns.boxplot(x=data_c["minimum_nights"], ax=box)

plt.tight_layout()
plt.show()

In [None]:
# Box plot (top) and histogram (bottom) of number_of_reviews on a shared x axis.
# figsize is passed to plt.subplots directly: the previous separate plt.figure(figsize=(6,6))
# call only opened an extra, empty figure and did not size the figure that holds the subplots.
f, (box, hist) = plt.subplots(2, sharex=True, figsize=(6, 6))
sns.histplot(x=data_c["number_of_reviews"], ax=hist)
sns.boxplot(x=data_c["number_of_reviews"], ax=box)

plt.tight_layout()
plt.show()

In [None]:
# Box plot (top) and histogram (bottom) of calculated_host_listings_count on a shared x axis.
# figsize is passed to plt.subplots directly: the previous separate plt.figure(figsize=(6,6))
# call only opened an extra, empty figure and did not size the figure that holds the subplots.
f, (box, hist) = plt.subplots(2, sharex=True, figsize=(6, 6))
sns.histplot(x=data_c["calculated_host_listings_count"], ax=hist)
sns.boxplot(x=data_c["calculated_host_listings_count"], ax=box)

plt.tight_layout()
plt.show()

In [None]:
# Takeaway from the box plots of the numeric columns.
# The message previously carried invisible zero-width characters between "values" and "out";
# they are removed so the printed text is plain.
print("Analyzing the graphs, there are many values out of range.")

#### Step 4: Analysis of multivariate variables

Numerical-numerical analysis

In [None]:
# Correlation matrix of the numeric columns, drawn on a fixed -1..1 colour scale.
fig, ax = plt.subplots(figsize=(10, 6))
numeric_corr = data_c[new_numerical_columns].corr().round(3)
sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True, ax=ax)

In [None]:
# Takeaway from the numeric correlation heatmap
print("Looking at the above relationships, we see that the relationship between the numerical variables is very low")

Categorical-categorical analysis

In [None]:
# Display the categorical subset selected earlier from data_c
df_new_categorical_columns

In [None]:
# Row counts per room type, with one bar per neighbourhood group.
fig, axis = plt.subplots(figsize=(5, 4))
sns.countplot(data=data_c, x="room_type", hue="neighbourhood_group", ax=axis)

# Render the figure
plt.show()

In [None]:
# Takeaway from the room_type / neighbourhood_group count plot
print("Again, we see that the most of the available houses are located in Manhattan.")
print("There are not many shared rooms available for rent, by the other hand, there are many Entire home/aptments available for rent, meaning probably more people sharing their home or trying to maximize.")

Numerical-categorical analysis (complete)

In [None]:
# Re-check dtypes before the text columns are converted to integer codes
data_c.info()

In [None]:
# Factorize the Room Type and Neighborhood Data
data_c["room_type"] = pd.factorize(data_c["room_type"])[0]
data_c["neighbourhood_group"] = pd.factorize(data_c["neighbourhood_group"])[0]
data_c["neighbourhood"] = pd.factorize(data_c["neighbourhood"])[0]

In [None]:
# Correlation matrix over the listed columns of data_c, annotated with two decimals.
heatmap_columns = [
    "host_id",
    "neighbourhood_group",
    "neighbourhood",
    "room_type",
    "price",
    "minimum_nights",
    "number_of_reviews",
    "calculated_host_listings_count",
    "availability_365",
]
fig, axes = plt.subplots(figsize=(10, 10))
sns.heatmap(data_c[heatmap_columns].corr(), annot=True, fmt=".2f", ax=axes)
fig.tight_layout()

# Render the figure
plt.show()

In [None]:
# Takeaway from the full correlation heatmap
print("Looking at the data here, there's a very low correlation between all the data.")
print("As expected, there is a strong relationship between neighborhoods and Boroughs.")

Analysing all the Data at Once

In [None]:
# Pairwise plot grid over the columns of data_c
sns.pairplot(data = data_c)

#### Step 5: Feature engineering

Outlier analysis

In [None]:
# Summary statistics per column, used to spot extreme values
data_c.describe()

In [None]:
# Box plot of the neighbourhood_group column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=data_c, y="neighbourhood_group", ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the room_type column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=data_c, y="room_type", ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the availability_365 column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=data_c, y="availability_365", ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Conclusion of the outlier analysis: which columns are affected
print("From previous analysis we can easily determine that the variables affected by outliers are price, minimum_nights, number_of_reviews, calculated_host_listings_count.")

Missing value analysis

In [None]:
# Missing values per column, largest count first
data_c.isnull().sum().sort_values(ascending=False)

Feature scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Predictors to rescale with MinMaxScaler; price is left unscaled and appended afterwards.
# NOTE(review): the scaler is fitted on every row, before the train/test split performed in
# the feature-selection step — confirm this is intended.
num_variables = [
    "number_of_reviews",
    "minimum_nights",
    "calculated_host_listings_count",
    "availability_365",
    "neighbourhood_group",
    "room_type",
]

scaler = MinMaxScaler()
scal_features = scaler.fit_transform(data_c[num_variables])

# Rebuild a DataFrame that keeps the original row index and column names.
df_scal = pd.DataFrame(scal_features, index=data_c.index, columns=num_variables)
df_scal["price"] = data_c["price"]
df_scal.head()

#### Step 6: Feature selection

In [None]:
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.model_selection import train_test_split

# Separate the predictors from the target (price).
X = df_scal.drop("price", axis = 1)
y = df_scal["price"]

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# price is a continuous target, so features are scored with the regression F-test.
# chi2 (used previously) is a classification score: it treated every distinct price
# as a separate class label, which is not meaningful for a numeric target.
selection_model = SelectKBest(f_regression, k = 4)
selection_model.fit(X_train, y_train)

# Boolean mask of the selected columns, used to recover their names.
ix = selection_model.get_support()
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns = X_train.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns = X_test.columns.values[ix])

X_train_sel.head()

Save the clean train and test data

In [None]:
from pathlib import Path

# Re-attach the target. The assignment is positional (via list) because the selected-feature
# frames were built without the original row index.
X_train_sel["price"] = list(y_train)
X_test_sel["price"] = list(y_test)

# Create the output folder first: to_csv does not create missing directories and would
# otherwise fail on a fresh checkout.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

X_train_sel.to_csv(processed_dir / "clean_train.csv", index = False)
X_test_sel.to_csv(processed_dir / "clean_test.csv", index = False)