In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
def convert_non_convertible_strings_to_na(value):
    """Return ``value`` unchanged if ``int(value)`` succeeds, else the sentinel ``"-1"``.

    The value is only probed with ``int()``; it is never converted, so a
    string such as ``"42"`` is returned as that same string.

    Args:
        value: Any object, typically a raw cell value.

    Returns:
        ``value`` itself when ``int(value)`` succeeds, otherwise the string
        ``"-1"``.
    """
    try:
        int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to the sentinel. Unrelated exceptions
        # (e.g. KeyboardInterrupt) are allowed to propagate.
        return "-1"
    return value

In [None]:
def _parse_year(value):
    """Read an integer year from a raw cell value; return -1 when none can be read."""
    try:
        if isinstance(value, str):
            # Keep only the digit characters, e.g. "c. 1990" -> 1990.
            digits = "".join(ch for ch in value if ch.isdigit())
            return int(digits) if digits else -1
        # Numeric cells (a numerically typed column) keep their value instead
        # of being discarded; missing values become -1.
        if pd.isna(value):
            return -1
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return -1


def clean(data):
    """Prepare the listings frame for modelling, modifying ``data`` in place.

    Steps, in order:
      * "Year": reduced to an integer (-1 when no year can be read).
      * "Area" / "Bedrooms": missing values and values that ``int()`` rejects
        are replaced by the string sentinel ``"-1"``; other values are kept
        as they are.
      * "Extra Features": lower-cased and searched to build the boolean
        columns "Elevator", "Semi-furnished", "Furnished", "Waterfront" and
        "Basement"; the text column is then dropped.
      * "Region": replaced by integer codes from the module-level
        ``label_encoder``, which is (re)fitted on this column.
      * Any remaining missing values are filled with -1.

    Args:
        data: DataFrame with at least the columns "Year", "Area", "Bedrooms",
            "Extra Features" and "Region".

    Returns:
        The same ``data`` object, after modification.
    """
    data["Year"] = data["Year"].apply(_parse_year)

    # Replace missing or non-integer values with the "-1" sentinel (rows are kept).
    for column in ("Area", "Bedrooms"):
        data[column] = data[column].fillna("-1").apply(convert_non_convertible_strings_to_na)

    # Boolean features derived from the free-text feature list.
    features = data["Extra Features"].fillna("").str.lower()
    data["Elevator"] = features.str.contains("elevator")
    data["Semi-furnished"] = features.str.contains("semi-furnished")
    # NOTE(review): this is a substring match, so "unfurnished" also counts as
    # furnished, and any "semi" in the text (e.g. "semi-detached") clears the
    # flag -- confirm against the values actually present in the data.
    data["Furnished"] = features.str.contains("furnished") & ~features.str.contains("semi")
    data["Waterfront"] = features.str.contains("water|fleuve|river|canal|lake")
    data["Basement"] = features.str.contains("basement")

    # Categorical data conversion
    data["Region"] = label_encoder.fit_transform(data["Region"])

    # The raw text column is no longer needed once the flags exist.
    data.drop(columns=["Extra Features"], inplace=True)

    # Fill na
    data.fillna(-1, inplace=True)
    return data

In [None]:
# Read the listings and keep only the columns used below; "Rent" is the
# column later taken as the target.
data = pd.read_csv("../data/rent.csv").loc[
    :,
    ["Region", "Year", "Extra Features", "Area", "Bathrooms", "Bedrooms", "Rent"],
]
# Module-level encoder; clean() fits it on the "Region" column.
label_encoder = LabelEncoder()

In [None]:
# clean() modifies `data` in place and returns that same object. Everything
# below reads from `cleaned_data` only, so the split does not depend on that
# aliasing.
cleaned_data = clean(data)
X = cleaned_data.drop(columns=["Rent"])
y = cleaned_data["Rent"]

# Fixed seed so the 80/20 split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

In [None]:
# Candidate models. The forest is seeded so its results repeat between runs.
forest_regressor = RandomForestRegressor(n_estimators=5, max_depth=5, random_state=6)
# `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. It is dropped here: an unregularised
# least-squares fit should give the same predictions with or without that
# scaling (barring collinear columns).
linear_regressor = LinearRegression()

# Model used by the fit/score cell; point this at `forest_regressor` to compare.
estimator = linear_regressor

In [None]:
# Fit on the training split; the cell's value is the score on the held-out
# split (R^2 for scikit-learn regressors).
estimator.fit(X_train, y_train).score(X_test, y_test)