In [None]:

import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns', None)

In [None]:
# Load the apartment rental offers CSV into a DataFrame.
# The path is relative to the notebook's working directory.
data = pd.read_csv('../input/apartment-rental-offers-in-germany/immo_data.csv')

In [None]:
# Preview the first rows to check column names and value formats.
data.head()

In [None]:
def one_encode(df, columns):
    """Replace ``columns`` in ``df`` with one-hot indicator columns.

    The indicator columns are named ``<columns>_<value>`` and appended to the
    right of the remaining columns. A new frame is returned; ``df`` itself is
    left untouched.
    """
    indicator_frame = pd.get_dummies(df[columns], prefix=columns)
    return pd.concat([df, indicator_frame], axis=1).drop(columns, axis=1)

In [None]:
def data_preprocess(data, train_size=0.7, random_state=1):
    """Turn the raw listings frame into scaled train/test splits.

    The target is ``isapartment``: 1 when ``typeOfFlat`` equals 'apartment',
    otherwise 0. Rows whose ``typeOfFlat`` is missing have no known label and
    are dropped rather than being counted as class 0.

    Pipeline:
      1. drop the high-cardinality text columns;
      2. drop columns with more than 25% missing values (never ``typeOfFlat``);
      3. drop unlabeled rows and build ``isapartment``;
      4. fill remaining gaps (numeric -> column mean, object -> 'missing');
      5. cast boolean columns to int;
      6. split ``date`` into numeric ``Month`` and ``Year``;
      7. one-hot encode the remaining object columns;
      8. train/test split, then standard-scale using training statistics only.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. Must contain ``typeOfFlat``, ``date`` and the text columns
        dropped in step 1. ``date`` is expected to hold strings made of a
        three-letter English month abbreviation followed by a two-digit year
        (e.g. 'May19'). The caller's frame is not modified.
    train_size : float, default 0.7
        Fraction of rows assigned to the training split.
    random_state : int, default 1
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standard-scaled features, indexed like the matching label series.
    y_train, y_test : pandas.Series
        ``isapartment`` labels.
    """
    # Non-inplace drop: the caller's frame stays intact, so the notebook cell
    # calling this function can be re-run without a KeyError.
    data = data.drop(
        ["houseNumber", "street", "streetPlain", "regio3", "description", "facilities"],
        axis=1,
    )

    # Drop the columns having more than 25% missing values. The label source
    # column is exempt: it is needed below and is removed explicitly afterwards.
    null_fraction = data.isna().mean()
    sparse_columns = [
        column
        for column in null_fraction.index[null_fraction > 0.25]
        if column != 'typeOfFlat'
    ]
    data = data.drop(sparse_columns, axis=1)

    # Treat missing values in the label column: drop those rows, then build
    # the binary label and discard the source column.
    data = (
        data.dropna(subset=['typeOfFlat'])
        .assign(isapartment=lambda frame: (frame['typeOfFlat'] == 'apartment').astype(int))
        .drop('typeOfFlat', axis=1)
    )

    # Remaining missing values: numeric columns get their mean, object columns
    # get the literal category 'missing'.
    # NOTE(review): the means are computed on the full frame before the split,
    # so test rows influence the imputation values.
    na_columns = data.columns[data.isna().any()]
    categorical_na_columns = data[na_columns].select_dtypes('object').columns
    fill_values = {
        column: data[column].mean()
        for column in na_columns
        if column not in categorical_na_columns
    }
    fill_values.update(dict.fromkeys(categorical_na_columns, 'missing'))
    if fill_values:
        data = data.fillna(fill_values)

    # Convert boolean columns into int columns. The builtin ``int`` is used
    # because the ``np.int`` alias no longer exists in current NumPy.
    bool_columns = data.select_dtypes('bool').columns
    data = data.astype({column: int for column in bool_columns})

    # Split the date column into numeric Month and Year features.
    month_numbers = {
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
    }
    date_text = data['date']
    data = data.assign(
        Month=date_text.str[:3].map(month_numbers),
        Year=('20' + date_text.str[3:5]).astype(int),
    ).drop('date', axis=1)

    # Encode the rest of the categorical columns.
    for column in data.select_dtypes('object').columns:
        data = one_encode(data, column)

    # Split the dataset into X and y.
    X = data.drop('isapartment', axis=1)
    y = data['isapartment']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    # Scale the data with statistics learned from the training split only.
    # The original row index is kept so features stay aligned with y_train/y_test.
    sc = StandardScaler()
    sc.fit(X_train)

    X_train = pd.DataFrame(sc.transform(X_train), index=X_train.index, columns=X.columns)
    X_test = pd.DataFrame(sc.transform(X_test), index=X_test.index, columns=X.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing pipeline and unpack the scaled feature frames and label series.
(X_train, X_test, y_train, y_test) = data_preprocess(data)

In [None]:
# (rows, columns) of the scaled training features.
X_train.shape

In [None]:
# Number of training labels; the row count should match X_train.
y_train.shape

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model, and the scores reported from it,
# are reproducible across Restart & Run All.
RF = RandomForestClassifier(random_state=1)
RF.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training split, then on the held-out test split (one per line).
print(RF.score(X_train, y_train), RF.score(X_test, y_test), sep='\n')