In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the training CSV into the working frame used throughout the EDA section.
train_csv = '/kaggle/input/rental-price-of-indias-it-capital-pune-mh-ind/train.csv'
df = pd.read_csv(train_csv)

print(df.shape)
df.head()

In [None]:
# Display summary statistics of the loaded frame.
df.describe()

# EDA

## Missing Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

## Target Variable

In [None]:
# Name of the column the models will predict.
target = "rent"

In [None]:
# Show the five listings with the highest target value.
by_rent_desc = df.sort_values(target, ascending = False)
by_rent_desc.head(5)

The first entry looks like a data-entry error: its rent is far too large compared with the other listings, so we drop it.

In [None]:
# Drop the row(s) holding the maximum rent, which is treated as an erroneous listing.
# Series.max() skips NaN; the builtin max() compares element by element and can
# return NaN depending on row order, in which case nothing would be dropped.
max_rent_rows = df.index[df['rent'] == df['rent'].max()]
df = df.drop(index = max_rent_rows)

In [None]:
# Box plot of the target to visualise its spread and remaining outliers.
df.boxplot(column = [target])
plt.show()

In [None]:
# Histogram of the raw target values.
ax = df[target].hist(bins = 15)
ax.set_xlabel(target)
ax.set_ylabel('count')
plt.show()

Since the distribution is right-skewed, we can apply a log transformation.

In [None]:
## log transformation
# Replace the target by its natural log, then re-plot the histogram.
# Note: the target column is overwritten, so running this cell twice logs it twice.

df[target] = np.log(df[target])

ax = df[target].hist(bins = 15)
ax.set_xlabel(target)
ax.set_ylabel('count')
plt.show()

## Numerical Features

In [None]:
# Numerical features: every non-object column other than the target.
num_features = [col for col in df.columns if col != target and df[col].dtype != 'O']

df[num_features].head()

### Discrete Features

In [None]:
# Discrete features: numerical columns with fewer than 20 distinct values
# (a missing value counts as one distinct value).
dis_features = [
    col for col in num_features
    if col != target and df[col].nunique(dropna = False) < 20
]

print(dis_features)

#### vs Target Variable

In [None]:
# One bar chart per discrete feature: median of the (log) target within each level.
for feature in dis_features:
    medians = df.groupby(feature)[target].median()
    ax = medians.plot.bar()

    ax.set_xlabel(feature)
    ax.set_ylabel(target)
    ax.set_title('{} vs log({})_median'.format(feature, target))
    plt.show()

### Continuous Features

In [None]:
# Continuous features: the numerical columns that were not classified as discrete.
discrete = set(dis_features)
con_features = [col for col in num_features if col != target and col not in discrete]

print(con_features)

#### Distribution

In [None]:
# Histogram of each continuous feature, one figure per feature.
for feature in con_features:
    ax = df[feature].hist(bins = 15)

    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    plt.show()

#### Dealing with outliers

In [None]:
# Preview the outlier treatment on a copy so that df itself is left untouched.
data = df.copy()

for feature in con_features:

    column = data[feature]
    # Cap: median + 2 standard deviations, truncated to an int and bumped by one.
    upper_cap = int(column.median() + 2 * column.std()) + 1
    print('Values being replaced: {}'.format((column > upper_cap).sum()))

    # Values above the cap are set to the cap, then the square root is taken.
    data[feature] = column.clip(upper = upper_cap) ** 0.5

    ax = data[feature].hist(bins = 15)

    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    plt.show()

#### vs Target Variable

In [None]:
# Scatter each capped, square-rooted continuous feature against the target.
# Works on a fresh copy so that df itself is left untouched.
data = df.copy()

for feature in con_features:

    column = data[feature]
    # Cap: median + 2 standard deviations, truncated to an int and bumped by one.
    upper_cap = int(column.median() + 2 * column.std()) + 1
    data[feature] = column.clip(upper = upper_cap) ** 0.5

    plt.scatter(data[feature], data[target])

    plt.xlabel(feature)
    plt.ylabel(target)
    plt.show()

## Categorical Features

In [None]:
# Categorical features: every column that is neither numerical nor the target.
numeric = set(num_features)
cat_features = [col for col in df.columns if col != target and col not in numeric]

df[cat_features].head()

In [None]:
# Report how many distinct values each categorical feature takes
# (a missing value counts as one distinct value).
for feature in cat_features:
    n_categories = df[feature].nunique(dropna = False)
    print('{}: {} categories'.format(feature, n_categories))

### Encoding addresses by average rent in the area

In [None]:
# Reduce every address to its locality: the text between the first and the second comma.
#
# Splitting on ',' replaces the original str.find() slicing, which returned -1 when an
# address had fewer than two commas and therefore silently chopped off the final
# character. With split():
#   * two or more commas -> the second field (same result as before)
#   * exactly one comma  -> everything after that comma
#   * no comma at all    -> the address unchanged
area = list(df['address'].astype(str))

area_new = np.array([
    pieces[1] if len(pieces) > 1 else pieces[0]
    for pieces in (a.split(',') for a in area)
])

In [None]:
# Overwrite 'address' with the extracted locality and add 'expo_rent', which raises e
# to the (log-transformed) rent so that area averages are taken on the original scale.
df = df.assign(
    address = area_new,
    expo_rent = np.e ** df['rent'],
)
df.head()

In [None]:
# Map every locality to the mean 'expo_rent' of its listings and store that
# average as the numeric encoding of the address.
area_mean = {
    locality: df.loc[df['address'] == locality, 'expo_rent'].mean()
    for locality in np.unique(area_new)
}

df['address_num'] = df['address'].map(area_mean)
df.head()

In [None]:
# Histogram of the numeric address encoding. Axis labels are set so the figure
# stands on its own, matching the other histograms in this notebook.
ax = df['address_num'].hist(bins = 15)
ax.set_xlabel('address_num')
ax.set_ylabel('count')
plt.show()

In [None]:
# The raw locality and the helper column are no longer needed once address_num exists.
df = df.drop(columns = ['address', 'expo_rent'])

### Dropping maintenance_amt 

In [None]:
# Remove the maintenance_amt column from the working frame.
df = df.drop(columns = ['maintenance_amt'])

In [None]:
# Keep cat_features in sync with the columns that were just dropped from df.
# Filtering (instead of list.remove) makes the cell safe to re-run: remove()
# raises ValueError the second time because the names are already gone.
dropped_columns = ('address', 'maintenance_amt')
cat_features = [col for col in cat_features if col not in dropped_columns]

### Distribution of categories

In [None]:
# Merge the misspelled level 'Unfurnishe' into 'Unfurnished', then list the levels left.
df['furnishing'] = df['furnishing'].replace('Unfurnishe', 'Unfurnished')
df['furnishing'].unique()

In [None]:
# Bar chart of the number of listings in each level of every categorical feature.
for feature in cat_features:
    level_counts = df.groupby(feature)[target].count()
    ax = level_counts.plot.bar()

    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    plt.show()

### vs Target Variable

In [None]:
# Bar chart of the mean target within each level of every categorical feature.
# The target column holds log(rent) at this point, so the y-axis is labelled as a
# mean of the log target (it was previously mislabelled 'count').
for feature in cat_features:
    level_means = df.groupby(feature)[target].mean()
    ax = level_means.plot.bar()

    ax.set_xlabel(feature)
    ax.set_ylabel('mean log({})'.format(target))
    ax.set_title('{} vs log({})_mean'.format(feature, target))
    plt.show()

# Feature Engineering

In [None]:
# Start feature engineering from a fresh copy of the raw training CSV,
# discarding every change the EDA section made to df.
train_csv = '/kaggle/input/rental-price-of-indias-it-capital-pune-mh-ind/train.csv'
df = pd.read_csv(train_csv)

print(df.shape)
df.head()

## Dropping Rows

In [None]:
# Drop the row(s) holding the maximum rent, which is treated as an erroneous listing.
# Series.max() skips NaN; the builtin max() compares element by element and can
# return NaN depending on row order, in which case nothing would be dropped.
max_rent_rows = df.index[df['rent'] == df['rent'].max()]
df = df.drop(index = max_rent_rows)

## Encoding address feature

In [None]:
# Reduce every address to its locality: the text between the first and the second comma.
#
# Splitting on ',' replaces the original str.find() slicing, which returned -1 when an
# address had fewer than two commas and therefore silently chopped off the final
# character. With split():
#   * two or more commas -> the second field (same result as before)
#   * exactly one comma  -> everything after that comma
#   * no comma at all    -> the address unchanged
area = list(df['address'].astype(str))

area_new = [
    pieces[1] if len(pieces) > 1 else pieces[0]
    for pieces in (a.split(',') for a in area)
]

df['address'] = np.array(area_new)

In [None]:
# Encode each locality by the mean rent of its listings. The rent column has not
# been log-transformed yet at this point, so the averages are on the original scale.
area_mean = {
    locality: df.loc[df['address'] == locality, 'rent'].mean()
    for locality in np.unique(np.array(area_new))
}

df['address_num'] = df['address'].map(area_mean)

## Transformation of Numerical Features

In [None]:
## log transformation of target variable
# Overwrites the target column with its natural log; running this cell a second
# time would apply the log again.

df[target] = np.log(df[target])

In [None]:
## replacement of outliers and square root transformation of continuous features
# con_features is the list built in the EDA section.

for feature in con_features:

    column = df[feature]
    # Cap: median + 2 standard deviations, truncated to an int and bumped by one.
    upper_cap = int(column.median() + 2 * column.std()) + 1
    # Values above the cap are set to the cap, then the square root is taken.
    df[feature] = column.clip(upper = upper_cap) ** 0.5

## Dropping Features

In [None]:
# maintenance_amt is discarded and address has been replaced by address_num.
df = df.drop(columns = ['maintenance_amt', 'address'])

## Encoding of Categorical Variables

In [None]:
# df was re-read from disk for feature engineering, so the 'Unfurnishe' -> 'Unfurnished'
# correction made during EDA is gone. Re-apply it before encoding; otherwise the
# misspelled label gets its own dummy column and splits one category in two.
df_labels_fixed = df.replace({'furnishing': {'Unfurnishe': 'Unfurnished'}})

# One-hot encode the object columns, dropping the first level of each.
dummy_df = pd.get_dummies(df_labels_fixed, drop_first = True)

print(dummy_df.shape)
dummy_df.head()

# Feature Selection

## Dropping using Correlation

In [None]:
# Pairwise correlations between the numerical features, drawn as an annotated heatmap.
cor = dummy_df[num_features].corr()

fig, ax = plt.subplots(figsize = (15, 6))
sns.heatmap(cor, annot = True, cmap = plt.cm.CMRmap_r, ax = ax)
plt.show()

Bedroom and bathrooms are correlated with each other, so one of them is redundant and can be dropped; we drop bathrooms.

In [None]:
# Remove the bathrooms column from the encoded frame.
dummy_df = dummy_df.drop(columns = ['bathrooms'])

## Selection using Lasso

In [None]:
# Separate the target column from the predictors.
y = dummy_df['rent']
X = dummy_df.drop(columns = ['rent'])

In [None]:
# Min-max scale all predictors before fitting the Lasso selector.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Fit a Lasso on the scaled predictors and let SelectFromModel pick features from it.
lasso_estimator = Lasso(alpha = 0.005, random_state = 0)
feature_sel_model = SelectFromModel(lasso_estimator)
feature_sel_model.fit(X_scaled, y)

In [None]:
# Column names kept by the selector, plus a short summary of the selection.
support_mask = feature_sel_model.get_support()
selected_features = X.columns[support_mask]

zeroed = np.count_nonzero(feature_sel_model.estimator_.coef_ == 0)

print('total features:', X.shape[1])
print('selected features:', len(selected_features))
print('features with coefficients shrunk to 0:', zeroed)

In [None]:
# Show the names of the columns kept by the selector.
print('Selected Features:\n', selected_features)

In [None]:
# Keep only the selected columns (unscaled values).
X_lasso = X.loc[:, selected_features]

# Models

In [None]:
# Predictors and target used by all models below.
X, y = X_lasso, dummy_df['rent']

## Scaling

In [None]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [None]:
# Fit the scaler on the training rows only, then apply it to both splits.
scaler = MinMaxScaler().fit(X_train)

X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

## Linear Regression

In [None]:
# Repeated hold-out evaluation of LinearRegression: ten 80/20 splits, reporting the
# mean of model.score on the held-out part.
# Each split is seeded with the loop index, so the ten splits still differ from one
# another but the printed mean is reproducible across kernel restarts.
scores = []

for seed in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)

    model = LinearRegression()
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

print(np.mean(scores))

## Random Forest Regression

In [None]:
# Repeated hold-out evaluation of RandomForestRegressor: ten 80/20 splits, reporting
# the mean of model.score on the held-out part.
# Both the split and the forest are seeded with the loop index, so the runs differ
# from one another but the printed mean is reproducible across kernel restarts.
scores = []

for seed in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)

    model = RandomForestRegressor(random_state = seed)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

print(np.mean(scores))

## XGBoost Regression

In [None]:
# Repeated hold-out evaluation of XGBRegressor: ten 80/20 splits, reporting the mean
# of model.score on the held-out part.
# Both the split and the booster are seeded with the loop index, so the runs differ
# from one another but the printed mean is reproducible across kernel restarts.
scores = []

for seed in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)

    model = XGBRegressor(random_state = seed)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

print(np.mean(scores))