# In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier, BaggingClassifier, \
    AdaBoostClassifier, GradientBoostingClassifier
warnings.filterwarnings('ignore')

# Load the data and derive calendar features from the transaction timestamp.
train_df = pd.read_csv("../input/train.csv")

# Parse the timestamps once and keep the result (the previous code parsed the
# column and threw the result away). The 'timestamp' column itself is left as
# text so that dtype-based column selection elsewhere in the script is
# unaffected.
parsed_timestamp = pd.to_datetime(train_df['timestamp'])

# 'yearmonth' is a 'YYYYMM' string and 'month' a zero-padded 'MM' string.
# NOTE(review): this matches the former character slicing (x[:4]+x[5:7] and
# x[5:7]) as long as the raw text is ISO 'YYYY-MM-DD...' - confirm.
train_df['yearmonth'] = parsed_timestamp.dt.strftime('%Y%m')
train_df['month'] = parsed_timestamp.dt.strftime('%m')

# Price per unit of **life_sq**.
price_per_life_sq = train_df['price_doc'] / train_df['life_sq']
# A life_sq of 0 yields +/-inf, which dropna() alone would keep; turn those
# into NaN first so the "clean" series holds only finite ratios.
price_per_life_sq_clean = (
    price_per_life_sq.replace([np.inf, -np.inf], np.nan).dropna()
)

# Sberbank EDA
# Shape of the dataset (printed so it is visible outside a notebook).
print(train_df.shape)
pd.options.display.max_columns = 30
# First rows, transposed so each feature is shown on its own line.
print(train_df.head().transpose())

# Missing value analysis: percentage of missing entries per column, keeping
# only the columns that have at least one missing value, largest first.
train_na = train_df.isnull().mean() * 100
train_na = train_na[train_na != 0].sort_values(ascending=False)

f, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=train_na.index, y=train_na, ax=ax)
ax.set(title='Percent missing data by feature', ylabel='% missing')
# Rotate the labels after the bars exist, and pass the angle as a number:
# matplotlib accepts a number or 'vertical'/'horizontal', not the string '90'.
plt.xticks(rotation=90)
plt.show()

# Find the features with the highest correlation to price_doc.
# Restrict to numeric/bool columns explicitly: recent pandas versions raise
# on text columns in DataFrame.corr() instead of silently skipping them.
numeric_df = train_df.select_dtypes(include=['number', 'bool'])
corr_val = numeric_df.corr()['price_doc']
abs_corr = corr_val.abs()
ordered_indecis = abs_corr.sort_values(ascending=False).index
# Keep features whose absolute correlation with price_doc exceeds 0.28
# (price_doc itself is included, with correlation 1).
high_corr_col = corr_val[abs_corr > 0.28].index
print(high_corr_col)

corrmat = train_df[high_corr_col].corr()
f, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corrmat, square=True, linewidths=.5, annot=True, ax=ax)
# Numeric angle, applied once the heatmap has created its tick labels.
plt.xticks(rotation=90)
plt.show()

# Descriptive analysis
# First deal with the missing values: keep only rows where every highly
# correlated feature is present.
v = list(high_corr_col)
v_no_na = train_df[v].dropna()
print('rows before dropping missing values:', len(train_df[high_corr_col]))
print('rows after dropping missing values:', len(v_no_na))

# Correlation after dropping the rows with missing values.
corrmat_no_na = v_no_na.corr()
f, ax = plt.subplots(figsize=(10, 7))
# Plot the matrix computed from the NaN-free rows (the previous code drew
# the first matrix a second time).
sns.heatmap(corrmat_no_na, square=True, linewidths=.5, annot=True, ax=ax)
plt.xticks(rotation=90)
plt.show()
# Summary statistics of the log10-transformed target.
print(np.log10(train_df['price_doc']).describe())


def _scatter_log_price(frame, column, color, title, xlabel):
    """Scatter log10(price_doc) against one feature column of *frame*.

    frame  -- DataFrame holding both *column* and 'price_doc'
    column -- name of the feature plotted on the x axis
    color  -- matplotlib colour for the markers
    title  -- plot title
    xlabel -- label for the x axis
    """
    fig, axes = plt.subplots(figsize=(10, 7))
    axes.scatter(x=frame[column], y=np.log10(frame['price_doc']),
                 c=color, alpha=0.5)
    # The y values are log10 of the price, so label the axis accordingly.
    axes.set(title=title, xlabel=xlabel, ylabel='log10(price)')
    plt.show()


# Bivariate analysis
# Relationship between the top correlated features and price_doc.

# num_room
# Series.corr aligns on the index, so rows without num_room are ignored.
num_room_no_na_price = train_df['num_room'].dropna().corr(train_df['price_doc'])
print(num_room_no_na_price)
_scatter_log_price(train_df, 'num_room', 'r',
                   'Price by number of rooms', 'number of rooms')

# max_floor
plt.figure(figsize=(12, 8))
sns.boxplot(x="max_floor", y=np.log10(train_df['price_doc']), data=train_df)
plt.ylabel('log price', fontsize=12)
plt.xlabel('max_floor', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

# full_sq
# Leave out rows with full_sq above 2000 so they do not squash the plot;
# the filtered frame is built once and reused for both axes.
ind = train_df[train_df['full_sq'] > 2000].index
full_sq_trimmed = train_df.drop(ind)
_scatter_log_price(full_sq_trimmed, 'full_sq', 'b',
                   'Price by area in sq meters', 'Area')

# sadovoe_km
_scatter_log_price(train_df, 'sadovoe_km', 'g',
                   'Price by distance in km from sadovoe', 'Distance (km)')

# zd_vokzaly_avto_km
_scatter_log_price(train_df, 'zd_vokzaly_avto_km', 'y',
                   'Price by distance in km from zd_vokzaly_avto',
                   'Distance (km)')

# sport_count_3000
_scatter_log_price(train_df, 'sport_count_3000', 'b',
                   'Price by The number of sport facilities in 3000 meters zone',
                   'sport facilities')

# trc_count_5000
_scatter_log_price(train_df, 'trc_count_5000', 'r',
                   'Price by The number of shopping malls in 5000 meters zone',
                   'shopping malls')

# Outliers analysis
# log10 of the target, computed once and reused by every plot below.
log_price = np.log10(train_df['price_doc'])

# Distribution of the log price.
plt.figure(figsize=(12, 8))
if hasattr(sns, 'histplot'):
    # distplot is deprecated in current seaborn; histplot with a density
    # scale and a KDE overlay is its replacement.
    sns.histplot(log_price.values, bins=1000, kde=True, stat='density')
else:
    # Older seaborn releases only provide distplot.
    sns.distplot(log_price.values, bins=1000, kde=True)
plt.xlabel('log price', fontsize=12)
plt.show()

# Log price per year-month. The categories are ordered explicitly so the
# x axis does not depend on the order of the rows in the file.
plt.figure(figsize=(12, 8))
sns.boxplot(x=train_df['yearmonth'], y=log_price,
            order=sorted(train_df['yearmonth'].dropna().unique()))
plt.ylabel('log price', fontsize=12)
plt.xlabel('yearmonth', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

# Log price per calendar month, again with an explicit sorted order.
plt.figure(figsize=(12, 8))
sns.boxplot(x=train_df['month'], y=log_price,
            order=sorted(train_df['month'].dropna().unique()))
plt.ylabel('log price', fontsize=12)
plt.xlabel('month', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
# Preprocess: keep the non-text columns and replace missing values with 0.
tr = train_df.select_dtypes(exclude=['object'])
train_fillna = tr.fillna(0)

# Features are every remaining column except the target, price_doc.
X = train_fillna.drop(['price_doc'], axis=1)
y = train_fillna['price_doc']

# Split the data 70/30. The seed is fixed so that the split, and therefore
# the reported scores, are reproducible from run to run.
X_train, X_test, y_train, y_test = split(X, y, train_size=0.7, random_state=42)

# Fit the model (a regressor, despite the historical name `clf`); the forest
# is seeded for the same reproducibility reason.
clf = RandomForestRegressor(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Report both scores; as bare expressions they were computed and discarded.
# For scikit-learn regressors, score() is the R^2 coefficient.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print('train score:', train_score)
print('test score:', test_score)