In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the training set from the relative input directory. No date parsing is
# requested here; the later yearMonth cell slices 'timestamp' values as strings.
train_df=pd.read_csv('../input/train.csv')

In [None]:
# (row count, column count) of the training frame.
train_df.shape

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline
plt.scatter(range(train_df.shape[0]),np.sort(train_df['price_doc']))
plt.show()

In [None]:
import seaborn as sns

In [None]:
# Distribution of the raw target. `sns.distplot` is deprecated; a
# density-normalised histogram with a KDE overlay is the documented replacement.
sns.histplot(train_df['price_doc'], kde=True, stat='density')

In [None]:
# Distribution of the natural log of the target. `sns.distplot` is deprecated;
# a density-normalised histogram with a KDE overlay is the documented replacement.
sns.histplot(np.log(train_df['price_doc']), kde=True, stat='density')

In [None]:
# Peek at the first timestamp values.
train_df['timestamp'].head()

In [None]:
# Concatenate characters 0-3 and 5-6 of each timestamp string into a grouping
# key (year + month, assuming 'YYYY-MM-DD'-style strings -- TODO confirm format).
train_df['yearMonth'] = [stamp[:4] + stamp[5:7] for stamp in train_df['timestamp']]

In [None]:
# Median price_doc for each yearMonth key, returned as a flat two-column frame.
grouped_df = (
    train_df
    .groupby('yearMonth')['price_doc']
    .median()
    .reset_index()
)

In [None]:
# Bar chart of the median price for each yearMonth bucket.
# x and y are passed by keyword: current seaborn accepts only `data`
# positionally, so the positional form raises TypeError there.
sns.barplot(x=grouped_df.yearMonth.values, y=grouped_df.price_doc.values, color='red')
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Year Month', fontsize=12)
# Vertical tick labels keep the many yearMonth categories from overlapping.
plt.xticks(rotation='vertical')

plt.show()

In [None]:
# One row per training column: reset_index() turns the dtypes Series into a
# two-column frame (column name, dtype).
dtype_df = train_df.dtypes.reset_index()

In [None]:
 # Name the two columns. The column holding the original column names is
 # called 'count' so that the groupby(...).aggregate('count') applied to this
 # frame afterwards produces a column of that name with the per-dtype tally.
 # NOTE(review): this line carries a stray leading space; IPython tolerates it,
 # but it would be an IndentationError if the cell were run as a plain script.
 dtype_df.columns=['count','column_type']

In [None]:
# Collapse to one row per dtype, counting how many columns have that dtype.
dtype_df = (
    dtype_df
    .groupby('column_type')
    .count()
    .reset_index()
)

In [None]:
# Display the per-dtype column counts.
dtype_df

In [None]:
# Count null entries per column, then flatten the resulting Series into a frame.
null_totals = train_df.isnull().sum()
missing_df = null_totals.reset_index()

In [None]:
# Give the two columns produced by reset_index() descriptive names.
missing_df.columns = ['column_name','missing_count']

In [None]:
# Keep only the columns that have at least one missing value.
# `.ix` was removed in pandas 1.0; `.loc` with a boolean mask is the replacement.
missing_df = missing_df.loc[missing_df['missing_count'] > 0]

In [None]:
# Integer bar positions, one for each remaining row of missing_df.
ind = np.arange(len(missing_df))

In [None]:
# Horizontal bar chart: one bar per column with missing data, bar length equal
# to the number of missing values in that column.
fig, ax = plt.subplots(figsize=(12, 18))
ax.barh(ind, missing_df.missing_count.values)
# Place a tick at each bar and label it with the corresponding column name.
ax.set_yticks(ind)
ax.set_yticklabels(missing_df.column_name.values, rotation='horizontal')
ax.set_xlabel("Count of missing values")  # fixed typo: was "missiong"
ax.set_title("Number of Missing values")
plt.show()

In [None]:
from sklearn import  preprocessing, model_selection

In [None]:
import sklearn
print(sklearn.__version__)

In [None]:
import xgboost as xgb

In [None]:
# Label-encode every object-typed column of train_df in place so the feature
# matrix handed to XGBoost is fully numeric. Each column gets its own encoder,
# fitted on that column's values.
for f in train_df.columns:
    if train_df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        train_df[f] = lbl.fit_transform(list(train_df[f].values))


# Target vector, and the feature matrix with the id, the date-derived columns
# and the target itself removed.
train_y = train_df.price_doc.values
train_X = train_df.drop(['id','timestamp','price_doc','yearMonth'],axis=1)

# 'reg:linear' and 'silent' are deprecated XGBoost parameter spellings;
# 'reg:squarederror' and 'verbosity' are their current equivalents.
xgb_params = {
    'eta':0.05,
    'max_depth':8,
    'subsample':0.7,
    'colsample_bytree':0.7,
    'objective':'reg:squarederror',
    'eval_metric':'rmse',
    'verbosity':0
}

# feature_names is passed as a plain list of strings rather than a NumPy array.
dtrain = xgb.DMatrix(train_X,train_y,feature_names=train_X.columns.tolist())
# Train with warnings enabled (verbosity=1), overriding the quiet default above.
model = xgb.train(dict(xgb_params,verbosity=1),dtrain,num_boost_round=100)

# Plot the 50 most important features of the fitted booster on a tall canvas.
fig,ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model,max_num_features=50,height=0.8,ax=ax)
plt.show()

In [None]:
# Cap price_doc at its 0.5th / 99.5th percentiles to limit the effect of outliers.
# Series.clip replaces the chained `.ix[...] = value` assignments: `.ix` was
# removed in pandas 1.0, and chained assignment is not guaranteed to write back
# to train_df.
ulimit = np.percentile(train_df.price_doc.values,99.5)
llimit = np.percentile(train_df.price_doc.values,0.5)
train_df['price_doc'] = train_df['price_doc'].clip(lower=llimit, upper=ulimit)

# Same percentile capping for the full_sq column.
col = "full_sq"
ulimit = np.percentile(train_df[col].values, 99.5)
llimit = np.percentile(train_df[col].values, 0.5)
train_df[col] = train_df[col].clip(lower=llimit, upper=ulimit)

# Joint plot of log(full_sq) against log(price_doc).
# jointplot builds its own figure, so the figure size is set through `height`
# instead of a separate plt.figure() call (which only produced an empty figure).
# x and y are passed by keyword because current seaborn rejects them positionally.
sns.jointplot(x=np.log(train_df.full_sq.values), y=np.log(train_df.price_doc.values), height=12)
plt.show()

In [None]:
# Treat missing life_sq as 0, then cap the column at its 5th / 95th percentiles.
# The column is reassigned instead of calling fillna(inplace=True) on a selected
# column, which is not guaranteed to modify train_df. `.ix` (removed in
# pandas 1.0) is replaced by Series.clip.
col = "life_sq"
train_df[col] = train_df[col].fillna(0)
ulimit = np.percentile(train_df[col].values, 95)
llimit = np.percentile(train_df[col].values, 5)
train_df[col] = train_df[col].clip(lower=llimit, upper=ulimit)

# KDE joint plot of log1p(life_sq) against log1p(price_doc).
# `size` was renamed to `height` in seaborn; jointplot creates its own figure,
# so no separate plt.figure() call is needed.
grid = sns.jointplot(x=np.log1p(train_df.life_sq.values), y=np.log1p(train_df.price_doc.values),
                     kind='kde', height=10)
# Label the joint axes explicitly rather than whichever axes pyplot considers current.
grid.set_axis_labels('Log of living area in square metre', 'Log of Price', fontsize=12)
plt.show()

In [None]:
# Number of rows at each value of 'floor', with vertical tick labels so the
# many categories stay readable.
floor_axes = sns.countplot(data=train_df, x="floor")
plt.setp(floor_axes.get_xticklabels(), rotation='vertical')
plt.show()


In [None]:
# Number of rows at each value of 'max_floor'.
# The column is named via x=/data= (matching the 'floor' plot): current seaborn
# interprets a lone positional argument as `data`, not as the x variable.
sns.countplot(x="max_floor", data=train_df)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Distribution of price_doc for each value of max_floor.
# x and y are passed by keyword: current seaborn accepts only `data`
# positionally, so the positional form raises TypeError there.
sns.boxplot(x="max_floor", y="price_doc", data=train_df)
plt.xticks(rotation='vertical')
plt.show()