<a href="https://colab.research.google.com/github/sonasejidli/DATA-EXPLORATION-WITH-PYTHON/blob/main/DATA_EXPLORATION_WITH_PYTHON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the training set. The relative path lives in one named constant so it
# is easy to repoint when the notebook is run from a different directory.
TRAIN_CSV_PATH = '../input/train.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Display the column labels of the training frame.
df_train.columns

In [None]:
# Summary statistics of the 'SalePrice' column.
df_train['SalePrice'].describe()

In [None]:
# Histogram of the 'SalePrice' column; the trailing ';' keeps the returned
# object's repr out of the cell output.
sns.distplot(df_train['SalePrice']);

In [None]:
# Shape of the 'SalePrice' distribution: skewness and kurtosis, both printed
# with the same "<label>: <value>" layout (the skewness label previously
# lacked the colon).
print('Skewness: %f' % df_train['SalePrice'].skew())
print('Kurtosis: %f' % df_train['SalePrice'].kurt())

# Relationship of 'SalePrice' with numerical variables

In [None]:
# Scatter plot of 'SalePrice' against 'GrLivArea', y-axis clipped to 0..800000.
var = 'GrLivArea'
data = df_train[['SalePrice', var]].copy()
data.plot(kind='scatter', x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# Scatter plot of 'SalePrice' against 'TotalBsmtSF'.
var = 'TotalBsmtSF'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
# ylim must be a (low, high) pair; the previous `(0.800000)` was a typo that
# passed the single float 0.8 instead of the intended 0..800000 range.
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

## ***Relationship with categorical features***

In [None]:
# Box plot of 'SalePrice' for each 'OverallQual' value, y-axis clipped to
# 0..800000.
var = 'OverallQual'
data = df_train[['SalePrice', var]].copy()
f, ax = plt.subplots(figsize=(8, 6))
# NOTE: despite its name, `fig` holds whatever sns.boxplot returns; it is drawn
# on the axes created just above.
fig = sns.boxplot(data=data, x=var, y='SalePrice', ax=ax)
fig.set_ylim(0, 800000);

In [None]:
# Box plot of 'SalePrice' for each 'YearBuilt' value on a wide canvas, with
# the x tick labels rotated so they stay legible.
var = 'YearBuilt'
data = df_train[['SalePrice', var]].copy()
f, ax = plt.subplots(figsize=(16, 8))
# NOTE: despite its name, `fig` holds whatever sns.boxplot returns; it is drawn
# on the axes created just above.
fig = sns.boxplot(data=data, x=var, y='SalePrice', ax=ax)
fig.set_ylim(0, 800000);
plt.xticks(rotation=90);

## *Correlation matrix (heatmap style)*

In [None]:
# Pairwise correlation matrix of the numeric columns, drawn as a heatmap.
# `numeric_only=True` is needed on pandas >= 2.0, where DataFrame.corr() no
# longer drops non-numeric columns silently and raises on them instead.
# NOTE(review): the keyword was only added to corr() in pandas 1.5 - confirm
# the runtime is at least that recent.
corrmat = df_train.corr(numeric_only=True)
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

# 'SalePrice' correlation matrix (zoomed heatmap style)

In [None]:
# Zoomed heatmap restricted to the k columns most correlated with 'SalePrice'
# (taken from the `corrmat` computed earlier in the notebook).
k = 10  # number of variables shown in the heatmap
cols = corrmat['SalePrice'].nlargest(k).index
# rowvar=False treats each column as a variable, so no transpose is needed.
cm = np.corrcoef(df_train[cols].values, rowvar=False)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

## **'SalePrice' and correlated variables**

In [None]:
# Pairwise scatter plots of 'SalePrice' and a hand-picked set of columns.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# seaborn renamed pairplot's `size` keyword to `height`; current releases
# reject `size`, so the facet height has to be passed as `height`.
sns.pairplot(df_train[cols], height=2.5)
plt.show();

# Missing data

In [None]:
# Missing-value overview: per-column count and fraction of null entries,
# listed from the most to the least affected column.
null_counts = df_train.isnull().sum()
total = null_counts.sort_values(ascending=False)
# Every column has one entry per row, so the denominator is the row count.
percent = (null_counts / len(df_train)).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
# Remove every column that has more than one missing value.
has_many_missing = missing_data['Total'] > 1
columns_to_drop = missing_data.loc[has_many_missing].index
df_train = df_train.drop(columns_to_drop, axis=1)

# Columns with at most one missing value survive the drop above; for
# 'Electrical' the affected rows are removed instead.
df_train = df_train[df_train['Electrical'].notna()]

# Largest per-column count of missing values still left in the frame.
print(df_train.isnull().sum().max())

In [None]:
# Standardise 'SalePrice' and print the ten lowest and ten highest scaled
# values to eyeball the tails of the distribution.
# StandardScaler expects a 2-D array. Indexing a Series with
# `[:, np.newaxis]` is no longer supported by pandas, so the column is turned
# into a NumPy array first and reshaped to a single-column matrix.
saleprice_scaled = StandardScaler().fit_transform(
    df_train['SalePrice'].to_numpy().reshape(-1, 1)
)
# Sort once and slice both tails from the same sorted array.
saleprice_sorted = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = saleprice_sorted[:10]
high_range = saleprice_sorted[-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)


# Bivariate analysis

In [None]:
# Bivariate view: 'SalePrice' against 'GrLivArea', y-axis clipped to
# 0..800000.
var = 'GrLivArea'
data = df_train.loc[:, ['SalePrice', var]].copy()
data.plot(kind='scatter', x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# Deleting points.
# Capture the two rows with the largest 'GrLivArea' before anything is
# dropped. The bare `sort_values(...)[:2]` expression used previously was not
# the last statement of the cell, so its result was silently discarded.
largest_grlivarea = df_train.sort_values(by='GrLivArea', ascending=False)[:2]

# Remove the rows with Id 1299 and 524 in a single pass.
# NOTE(review): the hard-coded Ids are presumably the two rows captured above -
# confirm against the rendered table.
df_train = df_train.drop(df_train[df_train['Id'].isin([1299, 524])].index)

# Rendered because it is the cell's last expression.
largest_grlivarea

In [None]:
# Bivariate analysis: 'SalePrice' against 'TotalBsmtSF', y-axis clipped to
# 0..800000.
var = 'TotalBsmtSF'
data = df_train.loc[:, ['SalePrice', var]].copy()
data.plot(kind='scatter', x=var, y='SalePrice', ylim=(0, 800000));

# *In the search for normality*

In [None]:
# Histogram of 'SalePrice' with a fitted normal curve, followed by a normal
# probability plot drawn on a fresh figure.
sale_price = df_train['SalePrice']
sns.distplot(sale_price, fit=norm);
fig = plt.figure()
res = stats.probplot(sale_price, plot=plt)

In [None]:
# Replace 'SalePrice' with its natural logarithm.
# Run this cell once only: executing it again takes the log of the already
# transformed values.
df_train = df_train.assign(SalePrice=np.log(df_train['SalePrice']))

In [None]:
# Same two diagnostics as before the transformation: histogram of
# 'SalePrice' with a fitted normal curve, then a normal probability plot on a
# fresh figure.
sale_price = df_train['SalePrice']
sns.distplot(sale_price, fit=norm);
fig = plt.figure()
res = stats.probplot(sale_price, plot=plt)

In [None]:
# Histogram of 'GrLivArea' with a fitted normal curve, followed by a normal
# probability plot drawn on a fresh figure.
gr_liv_area = df_train['GrLivArea']
sns.distplot(gr_liv_area, fit=norm);
fig = plt.figure()
res = stats.probplot(gr_liv_area, plot=plt)

In [None]:
# Data transformation: replace 'GrLivArea' with its natural logarithm.
# Run this cell once only: executing it again takes the log of the already
# transformed values.
df_train = df_train.assign(GrLivArea=np.log(df_train['GrLivArea']))

In [None]:
# Transformed histogram and normal probability plot of 'GrLivArea'.
# This cell used to repeat the log transformation from the previous cell,
# which left the column holding log(log(x)). The transform is applied once
# above; here the result is only inspected, mirroring the 'SalePrice' section.
sns.distplot(df_train['GrLivArea'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)

In [None]:
# Histogram of 'TotalBsmtSF' with a fitted normal curve, followed by a normal
# probability plot drawn on a fresh figure.
total_bsmt_sf = df_train['TotalBsmtSF']
sns.distplot(total_bsmt_sf, fit=norm);
fig = plt.figure()
res = stats.probplot(total_bsmt_sf, plot=plt)

In [None]:
# Create the binary indicator column 'HasBsmt' (one column is enough because
# it is a binary categorical feature): 1 where 'TotalBsmtSF' > 0, else 0.
# Built in a single vectorised step; the earlier placeholder Series was
# overwritten on the very next line and served no purpose.
df_train['HasBsmt'] = (df_train['TotalBsmtSF'] > 0).astype('int64')

In [None]:
# Transform data: log-transform 'TotalBsmtSF' only for rows flagged by
# 'HasBsmt'; rows with HasBsmt == 0 keep their original value.
# Run this cell once only: executing it again takes the log of the already
# transformed values.
has_bsmt = df_train['HasBsmt'] == 1
# Cast first so float results are not written into a non-float column.
# NOTE(review): assumes the column may be stored as integers - harmless if it
# is already float.
df_train['TotalBsmtSF'] = df_train['TotalBsmtSF'].astype(float)
# Take the log of the selected rows only, instead of the whole column
# (which would evaluate log(0) for the unselected rows).
df_train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(df_train.loc[has_bsmt, 'TotalBsmtSF'])

In [None]:
# Histogram and normal probability plot of the transformed 'TotalBsmtSF'.
# This cell used to repeat the log transformation from the previous cell,
# which left the selected rows holding log(log(x)). The transform is applied
# once above; here the result is only inspected.
# Only rows flagged by 'HasBsmt' are plotted, because the other rows were
# left untransformed.
bsmt_area = df_train.loc[df_train['HasBsmt'] == 1, 'TotalBsmtSF']
sns.distplot(bsmt_area, fit=norm);
fig = plt.figure()
res = stats.probplot(bsmt_area, plot=plt)

In [None]:
# Print how many 'TotalBsmtSF' entries are null at this point in the notebook.
print(df_train['TotalBsmtSF'].isnull().sum())

In [None]:
# Scatter plot of 'SalePrice' (y) against 'GrLivArea' (x), using the values
# currently stored in df_train.
plt.scatter(x=df_train['GrLivArea'], y=df_train['SalePrice']);

# dummy variables

In [None]:
# Convert categorical variables into dummy (indicator) columns; the result
# replaces df_train.
df_train = pd.get_dummies(df_train)