### Importing the required libraries and the dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer register the old names, so use whichever exists.
_bright_style = (
    'seaborn-v0_8-bright'
    if 'seaborn-v0_8-bright' in plt.style.available
    else 'seaborn-bright'
)
plt.style.use([_bright_style, 'dark_background'])

In [None]:
# Load the housing CSV into a DataFrame and display its first rows.
data = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')
data.head()

In [None]:
# Count the missing (null) entries in each column.
data.isnull().sum()

### Computing the missing value percentage in data.

In [None]:
# Print, for each column, the share of null entries as a percentage.
for column_name in data.columns:
    missing_fraction = data[column_name].isnull().mean()
    print(f"{column_name} = {missing_fraction * 100}%")

## Treating the missing values
#### 1. Dropping entire columns: When most of a column's values are missing, we drop that column; otherwise we choose another method to treat the missing values.
#### 2. Replacing the values: Sometimes we can replace the missing values as our requirements dictate, e.g. with a numeric value for a numeric feature and with a string for a categorical feature.
#### 3. Imputing: The missing values can be imputed with the mean or median value for a numeric feature and with the most frequent value (the mode) for a categorical feature.
#### 4. Dropping the rows: If the percentage of missing values is very small compared with the total amount of data, we can drop the rows that contain missing values, as we do in this case. Here the total_bedrooms feature has only about 1% of its data missing, so we drop the rows with missing values.

In [None]:
# Drop every row that contains a missing value.
data = data.dropna(axis=0)

## Identify outliers
#### Using the describe method or plots such as a histogram, heatmap, scatter plot or box plot, we can identify outliers.

In [None]:
# Box plot of longitude, used to look for outliers.
plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name raises a TypeError there.
sns.boxplot(x='longitude',data = data,palette="Blues")
plt.title('Longitude',fontsize=15)
plt.show()

In [None]:
# Box plot of latitude, used to look for outliers.
plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name raises a TypeError there.
sns.boxplot(x='latitude',data = data,palette="plasma")
plt.title('Latitude',fontsize=15)
plt.show()

In [None]:
# Box plot of housing_median_age, used to look for outliers.
plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name raises a TypeError there.
sns.boxplot(x='housing_median_age',data = data,palette="hsv")
plt.title('Housing Median Age',fontsize=15)
plt.show()

In [None]:
# Box plot of total_rooms, used to look for outliers.
plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name raises a TypeError there.
sns.boxplot(x='total_rooms',data = data,palette="flag")
plt.title('Total Rooms',fontsize=15)
plt.show()

In [None]:
# Box plot of total_bedrooms, used to look for outliers.
plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name raises a TypeError there.
sns.boxplot(x='total_bedrooms',data = data,palette="YlGn")
plt.title('Total Bedrooms',fontsize=15)
plt.show()

In [None]:
# Box plot of population, used to look for outliers.
plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name raises a TypeError there.
sns.boxplot(x='population',data = data,palette="PuOr")
plt.title('Population',fontsize=15)
plt.show()

In [None]:
# Box plot of households, used to look for outliers.
plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name raises a TypeError there.
sns.boxplot(x='households',data = data,palette="Purples")
plt.title('Households',fontsize=15)
plt.show()

In [None]:
# Box plot of median_income, used to look for outliers.
plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name raises a TypeError there.
sns.boxplot(x='median_income',data = data,palette="Reds")
plt.title('Median Income',fontsize=15)
plt.show()

In [None]:
# Box plot of median_house_value, used to look for outliers.
plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name raises a TypeError there.
sns.boxplot(x='median_house_value',data = data,palette="bone")
plt.title('Median House Value',fontsize=15)
plt.show()

In [None]:
# Summary statistics before the outliers are treated.
data.describe()

## Treating Outliers.
#### As with missing values, we can drop the affected rows or columns, or replace the outliers with the mean or median value.
#### We can drop the rows whose value is below the lower whisker or above the upper whisker.
#### Here we instead replace the outliers with the lower and upper whisker values. The whiskers are calculated from the inter-quartile range (IQR): IQR = 3rd quartile - 1st quartile, lower whisker = 1st quartile - 1.5 × IQR, and upper whisker = 3rd quartile + 1.5 × IQR.

In [None]:
def IQR(col, frame=None):
    """Return the inter-quartile range and the quartiles of one column.

    Parameters
    ----------
    col : label of the column to summarise.
    frame : optional DataFrame to read the column from.  When omitted,
        the notebook-level ``data`` frame is used, so existing
        ``IQR(col)`` calls behave as before.

    Returns
    -------
    tuple ``(iqr, q1, q3)`` where ``q1`` and ``q3`` are the 0.25 and 0.75
    quantiles of the column and ``iqr = q3 - q1``.
    """
    source = data if frame is None else frame
    q1 = source[col].quantile(0.25)
    q3 = source[col].quantile(0.75)
    return q3 - q1, q1, q3

In [None]:
def whisker(col):
    """Return the box-plot whisker limits for a column.

    The limits lie 1.5 inter-quartile ranges outside the quartiles
    reported by ``IQR(col)``.

    Returns
    -------
    tuple ``(lower_whisker, upper_whisker)``.
    """
    spread, first_quartile, third_quartile = IQR(col)
    reach = 1.5*spread
    return first_quartile - reach, third_quartile + reach

In [None]:
# Print the whisker limits of every numeric feature.
# Numeric columns are selected explicitly: a `dtype != "object"` test also
# lets string/category dtypes through, and quantiles cannot be taken on those.
for col in data.select_dtypes(include="number").columns:
    lw, uw = whisker(col)
    print("Feature:-{} Lower:-{} Upper:-{}".format(col, lw, uw))

In [None]:
def treat_outliers(value):
    """Cap one column of ``data`` at its whisker limits, in place.

    Parameters
    ----------
    value : label of the column to treat.  The parameter name is kept for
        backward compatibility; it is a column label, not a cell value.

    Entries below the lower whisker are raised to it and entries above the
    upper whisker are lowered to it; all other entries are unchanged.
    Returns None.
    """
    # Work from the argument.  The body used to ignore it and read the
    # notebook-level loop variable `col`, so it only worked when called
    # from a loop that happened to use that exact name.
    lower_limit, upper_limit = whisker(value)
    data[value] = data[value].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Cap the outliers of every numeric feature.
# Numeric columns are selected explicitly: a `dtype != "object"` test also
# lets string/category dtypes through, and quantiles cannot be taken on those.
for col in data.select_dtypes(include="number").columns:
    treat_outliers(col)

In [None]:
# Summary statistics again, after the outlier treatment.
data.describe()

## Columns with repetitive values.
#### Whenever a column consists mostly of one repeated value, or contains a value that is very rare compared with the others, we can drop the entire column.

In [None]:
# Number of distinct values in each column.
data.nunique()

#### Here all features other than ocean_proximity are numeric. ocean_proximity has 5 unique values, of which ISLAND occurs in only 5 rows, so dropping the rows with ISLAND doesn't impact our dataset much.

In [None]:
# Frequency of each ocean_proximity category.
data['ocean_proximity'].value_counts()

In [None]:
# Remove, in place, every row whose ocean_proximity is "ISLAND".
island_index = data.index[data['ocean_proximity'] == "ISLAND"]
data.drop(index=island_index, inplace=True)

In [None]:
# (rows, columns) after dropping the ISLAND rows.
data.shape

## Non relatable data.
#### Sometimes our data set contains a feature that does not affect the analysis or is not important or informative; we can drop such a column.

## Duplicate values
#### Our data set may contain rows with exactly the same values for all features; we can drop those rows to reduce the dataset size.

In [None]:
# (rows, columns) before duplicate removal, for comparison.
data.shape

In [None]:
# Build a de-duplicated copy of the frame; `data` itself is not modified.
data_remove_duplicates = data.drop_duplicates()

In [None]:
# (rows, columns) of the de-duplicated frame.
data_remove_duplicates.shape

## Inconsistent categorical values
#### Sometimes our data contains inconsistent strings such as iSlAnD, or dates and times such as 12-03-12. We can treat these values by converting categorical values to lower case and by extracting the date, day, month, time, etc. from date data.

#### Now converting the data in ocean proximity to lower case

In [None]:
# Convert the ocean_proximity labels to lower case.
data['ocean_proximity'] = data['ocean_proximity'].str.lower()

In [None]:
# Check the category labels after lower-casing.
data['ocean_proximity'].value_counts()

#### Our data may contain spelling mistakes or white space in categorical values, or measurement units in numeric values, so we can replace the white space or units and correct the spelling of categorical values.

In [None]:
# Replace the spaces in the ocean_proximity labels with underscores.
data['ocean_proximity'] = data['ocean_proximity'].str.replace(" ","_")

In [None]:
# Check the category labels after the space replacement.
data['ocean_proximity'].value_counts()