<a href="https://colab.research.google.com/github/sandipanpaul21/ML-Code-in-Python/blob/master/01_About_the_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Libraries
from sklearn import datasets
import pandas as pd

In [2]:
# Pull the Boston house-prices Bunch that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the environment pins an older release before re-running this cell.
boston_dataset = datasets.load_boston()

# Show the long-form description text stored under the "DESCR" key
print(boston_dataset["DESCR"])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [3]:
# Inspect which container class the loader returned
boston_container_type = type(boston_dataset)
print("Dataset Type : ", boston_container_type)

Dataset Type :  <class 'sklearn.utils.Bunch'>


In [4]:
# The loaded object behaves like a dictionary: each key maps to one part of
# the dataset. List the available keys via keys().
boston_keys = boston_dataset.keys()
print("Dataset is in Dictionary form, so the key present are as follow \n", boston_keys)

Dataset is in Dictionary form, so the key present are as follow 
 dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


In [5]:
# Report the dimensions of the feature matrix and the target vector.
# NumPy's .shape is ordered (rows, columns), so the labels name rows first;
# the target is one-dimensional, so its shape carries the row count only.
print("INDEPENDENT COLUMNS : Number of Rows and Columns are ", boston_dataset.data.shape)
print("DEPENDENT COLUMN : Number of Rows is ", boston_dataset.target.shape)

INDEPENDENT COLUMNS : Number of Columns and Rows are  (506, 13)
DEPENDENT COLUMNS : Number of Column and Rows are  (506,)


In [6]:
# List the names of the independent (feature) columns
boston_feature_names = boston_dataset["feature_names"]
print("Indepedent columns, feature names are : \n", boston_feature_names)

Indepedent columns, feature names are : 
 ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [7]:
# Report the container class before the conversion
print("Present Dataset Type : ", type(boston_dataset))

# Converting to DataFrame: wrap the raw feature matrix in a pandas DataFrame.
# No column labels are supplied here, so pandas numbers the columns.
print("\nConverting to Dataframe.. \n")
boston_pd = pd.DataFrame(data=boston_dataset.data)
print("Converted Dataset Type : ", type(boston_pd))

Present Dataset Type :  <class 'sklearn.utils.Bunch'>

Converting to Dataframe.. 

Converted Dataset Type :  <class 'pandas.core.frame.DataFrame'>


In [8]:
# Preview the first 5 rows; the columns still carry positional integer labels
print("Column names are missing !! \n", boston_pd.head(5))

Column names are missing !! 
         0     1     2    3      4   ...   8      9     10      11    12
0  0.00632  18.0  2.31  0.0  0.538  ...  1.0  296.0  15.3  396.90  4.98
1  0.02731   0.0  7.07  0.0  0.469  ...  2.0  242.0  17.8  396.90  9.14
2  0.02729   0.0  7.07  0.0  0.469  ...  2.0  242.0  17.8  392.83  4.03
3  0.03237   0.0  2.18  0.0  0.458  ...  3.0  222.0  18.7  394.63  2.94
4  0.06905   0.0  2.18  0.0  0.458  ...  3.0  222.0  18.7  396.90  5.33

[5 rows x 13 columns]


In [9]:
# Replace the positional column labels with the dataset's feature names
boston_pd.columns = boston_dataset["feature_names"]

# Preview again, now with named columns
print("Top 5 Rows of the dataset \n", boston_pd.head(5))

Top 5 Rows of the dataset 
       CRIM    ZN  INDUS  CHAS    NOX  ...  RAD    TAX  PTRATIO       B  LSTAT
0  0.00632  18.0   2.31   0.0  0.538  ...  1.0  296.0     15.3  396.90   4.98
1  0.02731   0.0   7.07   0.0  0.469  ...  2.0  242.0     17.8  396.90   9.14
2  0.02729   0.0   7.07   0.0  0.469  ...  2.0  242.0     17.8  392.83   4.03
3  0.03237   0.0   2.18   0.0  0.458  ...  3.0  222.0     18.7  394.63   2.94
4  0.06905   0.0   2.18   0.0  0.458  ...  3.0  222.0     18.7  396.90   5.33

[5 rows x 13 columns]


In [10]:
# Compare the dimensions of the original feature matrix with the DataFrame.
# .shape is ordered (rows, columns), so the first label names rows first.
print("Number of Rows and Columns in previous dataset : ", boston_dataset.data.shape)
print("Number of Rows in the dataset : ", len(boston_pd))
print("Number of Column in the dataset : ", len(boston_pd.columns))

# Only the features were copied into the DataFrame so far; flag the gap.
print("\nNOTE : Target Column is MISSING !!!")

Number of Columns and Rows in previous dataset :  (506, 13)
Number of Rows in the dataset :  506
Number of Column in the dataset :  13

NOTE : Target Column is MISSING !!!


In [11]:
# Append the dependent variable to the DataFrame under the label "MEDV"
boston_pd["MEDV"] = boston_dataset["target"]

# Confirm the extra column is present and preview the result
boston_column_count = boston_pd.shape[1]
print("Number of columns in dataset : ", boston_column_count)
print("Top 5 Rows of the dataset \n", boston_pd.head(5))

Number of columns in dataset :  14
Top 5 Rows of the dataset 
       CRIM    ZN  INDUS  CHAS    NOX  ...    TAX  PTRATIO       B  LSTAT  MEDV
0  0.00632  18.0   2.31   0.0  0.538  ...  296.0     15.3  396.90   4.98  24.0
1  0.02731   0.0   7.07   0.0  0.469  ...  242.0     17.8  396.90   9.14  21.6
2  0.02729   0.0   7.07   0.0  0.469  ...  242.0     17.8  392.83   4.03  34.7
3  0.03237   0.0   2.18   0.0  0.458  ...  222.0     18.7  394.63   2.94  33.4
4  0.06905   0.0   2.18   0.0  0.458  ...  222.0     18.7  396.90   5.33  36.2

[5 rows x 14 columns]


In [12]:
# IRIS dataset (a classification example), following the same steps as above

# Fetch the bundled dataset
iris_dataset = datasets.load_iris()

# Print the description text that ships with it
print(iris_dataset["DESCR"])

# Build the DataFrame with the feature names as column labels in one step
iris_pd = pd.DataFrame(
    data=iris_dataset.data,
    columns=iris_dataset.feature_names,
)

# Add the dependent column holding the class labels
iris_pd["Class"] = iris_dataset["target"]

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [13]:
# Summarize the IRIS DataFrame: column count, row count, and a short preview.
# head() with no argument returns the first 5 rows, so the label says 5
# (the previous label claimed 6).
print("Column in IRIS Dataframe :",len(iris_pd.columns))
print("Rows in IRIS Dataframe :",len(iris_pd))
print("Top 5 Rows of the IRIS Dataframe \n",iris_pd.head())

Column in IRIS Dataframe : 5
Rows in IRIS Dataframe : 150
Top 6 Rows of the IRIS Dataframe 
    sepal length (cm)  sepal width (cm)  ...  petal width (cm)  Class
0                5.1               3.5  ...               0.2      0
1                4.9               3.0  ...               0.2      0
2                4.7               3.2  ...               0.2      0
3                4.6               3.1  ...               0.2      0
4                5.0               3.6  ...               0.2      0

[5 rows x 5 columns]
