# Getting Information about the Dataset!

In [1]:
# importing libraries
import pandas as pd
import numpy as np

In [2]:
# importing iris dataset
 
# Reading the CSV file
iris = pd.read_csv("../data/iris_csv.csv")
iris

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


#### We will use the shape parameter to get the shape of the dataset:

In [3]:
iris.shape # output: (row, column)

(150, 5)

We can see that the dataframe contains 5 columns and 150 rows.

#### Now, let’s also see the columns, their data types and if any column is null. For this, we will use the info() method.

In [4]:
# basic information regarding the dataset
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sepallength  150 non-null    float64
 1   sepalwidth   150 non-null    float64
 2   petallength  150 non-null    float64
 3   petalwidth   150 non-null    float64
 4   class        150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
# We can see that only one column has categorical data
# and all the other columns are of the numeric type with non-Null entries.

In [6]:
# Let’s get a quick statistical summary of the dataset using the describe() method.

The describe() function applies basic statistical computations on the dataset like extreme values, count of data points standard deviation, etc. Any missing value or NaN value is automatically skipped. describe() function gives a good picture of the distribution of data.

In [7]:
iris.describe()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


#### Task 1:
Import melbourne housing dataset and get information about that dataset.

Double-click <b>here</b> for the solution.

<!-- Soltuion is below:


# Solution:

# Reading the CSV file
melb_data = pd.read_csv("../data/melbourne_housing.csv")

melb_data.shape

melb_data.info()

melb_data.describe()

-->

# Missing and Duplicate Values!

We will check if our data contains any missing values or not. Missing values can occur when no information is provided for one or more items or for a whole unit. We will use the isnull() method.

Resources:

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html


##### Let’s see if our dataset contains any duplicates or not. Pandas duplicated method helps to check if we have any duplicate rows in the data.

In [72]:
# duplicated() -> keep = first (default)
dup = iris[iris.duplicated(keep= False)]
print(dup)

     sepallength  sepalwidth  petallength  petalwidth           class
9            4.9         3.1          1.5         0.1     Iris-setosa
34           4.9         3.1          1.5         0.1     Iris-setosa
37           4.9         3.1          1.5         0.1     Iris-setosa
101          5.8         2.7          5.1         1.9  Iris-virginica
142          5.8         2.7          5.1         1.9  Iris-virginica


In [58]:
iris.head(38)

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


##### If you have duplicates in your data, drop_duplicates() method helps in removing duplicates from the data frame.

In [77]:
# drop_duplicates() -> keep = first (default)
iris.drop_duplicates(inplace = True)
iris.reset_index(inplace = True)
iris.head(38)

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [64]:
iris.head(143)

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
140,6.7,3.1,5.6,2.4,Iris-virginica
141,6.9,3.1,5.1,2.3,Iris-virginica
143,6.8,3.2,5.9,2.3,Iris-virginica
144,6.7,3.3,5.7,2.5,Iris-virginica


In [74]:
iris.shape

(147, 5)

##### Filling Null Values with MEDIAN / MEAN / MODE

In [78]:
iris.isnull().sum()

sepallength    0
sepalwidth     0
petallength    0
petalwidth     0
class          0
dtype: int64

We can see that no column has any missing value.

# Null:
##### if 60% or more missing values -> Delete that Column: 
##### if 60% or more missing values -> Delete row
##### if less than 60% missing values -> Fill that missing value

In [81]:
data = {"Name": ["Joe", "Harry", "Ben"], "Age": [35, 60, np.nan], "Weight": [70, np.nan, 49],
       "Height": [np.nan, 175, 180]}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Age,Weight,Height
0,Joe,35.0,70.0,
1,Harry,60.0,,175.0
2,Ben,,49.0,180.0


In [82]:
df.isnull().sum()

Name      0
Age       1
Weight    1
Height    1
dtype: int64

In [15]:
#Finding the mean of the column having NaN
mean_value=df['Age'].mean()
  
# Replace NaNs in column S2 with the
# mean of values in the same column
df['Age'].fillna(value=mean_value, inplace=True)

In [16]:
df.head()

Unnamed: 0,Name,Age,Weight,Height
0,Joe,35.0,70.0,
1,Harry,60.0,,175.0
2,Ben,47.5,49.0,180.0


In [17]:
#Finding the mean of the column having NaN
mode_value=df['Weight'].mode()
  
# Replace NaNs in column S2 with the
# mean of values in the same column
df['Weight'].fillna(value=mode_value, inplace=True)

In [18]:
df.head()

Unnamed: 0,Name,Age,Weight,Height
0,Joe,35.0,70.0,
1,Harry,60.0,70.0,175.0
2,Ben,47.5,49.0,180.0


In [19]:
#Finding the mean of the column having NaN
median_value=df['Height'].median()
  
# Replace NaNs in column S2 with the
# mean of values in the same column
df['Height'].fillna(value=median_value, inplace=True)

In [20]:
df.head()

Unnamed: 0,Name,Age,Weight,Height
0,Joe,35.0,70.0,177.5
1,Harry,60.0,70.0,175.0
2,Ben,47.5,49.0,180.0


#### Task 2:


Import the melbourne_housing. Check if this data have any null values (if yes, fill) and duplicates (if yes, remove). Leave the column CouncilArea, we will see in the next task.

In [21]:
# Reading the CSV file
melb_data = pd.read_csv("../data/melbourne_housing.csv")
melb_data.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

Double-click <b>here</b> for the solution.

<!-- Soltuion is below:


# Solution:

# Reading the CSV file
melb_data = pd.read_csv("../data/melbourne_housing.csv")

# Checking Null values:
melb_data.isnull().sum()

# Checking number of rows and columns (shape)
melb_data.shape

dup = melb_data[melb_data.duplicated()]
print(dup)


# Replace NaNs in column Car with the mean of values as there are only 62 missing values
# Using backward fill to fill missing values
melb_data['Car'].fillna(method ='bfill', inplace=True)

# or:
# ffill and pad are synonyms for the same thing - forward filling
# melb_data['Car'].fillna(method ='pad', inplace=True)

melb_data.isnull().sum()


# We have almost 48% missing values in column BuildingArea
# Finding the mean of the column having NaN
mean_value2 = melb_data['BuildingArea'].mean()

# Replace NaNs in column BuildingArea with the mean of values in the same column
melb_data['BuildingArea'].fillna(value=mean_value2, inplace=True)
melb_data.isnull().sum()


# We have almost 40% missing values in column YearBuilt
#Finding the mean of the column having NaN
mean_value2 = melb_data['YearBuilt'].mean()
  
# Replace NaNs in column YearBuilt with the
# mean of values in the same column
melb_data['YearBuilt'].fillna(value=mean_value2, inplace=True)
melb_data.isnull().sum()

-->

# Columns

##### We will use the df.value_counts() function to return a Series containing counts of unique values. 

In [22]:
# iris["class"]
iris.value_counts("class")

class
Iris-versicolor    50
Iris-virginica     49
Iris-setosa        48
dtype: int64

Creating new column into existing dataframe

In [23]:
df.head()

Unnamed: 0,Name,Age,Weight,Height
0,Joe,35.0,70.0,177.5
1,Harry,60.0,70.0,175.0
2,Ben,47.5,49.0,180.0


In [24]:
df["BMI"] = df["Weight"] / df["Height"]
df.head()

Unnamed: 0,Name,Age,Weight,Height,BMI
0,Joe,35.0,70.0,177.5,0.394366
1,Harry,60.0,70.0,175.0,0.4
2,Ben,47.5,49.0,180.0,0.272222


##### Pandas.DataFrame().unique() method is used to return all unique elements of a column.

In [25]:
iris["class"].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

#### Task 3

Import melbourne_housing dataset. Count values in Rooms column. 
Return uniques elemennts of SellerG column. 
Create a new column called "Value" using the following formula: Rooms * Price.

Double-click <b>here</b> for the solution.

<!-- Soltuion is below:


# Solution:

# Reading the CSV file
melb_data = pd.read_csv("../data/melbourne_housing.csv")

melb_data.value_counts("Rooms")

melb_data["SellerG"].unique()

melb_data["Value"] = melb_data["Rooms"] * melb_data["Price"]

-->

# Sorting dataframe

In [25]:
information = {
  "age": [50, 40, 30, 40, 20, 10, 30],
  "qualified": [True, False, False, False, False, True, True]
}
other_df = pd.DataFrame(information)

In [26]:
other_df.head()

Unnamed: 0,age,qualified
0,50,True
1,40,False
2,30,False
3,40,False
4,20,False


In [27]:
sorted_df = other_df.sort_values(by='age') # default=ascending
sorted_df.head()

Unnamed: 0,age,qualified
5,10,True
4,20,False
2,30,False
6,30,True
1,40,False


In [37]:
sorted_df2 = other_df.sort_values(by='age',ascending = False) # descending

# if you want to allign all the index values in order, uncomment the below two lines:
#sorted_df2.reset_index(inplace = True)
#sorted_df2.drop("index", axis=1, inplace=True)

sorted_df2.head()

Unnamed: 0,age,qualified
0,50,True
1,40,False
3,40,False
2,30,False
6,30,True


# Convert a dataframe to csv:

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html

In [36]:
# when index = True, you will get index in the CSV file
# when index = False, you will not get the index in the CSV file
sorted_df2.to_csv('../data/output.csv', index=False)

#### Task 4:

Import melbourne_housing dataset. Sort Rooms in descending order and save the sorted dataframe into a csv file.

Double-click <b>here</b> for the solution.

<!-- Soltuion is below:


# Solution:

# Reading the CSV file
melb_data = pd.read_csv("../data/melbourne_housing.csv")

sorted_melb = melb_data.sort_values(by='Rooms',ascending = False)
sorted_melb.head()

-->

# Deleting data from Dataframe

Dropping Rows by index label

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html

In [30]:
iris.drop(labels=[0,2,4],inplace=True) # default: delete a row (axis=0)
# display
iris.head()

Unnamed: 0,index,sepallength,sepalwidth,petallength,petalwidth,class
1,1,4.9,3.0,1.4,0.2,Iris-setosa
3,3,4.6,3.1,1.5,0.2,Iris-setosa
5,5,5.4,3.9,1.7,0.4,Iris-setosa
6,6,4.6,3.4,1.4,0.3,Iris-setosa
7,7,5.0,3.4,1.5,0.2,Iris-setosa


Dropping Columns by column label

In [31]:
iris.drop(['sepalwidth','petallength'], axis=1, inplace=True) # delete a column (axis=1)
# display
iris.head()

Unnamed: 0,index,sepallength,petalwidth,class
1,1,4.9,0.2,Iris-setosa
3,3,4.6,0.2,Iris-setosa
5,5,5.4,0.4,Iris-setosa
6,6,4.6,0.3,Iris-setosa
7,7,5.0,0.2,Iris-setosa


#### Task 5

Import melbourne_housing dataset. Delete Value and CouncilArea columns from the data.

Double-click <b>here</b> for the solution.

<!-- Soltuion is below:


# Solution:

melb_data.drop(['CouncilArea','Value'], axis=1, inplace=True)
# display
melb_data.head()

-->

# Pandas GroupBy

In [39]:
# importing iris dataset again to retain all columns
 
# Reading the CSV file
iris = pd.read_csv("../data/iris_csv.csv")
iris

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [33]:
iris.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [40]:
grouped1 = iris['sepallength'].groupby(iris['class'])
grouped1

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001AEA6983100>

In [41]:
grouped1.agg('mean')

class
Iris-setosa        5.006
Iris-versicolor    5.936
Iris-virginica     6.588
Name: sepallength, dtype: float64

In [42]:
grouped1.agg('min')

class
Iris-setosa        4.3
Iris-versicolor    4.9
Iris-virginica     4.9
Name: sepallength, dtype: float64

In [37]:
grouped2 = iris[['sepallength']].groupby(iris['class'])
grouped2

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EC41BBCA60>

In [38]:
grouped2.agg('sum')

Unnamed: 0_level_0,sepallength
class,Unnamed: 1_level_1
Iris-setosa,250.3
Iris-versicolor,296.8
Iris-virginica,329.4


In [39]:
grouped2.agg(['mean', 'std'])

Unnamed: 0_level_0,sepallength,sepallength
Unnamed: 0_level_1,mean,std
class,Unnamed: 1_level_2,Unnamed: 2_level_2
Iris-setosa,5.006,0.35249
Iris-versicolor,5.936,0.516171
Iris-virginica,6.588,0.63588


In [40]:
grouped3 = iris[['sepallength','petallength','sepalwidth','petalwidth']].groupby(iris['class'])
grouped3

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EC41BC8D00>

In [41]:
grouped3.agg('median')

Unnamed: 0_level_0,sepallength,petallength,sepalwidth,petalwidth
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.0,1.5,3.4,0.2
Iris-versicolor,5.9,4.35,2.8,1.3
Iris-virginica,6.5,5.55,3.0,2.0


In [42]:
def peak_to_peak(df):
    return df.max() - df.min() # range = max - min

In [43]:
grouped3.agg(peak_to_peak)

Unnamed: 0_level_0,sepallength,petallength,sepalwidth,petalwidth
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,1.5,0.9,2.1,0.5
Iris-versicolor,2.1,2.1,1.4,0.8
Iris-virginica,3.0,2.4,1.6,1.1


#### Task 6:
Import the finance data and group Units Sold and Sale Price by product. Aggregating data using sum.

Output:
![task.PNG](attachment:task.PNG)

Double-click <b>here</b> for the solution.

<!-- Soltuion is below:


# Solution:

# Reading the CSV file
finance = pd.read_excel("../data/Financial_Sample.xlsx")

# Grouping data
grouped_finance = finance[['Units Sold','Sale Price']].groupby(finance['Product'])

# Defining function
def total(df):
    return df.sum()

# Aggregating data using sum
grouped_finance.agg(total)

-->