In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Reading Data

In [None]:
# Path to the CSV file, relative to the notebook's working directory.
file_path = '../input/datacsv/data.csv'

# Parse the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

In [None]:
# Preview the last ten rows of the DataFrame.
df.iloc[-10:]

### Information

In [None]:
# Print pandas' summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

### Checking NaN

In [None]:
# Number of missing (NaN) values in each column.
df.isna().sum()

### Checking Duplicates

In [None]:
# Flag every row that is an exact repeat of an earlier row.
duplicate_mask = df.duplicated()

# A notebook cell only renders its last expression, so the count has to be
# printed explicitly; otherwise it is computed and silently discarded.
print(f"Number of duplicated rows: {duplicate_mask.sum()}")

# Show the duplicated rows themselves.
df[duplicate_mask]

### Count

In [None]:
# Number of non-missing values in each column.
df.count()

### Description

In [None]:
# Descriptive statistics for the DataFrame's columns.
df.describe()

### Data Cleaning
#### 1) Drop Duplicates

In [None]:
# Keep the first occurrence of every row and discard exact repeats.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable;
# re-running the cell still yields the same result.
df = df.drop_duplicates()

# Sanity check: no duplicated rows may remain after the clean-up.
assert not df.duplicated().any(), "duplicated rows are still present"

# Number of duplicated rows left (expected: 0).
df.duplicated().sum()

#### 2) Handle Missing Values
There are 3 different ways to do that:
1. Imputation: replacing missing values with the mean, mode, or median, or using an ML algorithm to predict the most appropriate value based on the distribution
2. Removing rows that contain empty cells
3. Dropping columns that contain empty cells

In [None]:
# Per-column flag: True when the column holds at least one missing value.
has_missing = df.isna().any()

# Names of the flagged columns, as a plain Python list.
nan_columns = has_missing[has_missing].index.tolist()
nan_columns

In [None]:
# Imputation: fill every gap with the median of its own column.
column_medians = df[nan_columns].median()

# Work on a copy so the original DataFrame keeps its missing values.
no_nan_df1 = df.copy()
no_nan_df1[nan_columns] = no_nan_df1[nan_columns].fillna(column_medians)

# Remaining missing values per column after imputation.
no_nan_df1.isna().sum()

In [None]:
# Row removal: discard every row that has at least one missing value.
no_nan_df2 = df.dropna(axis=0, how='any')

# Remaining missing values per column after dropping rows.
no_nan_df2.isna().sum()

In [None]:
# Column removal: discard every column listed in nan_columns.
no_nan_df3 = df.drop(columns=nan_columns)

# Remaining missing values per column after dropping columns.
no_nan_df3.isna().sum()

### Correlation
The heatmap below masks the upper triangle of the correlation matrix, so only the lower triangle (including the diagonal) is shown, which makes it easier to read.

In [None]:
# Pairwise correlation between the columns of df.
corr = df.corr()

# Mask for the heatmap: a non-zero entry hides the cell. Only the entries
# strictly above the diagonal are kept as ones, so the lower triangle and
# the diagonal remain visible.
corr_mask = np.triu(np.ones_like(corr), k=1)

# Draw the annotated heatmap on an explicitly created 5x5-inch figure.
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(corr, mask=corr_mask, square=True, annot=True, ax=ax)
plt.show()

### Visualization

In [None]:
# Kernel density estimate of every column, one subplot per column,
# stacked in a grid of 4 rows by 1 column.
df.plot.kde(subplots=True, layout=(4, 1))

In [None]:
# Histogram of every column, one subplot per column, in a 2x2 grid.
df.plot.hist(subplots=True, layout=(2, 2))

In [None]:
# Box plot of every column, one subplot per column, in a 2x2 grid.
df.plot.box(subplots=True, layout=(2, 2))

In [None]:
"""
    Scatter plot of `Pulse` (X-axis) against `Maxpulse` (Y-axis);
    marker color encodes `Calories` and marker size encodes `Duration`.
"""

df.plot(
    kind='scatter',
    x='Pulse',
    y='Maxpulse',
    s='Duration',
    c='Calories',
    colormap='Blues',
    title="Pulse VS Maxpulse VS Duration VS Calories",
)

In [None]:
# Matrix of pairwise scatter plots for all columns, with a kernel density
# estimate on the diagonal; the trailing semicolon hides the returned repr.
pd.plotting.scatter_matrix(frame=df, diagonal="kde", figsize=(10, 10));