# Exploratory Data Analysis using Python

## Initial look

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Read every CSV used in this notebook into its own DataFrame
unemployment = pd.read_csv('clean_unemployment.csv')
books = pd.read_csv('clean_books.csv')
salaries = pd.read_csv('sd_salaries_clean.csv')
planes = pd.read_csv('planes.csv')

# First five rows, then column dtypes and non-null counts
unemployment.head(n=5)
unemployment.info()

# Categorical column: how often each value occurs
unemployment.value_counts(subset='catg_col')

# Numeric columns: count, mean, std, min, quartiles, max
unemployment.describe()

# Distribution of a numeric column as a histogram with bins of width 1
sns.histplot(
    data=books,
    x="rating",
    binwidth=1,
)

## Data Validation

In [None]:
# Data type
# Cast a column to another dtype, e.g. 'int', 'str', 'float' or 'bool'
df['column'] = df['column'].astype('int')

# Validate categorical data
# Boolean mask: True where the value is one of the allowed categories
df['catg_data'].isin(['first_data', 'second_data'])
# Inverted mask (~): True where the value is NOT one of the allowed categories
~df['catg_data'].isin(['first_data', 'second_data'])

# Validating numeric data
# Keep only the numeric columns (the method is select_dtypes, not select_types)
df.select_dtypes("number")
# Check that the observed range is plausible
df['num_col'].min()
df['num_col'].max()



## Missing Data
### 1. Detect Missing Data

In [None]:
# Cell-by-cell mask: True wherever a value is missing
df.isnull()
# One flag per column: True when the column holds at least one missing value
df.isnull().any()
# Number of missing values in each column
df.isnull().sum()
# Bar chart of the per-column missing counts
df.isnull().sum().plot.bar()

### 2. Dealing with Missing Data

- Drop missing values
    When they make up 5% or less of the total values
- Impute the mean, median, or mode
    The choice depends on the distribution and the context
- Impute by sub-group
    e.g. different experience levels have different median salaries

In [None]:
# Count the missing values in every column
df.isna().sum()

# Drop every row that contains an NA -- fine when only a small, unimportant share is missing.
# dropna() returns a new DataFrame; assign it (df = df.dropna()) to keep the result.
df.dropna()
# Replace every NA with 0 (also returns a new DataFrame)
df.fillna(0)

# Find the columns whose missing values make up at most 5% of all rows
threshold = len(salaries) * 0.05
# The mask is computed on salaries, so it must index salaries.columns as well
cols_to_drop = salaries.columns[salaries.isna().sum() <= threshold]
# Drop the rows that have an NA in any of those columns
salaries.dropna(subset=cols_to_drop, inplace=True)

# Impute a summary statistic (here: the mode) into the columns that still have NAs
cols_with_missing_values = df.columns[df.isna().sum() > 0]
# NOTE(review): [:-1] skips the last such column -- presumably the one imputed
# by sub-group below; confirm against the dataset.
for col in cols_with_missing_values[:-1]:
    # fillna() returns a new Series: assign it back, otherwise the imputation is lost
    df[col] = df[col].fillna(df[col].mode()[0])

# Impute by sub-group: median salary for each experience level
salaries_dict = salaries.groupby("Experience")["Salary_USD"].median().to_dict()
# Output: {'Entry': 55380.0, 'Executive': 135439.0, 'Mid': 74173.5, 'Senior': 128903.0}
# Fill each missing salary with the median of that row's experience level
salaries["Salary_USD"] = salaries["Salary_USD"].fillna(salaries["Experience"].map(salaries_dict))


## Analyzing Categorical Data

In [None]:
# First rows of the text (object-dtype) columns only
salaries.select_dtypes(include='object').head()
# Frequency of each distinct value in the column
salaries['Designation'].value_counts()
# How many distinct values the column holds
salaries['Designation'].nunique()

### 1. Extracting value from categories

In [None]:
# Search a column for a specific string (returns a boolean Series)
salaries["Designation"].str.contains("Scientist")
# Match any of several phrases -- ``|`` means "or" in the pattern
salaries["Designation"].str.contains("Machine Learning|AI")
# Match only values that start with "Data" -- ``^`` anchors to the start
salaries["Designation"].str.contains("^Data")

# Create a category column based on string conditions.
# The labels must line up one-to-one, in the same order, with `conditions` below.
job_categories = ["Data Science", "Data Analytics", "Data Engineering",
                  "Machine Learning", "Managerial", "Consultant"]

# Keyword pattern for each job category
data_science = "Data Scientist|NLP"
data_analyst = "Analyst|Analytics"
data_engineer = "Data Engineer|ETL Architect|Infrastructure"
ml_engineer = "Machine Learning|ML|Big Data|AI"
manager = "Manager|Head|Director|Lead|Principal|Staff"
consultant = "Consultant|Freelance"

# One boolean mask per category; na=False treats a missing designation as
# "no match" so every mask stays strictly boolean, as np.select requires
conditions = [
    salaries["Designation"].str.contains(data_science, na=False),
    salaries["Designation"].str.contains(data_analyst, na=False),
    salaries["Designation"].str.contains(data_engineer, na=False),
    salaries["Designation"].str.contains(ml_engineer, na=False),
    salaries["Designation"].str.contains(manager, na=False),
    salaries["Designation"].str.contains(consultant, na=False),
]

# Create the category column: the first matching condition wins,
# rows that match nothing are labelled "Other"
salaries["Job_Category"] = np.select(
    conditions,
    job_categories,
    default="Other",
)

## Analyzing Numeric Data
Converting string to number 

In [None]:
# Remove the thousands separators; replacing them with a space would make
# the float conversion below fail
salaries['salary_in_rupees'] = salaries['salary_in_rupees'].str.replace(",", "")
# Convert string to number
salaries['salary_in_rupees'] = salaries['salary_in_rupees'].astype(float)
# Convert to USD (0.012 is the rupee -> USD rate used here)
salaries['salary_in_USD'] = salaries['salary_in_rupees'] * 0.012
# Summary statistic per group: mean salary for each company size
salaries.groupby('company_size')['salary_in_USD'].mean()
# Add a group statistic as a new column of the DataFrame
salaries["std_dev"] = salaries.groupby("Experience")["Salary_USD"].transform(lambda x: x.std())
#* transform: operates on each group independently and returns a Series that has the same index as the original DataFrame.
#* lambda x: x.std(): computes the standard deviation of the Salary_USD values (x) for each group.
#* x represents the subset of Salary_USD for each Experience group

# Display the std of each group (one row per Experience / std_dev pair)
salaries[["Experience", "std_dev"]].value_counts()

## Handling Outliers
Why do these outliers exist?
Is the data accurate?

In [None]:
# Spot the outliers: compare min / max with the quartiles
salaries["Salary_USD"].describe()

# Identify thresholds
# 75th percentile
seventy_fifth = salaries["Salary_USD"].quantile(0.75)
# 25th percentile
twenty_fifth = salaries["Salary_USD"].quantile(0.25)
# Interquartile range (IQR)
salaries_iqr = seventy_fifth - twenty_fifth
# Upper threshold: 1.5 IQRs above the 75th percentile
upper = seventy_fifth + (1.5 * salaries_iqr)
# Lower threshold: 1.5 IQRs below the 25th percentile
lower = twenty_fifth - (1.5 * salaries_iqr)
#! Check that the thresholds are within a meaningful range (e.g. a negative lower bound for salaries)

# Subset the data to the outliers: rows below the lower OR above the upper threshold
salaries[(salaries["Salary_USD"] < lower) | (salaries["Salary_USD"] > upper)][["Experience", "Employee_Location", "Salary_USD"]]

# Drop outliers: keep only rows strictly between the two thresholds
no_outlier = salaries[(salaries["Salary_USD"] > lower) & (salaries["Salary_USD"] < upper)]

## Relationship in Data
### Dealing with time data

In [None]:
# Each of the next three statements yields a datetime 'marriage_date' column.

# Parse the column as dates while reading the file
divorce = pd.read_csv("divorce.csv", parse_dates=['marriage_date'])

# Convert an already-loaded column to datetime
divorce['marriage_date'] = pd.to_datetime(divorce['marriage_date'])

# Combine separate 'month', 'day' and 'year' columns into one datetime column
# (assumes the DataFrame has these three columns -- TODO confirm)
divorce['marriage_date'] = pd.to_datetime(divorce[['month','day','year']])

# Extract the month number from the date via the .dt accessor
divorce['marriage_month'] = divorce['marriage_date'].dt.month

### Numeric Correlation

In [None]:
# See the pairwise correlation of the dataset's numeric columns.
# corr() is restricted to numeric columns first: on a frame that also holds
# text columns, recent pandas versions raise instead of skipping them.
divorce.select_dtypes("number").corr()
# Visualize the correlations as a heatmap, with each coefficient printed in its cell
sns.heatmap(divorce.select_dtypes("number").corr(), annot=True)


In [None]:
# NOTE(review): `msleep` is not loaded in any cell visible here -- load it before running.
# Visualize the relationship between two numeric variables
sns.scatterplot(x="sleep_total", y="sleep_rem", data=msleep)
# Add a linear trendline (ci=None hides the confidence band)
sns.lmplot(x="sleep_total", y="sleep_rem", data=msleep, ci=None)
# Compute the correlation coefficient between the two columns
msleep['sleep_total'].corr(msleep['sleep_rem'])

# Visualize every pairwise relationship in a pairplot
sns.pairplot(data=divorce)
# Visualize only the variables of interest (the keyword is `vars`, not `var`)
sns.pairplot(data=divorce, vars=['income_man', 'income_woman', 'marriage_duration'])

### Categorical Correlation
**Kernel Density Estimate (KDE) plots:**
creates a smooth curve that represents the data distribution, unlike histograms that use discrete bins

In [None]:
# Arguments shared by all three plots: one density curve per education level
kde_base = dict(data=divorce, x="marriage_duration", hue="education_man")

# Plain KDE
sns.kdeplot(**kde_base)
# cut=0 stops the curve at the data limits, so the x-axis does not extend below the minimum
sns.kdeplot(**kde_base, cut=0)
# Cumulative version of the same curves
sns.kdeplot(**kde_base, cut=0, cumulative=True)


## Class imbalance
Class imbalance occurs in datasets where the number of observations for one class is significantly larger or smaller than the other classes.

In [None]:
# Relative frequency of each class (normalize=True returns proportions instead of counts)
salaries['Job_Category'].value_counts(normalize=True)

# Cross-tabulation: how often each combination of two columns occurs.
# 'Source' supplies the row index, 'Destination' supplies the columns.
pd.crosstab(index=planes["Source"], columns=planes["Destination"])

# Same table, but each cell aggregates 'Price' (median) instead of counting rows
pd.crosstab(
    index=planes["Source"],
    columns=planes["Destination"],
    values=planes["Price"],
    aggfunc="median",
)

## Create new categorical columns

In [None]:
# Define the thresholds: quartiles and maximum of the price
twenty_fifth = planes["Price"].quantile(0.25)
median = planes["Price"].median()
seventy_fifth = planes["Price"].quantile(0.75)
maximum = planes["Price"].max()
# Set labels and bins. Labels are listed from cheapest to most expensive so
# that they line up with the ascending bin edges (4 labels for 5 edges).
labels = ["Economy", "Premium Economy", "Business Class", "First Class"]
bins = [0, twenty_fifth, median, seventy_fifth, maximum]
# Map the numeric column to the category labels (the keyword is lowercase `labels`)
planes["Price_Category"] = pd.cut(planes["Price"], labels=labels, bins=bins)