# Exploratory Data Analysis using Python

## Initial look

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# --- Load the datasets (one CSV per DataFrame, read in the order listed) ---
unemployment, books, salaries, planes = (
    pd.read_csv(csv_path)
    for csv_path in (
        'clean_unemployment.csv',
        'clean_books.csv',
        'sd_salaries_clean.csv',
        'planes.csv',
    )
)

# --- First look: leading rows, then column dtypes and non-null counts ---
unemployment.head()
unemployment.info()

# --- Categorical column: frequency of each distinct value ---
# NOTE: 'catg_col' is presumably a placeholder - substitute a real column name
unemployment.value_counts(subset='catg_col')

# --- Numerical columns: summary statistics ---
unemployment.describe()

# --- Numerical column: histogram of "rating" with bins one unit wide ---
sns.histplot(books, x="rating", binwidth=1)

## Data Validation

In [None]:
# Data type
# Convert a column with astype(); it accepts a dtype such as int, float, str or bool
df['column'] = df['column'].astype('int')

# Validate categorical data
# True where the value is one of the listed categories
df['catg_data'].isin(['first_data', 'second_data'])
# ~ inverts the mask: True where the value is NOT one of the listed categories
~df['catg_data'].isin(['first_data', 'second_data'])

# Validate numeric data
# Keep only the numeric columns
df.select_dtypes("number")
# Range check: smallest and largest value of a numeric column
df['num_col'].min()
df['num_col'].max()



## Missing Data
### 1. Detect Missing Data

In [None]:
# Element-wise check: a True/False frame with the same shape as df
df.isna()
# Per-column flag: True if the column holds at least one missing value
df.isna().any(axis=0)
# Per-column count of missing values
df.isna().sum(axis=0)
# Bar chart of the per-column missing-value counts
df.isna().sum(axis=0).plot(kind="bar")

### 2. Dealing with Missing Data

- Drop missing values
    Appropriate when they make up 5% or less of the total values
- Impute the mean, median, or mode
    Which one to use depends on the distribution and the context
- Impute by sub-group
    e.g. different experience levels have different median salaries

In [None]:
# Count the missing values in each column
df.isna().sum()

# Drop every row containing a missing value - reasonable when only a small
# share of rows is affected. Returns a new DataFrame; df itself is unchanged.
df.dropna()
# Replace every missing value with 0 (also returns a new DataFrame)
df.fillna(0)

# Find the columns whose missing values make up 5% or less of the rows
threshold = len(salaries) * 0.05
# The mask is computed from salaries, so it must index salaries.columns
cols_to_drop = salaries.columns[salaries.isna().sum() <= threshold]
# Drop the rows that have a missing value in any of those columns
salaries.dropna(subset=cols_to_drop, inplace=True)

# Impute a summary statistic (here the mode, i.e. the most frequent value)
cols_with_missing_values = df.columns[df.isna().sum() > 0]
# [:-1] leaves the last column that has missing values untouched
# NOTE(review): presumably so it can be imputed by sub-group instead - confirm
for col in cols_with_missing_values[:-1]:
    # fillna() returns a new Series, so assign it back to keep the result
    df[col] = df[col].fillna(df[col].mode()[0])

# Impute by sub-group: median salary for each experience level
salaries_dict = salaries.groupby("Experience")["Salary_USD"].median().to_dict()
# Output: {'Entry': 55380.0, 'Executive': 135439.0, 'Mid': 74173.5, 'Senior': 128903.0}
# Map each row's experience level to its group median and use it to fill the gaps
salaries["Salary_USD"] = salaries["Salary_USD"].fillna(salaries["Experience"].map(salaries_dict))


## Analyzing Categorical Data

In [None]:
# Peek at the first rows of the object-dtype columns
salaries.select_dtypes(include="object").head(n=5)
# Frequency of each distinct value in the Designation column
salaries["Designation"].value_counts()
# How many distinct values the Designation column holds
salaries["Designation"].nunique()

### 1. Extracting value from categories

In [None]:
# Search a column for a specific string
salaries["Designation"].str.contains("Scientist")
# Match any of several phrases - separate them with ``|``
salaries["Designation"].str.contains("Machine Learning|AI")
# Match only values that start with "Data" - anchor with ``^``
salaries["Designation"].str.contains("^Data")

# Create a category column based on string conditions.
# The labels must be listed in the same order as `conditions` below:
# np.select pairs them up by position.
job_categories = ["Data Science", "Data Analytics", "Data Engineering",
                  "Machine Learning", "Managerial", "Consultant"]

# Create keyword patterns for each job category
data_science = "Data Scientist|NLP"
data_analyst = "Analyst|Analytics"
data_engineer = "Data Engineer|ETL Architect|Infrastructure"
ml_engineer = "Machine Learning|ML|Big Data|AI"
manager = "Manager|Head|Director|Lead|Principal|Staff"
consultant = "Consultant|Freelance"

# Create one condition per job category (same order as job_categories)
conditions = [
    (salaries["Designation"].str.contains(data_science)),
    (salaries["Designation"].str.contains(data_analyst)),
    (salaries["Designation"].str.contains(data_engineer)),
    (salaries["Designation"].str.contains(ml_engineer)),
    (salaries["Designation"].str.contains(manager)),
    (salaries["Designation"].str.contains(consultant)),
]

# Create the Job_Category column: each row gets the label of the first
# condition it satisfies, or "Other" when it satisfies none
salaries["Job_Category"] = np.select(
    conditions,
    job_categories,
    default="Other")

#  1