<a href="https://www.kaggle.com/code/sakibsrizon/sf-salary-stats-eda?scriptVersionId=109477584" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# <div style="text-align:center; border-style:solid; background-color:#e6faff; padding: 20px;line-height:1; border-radius:20px;">**SF Salary Model EDA**
</div>
 <h3 style = "text-decoration-line:underline;">Preprocessing</h3> 
 <ul style = "list-style-type: circle">
    <li>Detect continuous and categorical variables.</li>
    <li>Normalize and impute data.</li>
 </ul>
<h3 style = "text-decoration-line:underline;">For every target variable in the dataset</h3> 
 <ul style = "list-style-type: circle">
    <li>Compare performance on 27 models + a TabNet model.</li>
    <li>Normalize and impute data.</li>
    <li>Save plots + CSVs of XGBoost Feature Importances.</li>
    <li>Save best performing FastAI model.</li>
 </ul>


# <p style ='text-size: 1px; text-color: black'> ref: https://www.kaggle.com/code/sakibsrizon/sf-salaries-dataset-xgboost-fastai/edit</p>

In [None]:
# Installing essential libs
!pip install lazypredict fast-tabnet fastai pandas-profiling

In [None]:
# importing basic libs
import numpy as np 
import pandas as pd 
import pandas_profiling 

# LazyPredict
import lazypredict
from lazypredict.Supervised import LazyRegressor
from lazypredict.Supervised import LazyClassifier

# Baysian Optimization
from bayes_opt import BayesianOptimization

# FastAI
from fastai.tabular.all import *
from fastai.tabular.core import *
from fast_tabnet.core import *

# Plots
import matplotlib.pyplot as plt
import seaborn as sns

# System
import os,sys,traceback

# Fit for Xgboost model
from xgboost import XGBRegressor,XGBClassifier,plot_importance
from sklearn.metrics import mean_squared_error,roc_auc_score

# Random
import random

import shutil
%matplotlib inline
plt.style.use('seaborn-bright')
print("All Lib imported")

### Defining project variables

In [None]:
# Dataset/project name; interpolated into the input and output directory paths.
PROJECT_NAME = 'sanfrancisco-salary-3-years'
# NOTE(review): not referenced in the visible cells; purpose unconfirmed.
VARIABLE_FIELS =False

# Maximum number of rows read from the CSV (passed as `nrows` to pd.read_csv).
SAMPLE_COUNT = 2000
# Presumably the learning rate for the FastAI learner and a switch for
# automatic learning-rate selection; neither is used in the visible cells
# (TODO confirm against the training code).
FASTAI_LEARNING_RATE = 1e-1
AUTO_ADJUEST_LEARNING_RATE = False

# Set to True to automatically infer whether variables are categorical or continuous.
# NOTE(review): the flag name does not match this description; confirm which
# setting the comment was written for.
Enable_BREAKPOINTS = True 

# When a column declared as continuous turns out not to be, convert it to a
# categorical variable instead.
CONVERT_TO_CAT = False 
# Presumably selects regression (True) vs. classification (False); TODO confirm.
REGRESSOR = True 
# Enables the '$' / ',' stripping pass that adds `<col>_no_dollar` columns.
SEP_DOLLER = False 
# Enables the '%' / ',' stripping pass that adds `<col>_processed` columns.
SEP_COMMA = False
# Shuffle the rows of the dataframe during preprocessing.
SHUFFLE_DATA = True


In [None]:
# Locate the dataset and prepare the working directory for this project.
input_dir = f'../input/{PROJECT_NAME}'
param_dir = f'/kaggle/working/{PROJECT_NAME}'
TARGET = ''
PARAM_DIR = param_dir
print(f'param_dir: {param_dir}')
# exist_ok=True replaces the separate exists() check and is safe to re-run.
os.makedirs(param_dir, exist_ok=True)
# Build the CSV path from input_dir so that PROJECT_NAME is the single place
# that decides which dataset directory is read.
df = pd.read_csv(f'{input_dir}/SanFrancisco_salary (csv).csv', nrows=SAMPLE_COUNT)
df.head(10)

In [None]:
# Strip currency / percent formatting from text columns and add a numeric
# copy of each affected column next to the original.
if SEP_DOLLER:
    # Snapshot the column names: new columns are added inside the loop.
    for col in list(df.columns):
        if '$' in df[col].to_string():
            # regex=False is required: as a regular expression '$' means
            # "end of string", so the dollar sign itself would never be removed.
            df[col + '_no_dollar'] = (
                df[col]
                .str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False)
            )
            try:
                # errors='coerce' turns unparseable entries into NaN.
                df[col + '_no_dollar'] = pd.to_numeric(
                    df[col + '_no_dollar'], errors='coerce'
                )
            except (TypeError, ValueError):
                print(f'{col} can not be converted to float!')

if SEP_COMMA:
    for col in list(df.columns):
        rendered = df[col].to_string()
        if '%' in rendered or ',' in rendered:
            df[col + '_processed'] = (
                df[col]
                .str.replace('%', '', regex=False)
                .str.replace(',', '', regex=False)
            )
            # Try to convert the new column to a numeric type.
            try:
                df[col + '_processed'] = pd.to_numeric(
                    df[col + '_processed'], errors='coerce'
                )
            except (TypeError, ValueError):
                print(f'{col} can not be converted to a float!')

# Show the modified dataframe.
df.head(10)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# df.profile_report()

In [None]:
# Correlation heatmap over the numeric (and boolean) columns only: calling
# corr() on a frame that still contains text columns raises on pandas >= 2.0.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# First five rows, shaded with the "inferno" colour map.
df.head(5).style.background_gradient(cmap="inferno")

In [None]:
# Summary statistics, one row per column, shaded with the "viridis" colour map.
df.describe().transpose().style.background_gradient(cmap="viridis")

In [None]:
# Show the column labels of the dataframe.
df.columns

In [None]:
# Drop the columns that are not needed as features, then preview the result.
for unwanted in ('Unnamed: 0', 'EmployeeName', 'Id'):
    del df[unwanted]
df.head(10)

In [None]:
# Walk over every column except the first, from last to first, treating each
# one in turn as the target variable and running the shared housekeeping
# (config files, de-duplication, shuffling, dtype fixes, NaN filling).
target_str = ''
# Stays an empty list if no column qualifies as a target.
target = []
# Snapshot the names up front: columns can be deleted inside the loop, which
# would make positional indexing point at the wrong (or a missing) column.
candidate_cols = list(df.columns[:0:-1])
config_files = ('cats.txt', 'conts.txt', 'cols_to_delete.txt')

for col in candidate_cols:
    if col not in df.columns:
        # Removed through cols_to_delete.txt in an earlier iteration.
        continue
    try:
        numeric = pd.to_numeric(df[col], errors='coerce')
    except (TypeError, ValueError):
        continue
    if df[col].notna().any() and not numeric.notna().any():
        # Nothing in this column parses as a number: it cannot serve as a
        # numeric target, and overwriting it would wipe the text data.
        continue
    df[col] = numeric
    target = col
    target_str = str(target).replace('/', '-')
    print(f'Target Variable : {target}')

    # Create the project configuration files if they do not exist yet.
    os.makedirs(PARAM_DIR, exist_ok=True)
    for file_name in config_files:
        config_path = f'{PARAM_DIR}/{file_name}'
        if not os.path.exists(config_path):
            with open(config_path, 'w') as f:
                f.write('')

    df = df.drop_duplicates()
    if SHUFFLE_DATA:
        # frac=1 returns every row in random order.
        df = df.sample(frac=1).reset_index(drop=True)

    # Store boolean columns as 0/1 integers.
    for n in df:
        if pd.api.types.is_bool_dtype(df[n]):
            df[n] = df[n].astype('uint8')

    # Drop the columns listed (one per line) in cols_to_delete.txt; names that
    # are not present in the dataframe are ignored.
    with open(f'{PARAM_DIR}/cols_to_delete.txt', 'r') as f:
        cols_to_delete = f.read().splitlines()
    df = df.drop(columns=cols_to_delete, errors='ignore')

    # Best-effort NaN fill: some dtypes reject 0 as a fill value.
    try:
        df = df.fillna(0)
    except (TypeError, ValueError) as err:
        print(f'Could not fill missing values with 0: {err}')

df = df_shrink(df)

# Auto detect Categorical and continuous varibales 
likely_cat = 
        