In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing some tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [None]:
df_train = pd.read_csv('../input/job-salary-prediction/Train_rev1.zip', compression='zip', header=0, sep=',', quotechar='"')

In [None]:
df_train.head()

In [None]:
df_train.tail()

# Now Some EDA Part (Data Exploration)

In [None]:
df_train.describe()

In [None]:
df_train.info()

# Now Check missing values

In [None]:
df_train.isna().sum()

# Now check which columns hold string values

In [None]:
# Print the name of every training column that pandas reports as string-typed.
string_columns = [name for name, column in df_train.items()
                  if pd.api.types.is_string_dtype(column)]
for name in string_columns:
    print(name)

# Now Check for numerical label

In [None]:
# Print the name of every training column that pandas reports as numeric.
numeric_columns = [name for name, column in df_train.items()
                   if pd.api.types.is_numeric_dtype(column)]
for name in numeric_columns:
    print(name)

# This will turn all of the string values into ordered category values

In [None]:
# Re-type each string column of the training frame as an ordered categorical.
# The column names are collected first, then each column is replaced in place.
string_columns = [name for name, column in df_train.items()
                  if pd.api.types.is_string_dtype(column)]
for name in string_columns:
    df_train[name] = df_train[name].astype("category").cat.as_ordered()

# Now flag missing values and encode the non-numeric columns as integers

In [None]:
# For every non-numeric training column:
#   1. append a boolean "<column>is_missing" flag marking rows that were null;
#   2. replace the column with its integer category codes shifted by +1, so a
#      missing value (code -1) becomes 0 and real categories start at 1.
non_numeric_columns = [name for name, column in df_train.items()
                       if not pd.api.types.is_numeric_dtype(column)]
for name in non_numeric_columns:
    column = df_train[name]
    df_train[name + "is_missing"] = column.isna()
    df_train[name] = pd.Categorical(column).codes + 1

# Now Check Missing Values in Dataset

In [None]:
df_train.isna().sum()

# Now Data Visualization Part

In [None]:
ms = df_train["SalaryNormalized"][:10].plot.barh(figsize=(16,10))

In [None]:
df_train["SalaryNormalized"].hist()

# Now copy the train set dataset

In [None]:
df_copy = df_train.copy()

In [None]:
df_copy.head()

In [None]:
df_copy.tail()

# Now Split the data into X and y

In [None]:
# Target: the normalized salary. Features: every other column of df_copy.
# NOTE(review): X keeps the encoded SalaryRaw column, which presumably is the
# textual form of the same salary -- confirm whether this leaks the target.
y = df_copy["SalaryNormalized"]
X = df_copy.drop(columns="SalaryNormalized")

# Now Call Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

# Now build Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest using all CPU cores. The seed is fixed so the baseline
# MAE is reproducible across runs and comparable with the tuned model, whose
# estimator is also created with random_state=42.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

# Now Fit The Model

In [None]:
model.fit(X_train,y_train)

# Evaluation of the model
Evaluate model using mean absolute error

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
y_pred = model.predict(X_test)

In [None]:
mae_rand_forest = mean_absolute_error(y_test,y_pred)

In [None]:
mae_rand_forest

# Now some hyperparameter tuning with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Seed NumPy's global RNG so the candidates sampled by the randomized search
# (which is created without its own random_state) are repeatable.
np.random.seed(42)

# Search space for the random forest hyperparameters.
grid = {
    "n_estimators": np.arange(10, 100, 10),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": np.arange(2, 20, 2),
    "min_samples_leaf": np.arange(1, 20, 2),
    # The integer 1 means a single feature per split; the float 1.0 means all
    # features. 1.0 replaces the former "auto" option, which meant "all
    # features" for regressors and is rejected by scikit-learn >= 1.3.
    "max_features": [0.5, 1, "sqrt", 1.0],
    "max_samples": [10000, 12000, 15000, 20000],
}

In [None]:
# Randomized hyperparameter search: 5 sampled configurations from `grid`,
# each evaluated with 5-fold cross-validation on a seeded random forest.
rs_model = RandomizedSearchCV(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=42),
    param_distributions=grid,
    n_iter=5,
    cv=5,
    verbose=True,
)

# Now Fit RandomizedSearchCV Model

In [None]:
rs_model.fit(X_train,y_train)

In [None]:
rs_model.best_params_

# Now predict on the hold-out set with the best model found by the search

In [None]:
y_preds_rs = rs_model.predict(X_test)

# Evaluation of this model
Evaluate model using mean absolute error

In [None]:
mae_hyp = mean_absolute_error(y_test,y_preds_rs)

In [None]:
mae_hyp,mae_rand_forest

# Make predictions

# Now Importing test data

In [None]:
df_test = pd.read_csv('../input/job-salary-prediction/Test_rev1.zip', compression='zip', header=0, sep=',', quotechar='"')

# Now Check for missing values on df_test dataset

In [None]:
df_test.isna().sum()

In [None]:
df_test.head()

# Now Check for string label

In [None]:
# Print the name of every test column that pandas reports as string-typed.
string_columns = [name for name, column in df_test.items()
                  if pd.api.types.is_string_dtype(column)]
for name in string_columns:
    print(name)

# Now Check for numerical label

In [None]:
# Print the name of every test column that pandas reports as numeric.
numeric_columns = [name for name, column in df_test.items()
                   if pd.api.types.is_numeric_dtype(column)]
for name in numeric_columns:
    print(name)

# This will turn all of the string value into category values

In [None]:
# Re-type each string column of the test frame as an ordered categorical.
# The column names are collected first, then each column is replaced in place.
string_columns = [name for name, column in df_test.items()
                  if pd.api.types.is_string_dtype(column)]
for name in string_columns:
    df_test[name] = df_test[name].astype("category").cat.as_ordered()

# Now Filling missing values on test Dataset

In [None]:
# For every non-numeric test column:
#   1. append a boolean "<column>is_missing" flag marking rows that were null;
#   2. replace the column with its integer category codes shifted by +1, so a
#      missing value (code -1) becomes 0 and real categories start at 1.
# NOTE(review): the codes are derived from df_test's own values, independently
# of the codes computed for df_train, so the same string can map to a different
# integer in each frame -- confirm this is acceptable for the fitted model.
non_numeric_columns = [name for name, column in df_test.items()
                       if not pd.api.types.is_numeric_dtype(column)]
for name in non_numeric_columns:
    column = df_test[name]
    df_test[name + "is_missing"] = column.isna()
    df_test[name] = pd.Categorical(column).codes + 1

In [None]:
X_test.shape,y_test.shape  

# Now align the columns of df_test with X_train

In [None]:
set(X_train.columns)-set(df_test.columns)

In [None]:
# Add constant placeholder columns to df_test so it carries the SalaryRaw
# feature names that the training features have.
# NOTE(review): the model was fitted on the real encoded SalaryRaw values,
# whereas every test row gets False here -- confirm this feature should be
# part of the model at all.
df_test["SalaryRaw"] = False
df_test["SalaryRawis_missing"] = False

In [None]:
X_train.shape,df_test.shape

# Now Make predictions

In [None]:
# Pass the test features in exactly the column order used at fit time.
# The SalaryRaw placeholders were appended at the end of df_test, whereas in
# X_train SalaryRaw sits among the original columns, so the raw df_test order
# does not match the order the model was fitted with. Indexing with
# X_train.columns also raises a KeyError if any fitted feature is absent.
y_preds = model.predict(df_test[X_train.columns])

# Format predictions into the same format Kaggle is after

In [None]:
df_preds = pd.DataFrame()

In [None]:
# Build the submission frame: the job Id alongside its predicted salary.
df_preds = df_preds.assign(Id=df_test["Id"], SalaryNormalized=y_preds)

In [None]:
df_preds.head()

In [None]:
df_preds.to_csv(".//Submission.csv",index=False)

# Now find the feature importances of the fitted model

In [None]:
model.feature_importances_

# Helper function for plotting feature importance

In [None]:
def plot_features(columns, importances, n=20):
    """Plot the `n` largest feature importances as a horizontal bar chart.

    Parameters
    ----------
    columns : sequence of str
        Feature names, aligned one-to-one with `importances`.
    importances : sequence of float
        Importance score of each feature.
    n : int, default 20
        Number of top-ranked features to draw.
    """
    ranked = (pd.DataFrame({"features": columns,
                            "feature_importances": importances})
              .sort_values("feature_importances", ascending=False)
              .reset_index(drop=True))

    # Slice names and scores with the same `n` so both bar arguments always
    # have equal length (the scores were previously cut at a fixed 20).
    top = ranked.iloc[:n]

    # Plot the dataframe
    fig, ax = plt.subplots()
    ax.barh(top["features"], top["feature_importances"])
    ax.set_ylabel("Features")
    ax.set_xlabel("Feature importance")
    # Put the most important feature at the top of the chart.
    ax.invert_yaxis()

# Now Plot The Data

In [None]:
plot_features(X_train.columns,model.feature_importances_)

In [None]:
df_copy["SalaryRaw"].value_counts()