## Job Salary Prediction with TensorFlow
In this notebook I build a model that predicts salary using TensorFlow. Text fields with a small number of distinct values are treated as categorical features. Long free-text fields such as the job description, and text fields with too many distinct values such as job title, company, and location, are processed with NLP techniques.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import sys
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gc
from sklearn.model_selection import train_test_split

## Import datasets

In [None]:
# Shell escape: unpack the training archive; the CSV it contains is read by name in a later cell.
!unzip /kaggle/input/job-salary-prediction/Train_rev1.zip

In [None]:
# Shell escape: unpack the test archive; the CSV it contains is read by name in a later cell.
!unzip /kaggle/input/job-salary-prediction/Test_rev1.zip

In [None]:
# Load the extracted training CSV and preview its first rows.
train = pd.read_csv(filepath_or_buffer="Train_rev1.csv")
train.head(n=5)

In [None]:
# Load the extracted test CSV and preview its first rows.
test = pd.read_csv(filepath_or_buffer="Test_rev1.csv")
test.head(n=5)

## EDA & preprocessing

In [None]:
# Show the first 30 training rows.
train.iloc[:30]

## Title

In [None]:
# Lower-case every title; str() turns missing values into the text "nan".
train["Title"] = train["Title"].map(lambda title: str(title).lower())

In [None]:
# Number of distinct titles (missing values, if any, count as one).
train["Title"].nunique(dropna=False)

### Popular titles

In [None]:
# The 30 most frequent titles.
train["Title"].value_counts().head(30)

### Locations

In [None]:
# Lower-case every location. str() is applied first, as for the Title and
# Company columns, so a non-string value (e.g. NaN) cannot raise AttributeError.
train["LocationNormalized"] = train["LocationNormalized"].apply(lambda item: str(item).lower())

In [None]:
# Number of distinct normalized locations (missing values, if any, count as one).
train["LocationNormalized"].nunique(dropna=False)

In [None]:
# The 50 most frequent locations.
train["LocationNormalized"].value_counts().head(50)

### Contract Time

In [None]:
# Frequency of each contract-time value, most common first.
train["ContractTime"].value_counts(sort=True, ascending=False)

### Contract Type

In [None]:
# Bar chart of contract-type frequencies.
train["ContractType"].value_counts().plot.bar()

### Company

In [None]:
# Lower-case every company name; str() turns missing values into the text "nan".
train["Company"] = train["Company"].map(lambda company: str(company).lower())

In [None]:
# Number of distinct company names (missing values, if any, count as one).
train["Company"].nunique(dropna=False)

In [None]:
# The 30 most frequent companies.
train["Company"].value_counts().head(30)

### Category

In [None]:
# Bar chart of job-category frequencies.
train["Category"].value_counts().plot.bar()

### Salary

In [None]:
# Histogram of the target salary (10 bins, the pandas default).
train["SalaryNormalized"].hist(bins=10)

In [None]:
# Summary statistics of the target salary (quartiles are the pandas default).
train["SalaryNormalized"].describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Stack train on top of test so both receive identical preprocessing.
# Original row labels are kept; later cells split the frame back by position.
train_test = pd.concat(objs=[train, test], axis="index")
train_test.head(n=5)

In [None]:
sys.setrecursionlimit(100000)
begin = time.time()
# Merge title, company, location and description into one tagged string per
# row. Iterating the four columns in lockstep avoids building a temporary
# Series for every row (what ``train_test.iloc[i]`` does) while producing the
# same strings; str() renders missing values as "nan".
full_texts = [
    "title %s company %s location %s description %s"
    % (str(title), str(company), str(location), str(description))
    for title, company, location, description in zip(
        train_test["Title"],
        train_test["Company"],
        train_test["LocationNormalized"],
        train_test["FullDescription"],
    )
]
print("Elapsed time: %.2fs"%(time.time() - begin))

In [None]:
# Store the merged strings as a new column; full_texts is a plain list, so values are placed by position.
train_test["full_text"] = full_texts

In [None]:
# Drop the list name now that the strings live in the DataFrame, then run a garbage-collection pass.
del full_texts
gc.collect()

In [None]:
def preprocess(df):
    """Clean the ``full_text`` column of *df* and derive token columns.

    The frame is modified in place and also returned:

    * ``full_text`` has every run of characters that are neither word
      characters nor whitespace removed, and is lower-cased;
    * ``full_text_tokens`` holds ``full_text`` split on single spaces
      (consecutive spaces therefore yield empty-string tokens);
    * ``full_text_sequence_length`` holds the number of tokens.

    Args:
        df: DataFrame with a string column named ``full_text``.

    Returns:
        The same DataFrame object, with the columns above added/updated.
    """
    # regex=True must be explicit: from pandas 2.0 on, Series.str.replace
    # treats the pattern as a literal string by default, which would leave
    # all punctuation in place.
    df['full_text'] = df['full_text'].str.replace(r'[^\w\s]+', '', regex=True)
    df['full_text'] = df['full_text'].str.lower()
    df["full_text_tokens"] = df["full_text"].apply(lambda item: item.split(" "))
    df["full_text_sequence_length"] = df["full_text_tokens"].apply(len)
    return df

In [None]:
%%time
# Clean the merged text and add the token / length columns (preprocess edits the frame in place and returns it).
train_test = preprocess(train_test)
train_test.head()

In [None]:
# Row counts of the original frames; train_samples is the boundary used to
# split the combined frame back into train and test.
train_samples = train.shape[0]
test_samples = test.shape[0]

In [None]:
# The separate frames are no longer needed once they are combined in train_test.
del train, test
gc.collect()

In [None]:
# Distribution of token counts per posting.
train_test.loc[:, "full_text_sequence_length"].describe()

## Handle Categorical Features

In [None]:
# Low-cardinality columns that are one-hot encoded for the model.
categorical_columns = ["ContractType", "ContractTime", "Category"]
# Missing values in these columns are deliberately left as NaN. The previous
# ``train_test[category].replace(np.NAN, "unknown")`` loop discarded its result,
# so it never changed the data, and the ``np.NAN`` alias it used was removed in
# NumPy 2.0. Actually filling with "unknown" would add extra one-hot columns and
# change the categorical feature width the model is built for.

In [None]:
# One-hot encode the categorical columns for train and test together.
X_categorical = pd.get_dummies(data=train_test[categorical_columns])
X_categorical.head(n=5)

### Analysis of word counts

In [None]:
%%time
from collections import defaultdict
word_count = defaultdict(int)
for tokens in train_test["full_text_tokens"]:
    for token in tokens:
       word_count[token] += 1

### Total Number of words

In [None]:
# Tabulate the token frequencies: one row per distinct token.
word_count_df = pd.DataFrame(
    {
        "key": list(word_count),
        "count": list(word_count.values()),
    }
)
word_count_df.head(n=5)

In [None]:
# Total number of distinct tokens.
word_count_df.shape[0]

In [None]:
# Order tokens from most to least frequent and show the top 30.
word_count_df = word_count_df.sort_values(by="count", ascending=False)
word_count_df.head(30)

In [None]:
# How many tokens occur three times or fewer.
int((word_count_df["count"] <= 3).sum())

In [None]:
# Undo the train/test concatenation by position: the first train_samples rows
# are training data, the remainder is test data.
X_text = train_test["full_text"]
train_categorical, test_categorical = (
    X_categorical.iloc[:train_samples],
    X_categorical.iloc[train_samples:],
)
train_text, test_text = X_text.iloc[:train_samples], X_text.iloc[train_samples:]
# Only the training rows carry a salary label.
train_label = train_test["SalaryNormalized"].iloc[:train_samples]
del train_test
gc.collect()

### Train validation split

In [None]:
# Hold out 20% of the training rows for validation; the three inputs are
# split with the same shuffled indices, seeded for reproducibility.
(
    x_train_categorical,
    x_valid_categorical,
    x_train_text,
    x_valid_text,
    y_train_label,
    y_valid_label,
) = train_test_split(
    train_categorical,
    train_text,
    train_label,
    test_size=0.2,
    random_state=42,
)

## Make TensorFlow dataset

In [None]:
def preprocess_test(categorical, text):
    """Give an unlabeled element the ``(features, label)`` layout.

    Returns ``((categorical, text), 0)``; the ``0`` is only a placeholder label.
    """
    return (categorical, text), 0


def make_dataset(categorical, text, label=None, batch_size = 1024, mode="train"):
    """Build a batched ``tf.data.Dataset`` of ``((categorical, text), label)``.

    Args:
        categorical: Categorical features, sliced along the first axis.
        text: Text features, sliced in step with ``categorical``.
        label: Targets, sliced in step with the features. Required unless
            ``mode == "test"``, where a placeholder label of 0 is attached.
        batch_size: Number of elements per batch.
        mode: ``"train"`` shuffles the elements, ``"test"`` builds an
            unlabeled dataset, and any other value (e.g. ``"valid"``) builds
            a labeled dataset without shuffling.

    Returns:
        The cached, batched and prefetched dataset.

    Raises:
        ValueError: If ``label`` is missing for a mode other than ``"test"``.
    """
    if mode != "test" and label is None:
        raise ValueError("label is required unless mode == 'test'")
    if mode == "test":
        ds = tf.data.Dataset.from_tensor_slices((categorical, text)).map(preprocess_test)
    else:
        ds = tf.data.Dataset.from_tensor_slices(((categorical, text), label))
    # Cache BEFORE shuffling: a cache placed after shuffle/batch stores the
    # first epoch's order and replays it, so later epochs are not reshuffled.
    ds = ds.cache()
    if mode == "train":
        ds = ds.shuffle(1024)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

In [None]:
# Build the three input pipelines; only the training one is shuffled, and the
# test one has no real labels.
train_ds = make_dataset(x_train_categorical, x_train_text, label=y_train_label, mode="train")
print(train_ds)
valid_ds = make_dataset(x_valid_categorical, x_valid_text, label=y_valid_label, mode="valid")
print(valid_ds)
test_ds = make_dataset(test_categorical, test_text, label=None, mode="test")
print(test_ds)

In [None]:
# Drop the names of the split inputs now that the datasets have been built
# from them, then run a garbage-collection pass.
del x_train_categorical, x_train_text, y_train_label
del x_valid_categorical, x_valid_text, y_valid_label
gc.collect()

In [None]:
# Print one training batch to sanity-check its structure.
for item in train_ds.take(count=1):
    print(item)

## Modeling

In [None]:
class Config:
    """Hyper-parameters for the text pipeline, held as class attributes."""

    # Vocabulary cap: passed to the text vectorizer as max_tokens and used
    # as the embedding's input dimension.
    vocab_size = 30000
    # Length of sequence
    sequence_length = 256
    # Seed value; not read by any code visible in this notebook.
    random_state = 42


config = Config()

## Text Vectorization

In [None]:
# Maps raw strings to fixed-length integer token ids; vocabulary size and
# sequence length come from the shared config.
vectorizer_options = dict(
    max_tokens=config.vocab_size,
    output_sequence_length=config.sequence_length,
)
vectorizer = layers.TextVectorization(**vectorizer_options)

In [None]:
%%time
# Build the vectorizer's vocabulary from X_text, i.e. the combined train and test text, with ops pinned to the CPU device.
with tf.device("CPU"):
    vectorizer.adapt(X_text)

In [None]:
# The vocabulary is built; drop the combined-text name and run a garbage-collection pass.
del X_text
gc.collect()

In [None]:
def get_model(num_categorical_features=33):
    """Build the two-branch salary regression model.

    The text branch vectorizes raw strings with the module-level
    ``vectorizer``, embeds the tokens, applies three Conv1D + MaxPooling1D
    stages, averages over the sequence and projects to 32 units. The
    categorical branch is a stack of three Dense layers ending in 32 units.
    Both branches are concatenated and fed through a Dense(32) layer to a
    single-unit output.

    Args:
        num_categorical_features: Width of the categorical input vector.
            The default of 33 keeps the width this notebook was written
            for; pass the column count of the one-hot frame if it differs.

    Returns:
        A tuple ``(model, text_model, categorical_model)``. ``model`` takes
        its inputs in the order ``[categorical, text]``, which matches the
        ``(categorical, text)`` feature tuples of the datasets.
    """
    text_model = keras.Sequential([
        keras.Input(shape=(None, ), dtype="string"),
        vectorizer,
        layers.Embedding(config.vocab_size, 128, input_length=config.sequence_length, mask_zero=True),
        layers.Conv1D(filters=32, kernel_size=3, activation="relu"),
        layers.MaxPooling1D(),
        layers.Conv1D(filters=64, kernel_size=3, activation="relu"),
        layers.MaxPooling1D(),
        layers.Conv1D(filters=128, kernel_size=3, activation="relu"),
        layers.MaxPooling1D(),
        layers.GlobalAveragePooling1D(),
        layers.Dense(32, kernel_regularizer='l2'),
    ])
    categorical_model = keras.Sequential([
        keras.Input(shape=(num_categorical_features, ), dtype=tf.int32),
        layers.Dense(128, activation="relu", kernel_regularizer='l2'),
        layers.Dense(64, activation="relu", kernel_regularizer='l2'),
        layers.Dense(32, activation="relu", kernel_regularizer='l2'),
    ])
    # Fuse the two 32-unit branch outputs and regress a single value.
    x = keras.layers.Concatenate()([text_model.output, categorical_model.output])
    x = keras.layers.Dense(32, activation="relu")(x)
    output = keras.layers.Dense(1, activation="relu")(x)
    model = keras.Model(inputs=[categorical_model.input, text_model.input], outputs=[output])
    return model, text_model, categorical_model

In [None]:
# Build the combined model and keep handles to both branches for inspection.
model, text_model, categorical_model = get_model()

In [None]:
# Layer-by-layer summary of the combined model.
model.summary()

In [None]:
# Draw the combined model's graph with tensor shapes.
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Layer-by-layer summary of the text branch.
text_model.summary()

In [None]:
# Draw the text branch's graph with tensor shapes.
keras.utils.plot_model(text_model, show_shapes=True)

In [None]:
# Layer-by-layer summary of the categorical branch.
categorical_model.summary()

In [None]:
# Draw the categorical branch's graph with tensor shapes.
keras.utils.plot_model(categorical_model, show_shapes=True)

## Model Training

In [None]:
# Stop when val_loss has not improved for 10 epochs. restore_best_weights=True
# rolls the in-memory model back to its best epoch; without it the model used
# for prediction keeps the weights of the last epoch, while the best ones
# exist only in the checkpoint, which nothing reloads.
# NOTE(review): some Keras versions restore the weights only when early
# stopping actually triggers - confirm for the installed version.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, monitor="val_loss", restore_best_weights=True)
# Also keep the best model (by val_loss) on disk.
checkpoint = tf.keras.callbacks.ModelCheckpoint("model.tf", monitor="val_loss", save_best_only=True)
# Regression setup: mean squared error loss, mean absolute error as a metric.
model.compile(loss="mse", optimizer="adam", metrics=["mae"])
model.fit(train_ds, epochs=30, validation_data=valid_ds, callbacks=[checkpoint, early_stopping])

## Submission

In [None]:
# NOTE(review): this reads test.csv from the input directory, whereas test_ds was built
# from the Test_rev1.csv loaded earlier - confirm both files hold the same rows in the same order.
test = pd.read_csv("../input/job-salary-prediction/test.csv")
# Predict on the test dataset (its labels are placeholders and are not used here).
y_pred = model.predict(test_ds)
# Attach the predictions as a new column and write every column to the submission file, without the index.
test["SalaryNormalized"] = y_pred
test.to_csv("submission.csv", index=False)
test.head()

## 