Import all the relevant packages.

In [1]:
import pandas as pd
import category_encoders as ce
import lightgbm as lgb
import numpy as np

The <b>dropIrrelevantColumns</b> function drops the following columns from the dataset passed as its argument:
<ul>
    <li>Wears Glasses</li>
    <li>Hair Color</li>
    <li>Instance</li>
</ul>

In [3]:
def dropIrrelevantColumns(data):
    """Return a copy of ``data`` without the columns that are not used as features.

    The columns removed are 'Wears Glasses', 'Hair Color' and 'Instance'.
    The input frame itself is left untouched. A ``KeyError`` is raised by
    pandas if any of the three columns is absent.
    """
    unused_columns = ['Wears Glasses', 'Hair Color', 'Instance']
    return data.drop(unused_columns, axis=1)

<p>The <b>preprocessData</b> function is the most important function. It performs all the imputations and transformations needed to clean and preprocess the data.</p>
The function takes 2 arguments:
<ol>
    <li>Training Dataset</li>
    <li>Test Dataset</li>
</ol>
The function returns 3 values:
<ol>
    <li>Dataframe containing independent variables of training data</li>
    <li>Dataframe containing dependent variable</li>
    <li>Dataframe containing independent variables of test data</li>
</ol>
Following are the steps performed:
<ol>
    <li>Split the dataset into Independent and Dependent Variables</li>
    <li>Add a new column to both datasets named <b>train</b>. Set the value as 1 for training dataset and set the value as 0 for test dataset. This variable will identify which entries belong to which dataset.</li>
    <li>Combine the training and testing datasets. This is done because the test dataset contains a few values that do not appear in the training dataset.</li>
    <li>As part of preprocessing, following operations are performed.
        <ol>
            <li>Fill <i>NaN</i> in <b>Gender</b> as <i>unknown</i></li>
            <li>Fill <i>NaN</i> in <b>University Degree</b> as <i>unknown</i></li>
            <li>Fill <i>NaN</i> in <b>Profession</b> as <i>unknown</i></li>
            <li>Fill <i>NaN</i> in <b>Country</b> as <i>unknown</i></li>
            <li>Fill <i>NaN</i> in <b>Age</b> with the <i>median value</i></li>
            <li>Fill <i>NaN</i> in <b>Year of Record</b> with the <i>median value</i></li>
            <li>Fill <i>NaN</i> in <b>Body Height [cm]</b> with the <i>mean value</i></li>
            <li>Fill <i>NaN</i> in <b>Work Experience in Current Job [years]</b> with the <i>mean value</i></li>
            <li>Fill <i>NaN</i> in <b>Satisfation with employer</b> as <i>unknown</i></li>
            <li>Replace <i>numeric 0</i> in <b>Housing Situation</b> with <i>'zero'</i></li>
            <li>Split the dataset back into Training and Testing datasets</li>
        </ol>
    </li>
</ol>

In [None]:
def preprocessData(data, data_test):
    """Clean and impute the training and test datasets together.

    The last column of each input frame is dropped from the features (it is
    assumed to be the target column). The two feature frames are then
    concatenated so that the imputation statistics (median / mean) are
    computed over the combined data, cleaned, and split back apart.

    Args:
        data: Training DataFrame; must contain 'Total Yearly Income [EUR]'.
        data_test: Test DataFrame with the same column layout as ``data``.

    Returns:
        A tuple ``(X, Y, X_test)``: the cleaned training features, the
        training target Series and the cleaned test features.
    """
    target_col = 'Total Yearly Income [EUR]'
    extra_income_col = 'Yearly Income in addition to Salary (e.g. Rental Income)'
    experience_col = 'Work Experience in Current Job [years]'

    Y = pd.Series(data[target_col])

    # assign() returns new frames, so the marker column is never written into
    # the caller's DataFrames. 'train' records which rows came from which set.
    cmb = pd.concat([
        data.iloc[:, :-1].assign(train=1),
        data_test.iloc[:, :-1].assign(train=0),
    ])

    # The first whitespace-separated token of each value is the numeric
    # amount. The vectorised form avoids a slow row-wise apply and leaves
    # missing values as NaN instead of raising.
    cmb[extra_income_col] = cmb[extra_income_col].str.split().str[0].astype(float)

    # '#NUM!' placeholders (presumably spreadsheet error cells) are treated as
    # missing so that the column can be converted to numbers.
    cmb[experience_col] = pd.to_numeric(cmb[experience_col].replace('#NUM!', np.nan))

    # Results are assigned back explicitly: fillna(inplace=True) on a column
    # selection is deprecated and has no effect under pandas Copy-on-Write.
    for col in ('Gender', 'University Degree', 'Profession', 'Country',
                'Satisfation with employer'):
        cmb[col] = cmb[col].fillna('unknown')
    for col in ('Age', 'Year of Record'):
        cmb[col] = cmb[col].fillna(cmb[col].median())
    for col in ('Body Height [cm]', experience_col):
        cmb[col] = cmb[col].fillna(cmb[col].mean())

    # Series.replace returns a new Series; without the assignment the numeric
    # 0 entries would stay in the column next to the string categories.
    cmb['Housing Situation'] = cmb['Housing Situation'].replace(0, 'zero')

    is_train = cmb['train'] == 1
    X = cmb[is_train].drop('train', axis=1)
    X_test = cmb[~is_train].drop('train', axis=1)

    return (X, Y, X_test)


Function calls to <b>dropIrrelevantColumns</b> and <b>preprocessData</b>

In [None]:
# Load each CSV and strip the unused columns, then clean both sets together.
data, data_test = (
    dropIrrelevantColumns(pd.read_csv(csv_path))
    for csv_path in ('data.csv', 'data_test.csv')
)
X, Y, X_test = preprocessData(data, data_test)

Target encode the datasets

In [None]:
# Categorical features to target-encode. 'Hair Color' is deliberately absent:
# dropIrrelevantColumns removes it, so it no longer exists in X / X_test and
# must not be passed to the encoder.
cat_Col_names = ['Gender', 'Country', 'Profession', 'University Degree', 'Housing Situation', 'Satisfation with employer']
te = ce.TargetEncoder(verbose=2, cols=cat_Col_names)
# Fit the encoding on the training features and target only, then apply the
# fitted mapping to the test features.
X = te.fit_transform(X, Y)
X_test = te.transform(X_test)

Define the hyperparameters for lightgbm

In [None]:
# Hyperparameters handed to lgb.train.
params = dict(
    max_depth=20,
    learning_rate=0.003,
    boosting="gbdt",
    verbosity=2,
    num_leaves=150,
    n_jobs=12,
)

Once the preprocessing is done, the next step is to train the regressor on the data. Below is the code for training and prediction using LightGBM with 135000 boosting iterations.

In [None]:
train_data = lgb.Dataset(X, label=Y)
# No validation set is passed, so periodic evaluation logging has nothing to
# report; the former verbose_eval=1000 argument is therefore omitted
# (lightgbm >= 4.0 no longer accepts that keyword).
l = lgb.train(params, train_data, num_boost_round=135000)
Y_pred = l.predict(X_test)

In [None]:
# Drop the references that are no longer needed; only Y_pred is used below.
del X, data, data_test, Y, X_test

# Write one prediction per line, without a header row.
Y_pred = np.array(Y_pred)
with open("pred_team68.csv", "w") as out_file:
    out_file.writelines(str(value) + "\n" for value in Y_pred)