# Task for Today  

***

## Engineering Salary Prediction  
  
Given *data about engineering students*, let's try to predict the **salary earned** by a given student.  
  
We will train a linear regression model and an XGBoost regressor, then compare their R² scores on a held-out test set.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed. The fully qualified key is
# required: the bare 'max_columns' pattern matches more than one pandas option
# (display.* and styler.render.*) and raises OptionError on current pandas.
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [None]:
# Load the raw engineering-graduate salary CSV into a DataFrame.
data = pd.read_csv('../input/engineering-graduate-salary-prediction/Engineering_graduate_salary.csv')

In [None]:
# Display the raw DataFrame for a first visual inspection.
data

In [None]:
# Print the column names, dtypes and non-null counts of the raw data.
data.info()

# Preprocessing

In [None]:
def onehot_encode(df, column):
    """Return a new DataFrame with ``column`` replaced by indicator columns.

    One indicator column is produced per distinct value of ``df[column]``,
    named ``<column>_<value>``, and appended after the remaining columns.
    The input DataFrame is left untouched.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=column)
    remaining_columns = df.drop(column, axis=1)
    return pd.concat([remaining_columns, indicator_columns], axis=1)

In [None]:
def preprocess_inputs(df):
    """Turn the raw graduate DataFrame into scaled train/test splits.

    Steps, in order: drop ``ID``; encode ``Gender`` ('f' -> 0, 'm' -> 1);
    expand ``DOB`` into year/month/day columns; one-hot encode the
    categorical columns; treat ``-1`` as missing and fill missing values with
    the column mean; split 70/30 with a fixed seed; standardize the features
    using statistics fitted on the training split only.

    Args:
        df: Raw DataFrame containing at least ``ID``, ``Gender``, ``DOB``,
            ``Salary`` and the categorical columns listed below. It is not
            modified.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The feature sets are scaled
        DataFrames that keep the row labels of their split, so they stay
        label-aligned with the corresponding target Series.
    """
    # drop() returns a new frame, so the caller's DataFrame is never mutated.
    df = df.drop('ID', axis=1)

    # Binary encoding
    df['Gender'] = df['Gender'].replace({'f': 0, 'm': 1})

    # Date encoding: split the birth date into numeric components.
    dob = pd.to_datetime(df['DOB'])
    df['DOB_year'] = dob.dt.year
    df['DOB_month'] = dob.dt.month
    df['DOB_day'] = dob.dt.day
    df = df.drop('DOB', axis=1)

    # One-hot encoding
    for column in ['10board', '12board', 'Degree', 'Specialization', 'CollegeState']:
        df = onehot_encode(df, column)

    # Encode and fill missing values. -1 is treated as "missing" everywhere.
    # (np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.)
    df = df.replace(-1, np.nan)
    # Count missing values once instead of once per column.
    missing_counts = df.isna().sum()
    # NOTE(review): the means are computed before the train/test split, so
    # test rows contribute to the imputed values; the target column is also
    # subject to this replacement/imputation if it contains -1.
    for column in missing_counts[missing_counts > 0].index:
        df[column] = df[column].fillna(df[column].mean())

    # Split df into X and y
    y = df['Salary']
    X = df.drop('Salary', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    # Scale X: fit on the training split only, then apply to both splits.
    # The original row labels are kept so X_* and y_* stay aligned by index.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(
        scaler.transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X.columns, index=X_test.index
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test feature sets and their salary targets.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Inspect the scaled training features.
X_train

In [None]:
# Inspect the training targets (the Salary column).
y_train

# Training

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator),
# then report its R^2 on the held-out test split.
lin_model = LinearRegression().fit(X_train, y_train)
lin_r2 = lin_model.score(X_test, y_test)
print("Linear Regression R^2 Score: {:.5f}".format(lin_r2))

In [None]:
# Fit an XGBoost regressor with default hyperparameters on the training split
# (fit() returns the estimator), then report its R^2 on the test split.
xgb_model = XGBRegressor().fit(X_train, y_train)
xgb_r2 = xgb_model.score(X_test, y_test)
print("XGBoost R^2 Score: {:.5f}".format(xgb_r2))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/aQ_8zjPWtvw