# DATA 3950 Assignment 2 - Regression #

In [292]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import ExtraTreesRegressor 
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel 
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import r2_score
from sklearn.ensemble import ExtraTreesClassifier

### Make Ready to Impute and Data Loading ###

In [293]:
def makeReadyToImpute(df_):
    num_cells = df_.size
    num_del = int(num_cells * 0.1)
    mask = pd.DataFrame(True, index=df_.index, columns=df_.columns)
    cells_to_del = mask.stack().sample(n=num_del).index
    for idx in cells_to_del:
        df_.at[idx] = np.nan
    return df_
df_ = pd.read_csv("Euro_Salary.csv")

### Construct a Target ###

In [294]:
df_["Bonus"] = pd.to_numeric(df_["Yearly bonus + stocks in EUR"], downcast="float", errors="coerce")
df_["Bonus"].fillna(0, inplace=True)
df_["target"] = df_["Yearly brutto salary (without bonus and stocks) in EUR"] + df_["Bonus"]
df_ = makeReadyToImpute(df_)
df_.drop(columns={"Timestamp","Yearly brutto salary (without bonus and stocks) in EUR", "Yearly bonus + stocks in EUR", "Bonus"}, inplace=True)
df_.head()

Unnamed: 0,Age,Gender,City,Position,Total years of experience,Years of experience in Germany,Seniority level,Your main technology / programming language,Other technologies/programming languages you use often,Number of vacation days,Employment status,Сontract duration,Main language at work,Company size,Company type,target
0,26.0,Male,Munich,Software Engineer,5,3,Senior,,"Kotlin, Javascript / Typescript",,Full-time employee,Unlimited contract,English,,Product,85000.0
1,26.0,Male,Berlin,Backend Developer,7,4,Senior,Ruby,,28.0,Full-time employee,Unlimited contract,English,101-1000,Product,80000.0
2,29.0,Male,Berlin,Software Engineer,12,6,,Javascript / Typescript,"Javascript / Typescript, Docker",30.0,Self-employed (freelancer),Temporary contract,English,,Product,240000.0
3,28.0,Male,Berlin,Frontend Developer,4,1,Junior,,,24.0,Full-time employee,,,51-100,Startup,54000.0
4,37.0,Male,Berlin,Backend Developer,17,6,Senior,C# .NET,".NET, SQL, AWS, Docker",29.0,Full-time employee,Unlimited contract,English,101-1000,Product,62000.0


### Data Preparation ###

In [295]:
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 16 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   Age                                                     1109 non-null   float64
 1   Gender                                                  1113 non-null   object 
 2   City                                                    1107 non-null   object 
 3   Position                                                1123 non-null   object 
 4   Total years of experience                               1109 non-null   object 
 5   Years of experience in Germany                          1094 non-null   object 
 6   Seniority level                                         1113 non-null   object 
 7   Your main technology / programming language             1010 non-null   object 
 8   Other technologies/programming languag

In [296]:
print(df_.columns)

Index(['Age', 'Gender', 'City', 'Position ', 'Total years of experience',
       'Years of experience in Germany', 'Seniority level',
       'Your main technology / programming language',
       'Other technologies/programming languages you use often',
       'Number of vacation days', 'Employment status', 'Сontract duration',
       'Main language at work', 'Company size', 'Company type', 'target'],
      dtype='object')


In [297]:
# Remove outliers 
df_ = df_[df_["target"] < 250000]

In [298]:
df_ = df_.drop(columns=(["Gender", "City", "Your main technology / programming language", "Other technologies/programming languages you use often", "Number of vacation days","Сontract duration","Main language at work"]))
df_.head()

Unnamed: 0,Age,Position,Total years of experience,Years of experience in Germany,Seniority level,Employment status,Company size,Company type,target
0,26.0,Software Engineer,5,3,Senior,Full-time employee,,Product,85000.0
1,26.0,Backend Developer,7,4,Senior,Full-time employee,101-1000,Product,80000.0
2,29.0,Software Engineer,12,6,,Self-employed (freelancer),,Product,240000.0
3,28.0,Frontend Developer,4,1,Junior,Full-time employee,51-100,Startup,54000.0
4,37.0,Backend Developer,17,6,Senior,Full-time employee,101-1000,Product,62000.0


In [299]:
## Change some columns' Dtype to "Float64"

df_['Total years of experience'] = pd.to_numeric(df_['Total years of experience'], errors='coerce')
df_['Total years of experience'] = df_['Total years of experience'].astype('float64')
df_['Years of experience in Germany'] = pd.to_numeric(df_['Years of experience in Germany'], errors ='coerce') 
df_['Years of experience in Germany'] = df_['Years of experience in Germany'].astype('float64')
print(df_.dtypes)

Age                               float64
Position                           object
Total years of experience         float64
Years of experience in Germany    float64
Seniority level                    object
Employment status                  object
Company size                       object
Company type                       object
target                            float64
dtype: object


In [300]:
numeric_features_ = ["Age", "Total years of experience", "Years of experience in Germany"]
numeric_transformer_ = Pipeline( steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())])

categorical_features_ = ["Position ","Seniority level","Employment status","Company size","Company type"]
categorical_transformer_ = Pipeline( steps=[
    ("impute_mode", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore"))])

preprocessor_ = ColumnTransformer(transformers=[
    ("num", numeric_transformer_, numeric_features_),
    ("cat", categorical_transformer_, categorical_features_)], remainder="passthrough")

### Model Building without feature selection ###

In [301]:
y_ = df_["target"]
X_ = df_.drop(columns={"target"})

regressor = ExtraTreesRegressor()
pipeline_steps_ = [('pre', preprocessor_),('regressor', regressor) ]
pipe_ = Pipeline(pipeline_steps_)

X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_)

pipe_.fit(X_train_, y_train_)
print("Test score:", pipe_.score(X_test_, y_test_))

y_pred_ = pipe_.predict(X_test_)

Test score: -0.09306098579584221


In [302]:
# Calculate R2 and RMSE
y_pred_ = pipe_.predict(X_test_)
r2 = r2_score(y_test_, y_pred_)

mse = mean_squared_error(y_test_, y_pred_)
rmse = np.sqrt(mse)

print(r2)
print(rmse)

-0.09306098579584221
38236.708886238855


### Model Building with feature selection ###

In [304]:
# Select from Model-Tree Based Selection 
tree_sel = ExtraTreesClassifier().fit(X_, y_)
sfm_class = SelectFromModel(tree_sel, prefit=True)
x_new = sfm_class.transform(X_)
print(x_new.shape)

mask = sfm_class.get_support()
new_features_select = df_.columns[mask]
print(new_features_select)

# Using Recursive Feature Elimination 
est = LogisticRegression (max_iter = 10000, solver = "saga")
rfe = RFE(estimator=est, n_features_to_select=12, step=1)
rfe.fit(X_, y_)
print(rfe.ranking_)
mask2 = rfe.get_support()
new_features_rfe = df.columns[mask2]
print(new_features_rfe)

ValueError: could not convert string to float: 'Software Engineer'