In [202]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

In [203]:
# Load the cleaned dataset and discard the CSV's leftover index column along
# with the raw 'company', 'education' and 'race' columns, which are not fed to
# the model in this section.
df = pd.read_csv('cleaned_data.csv').drop(
    columns=['Unnamed: 0', 'company', 'education', 'race']
)

In [204]:
# Columns routed through the numeric branch of the preprocessor.
# NOTE(review): 'basesalary', 'stockgrantvalue' and 'bonus' look like
# components of the 'totalyearlycompensation' target -- confirm that using
# them as predictors is intended.
numeric_features = [
    'yearsofexperience',
    'yearsatcompany',
    'basesalary',
    'stockgrantvalue',
    'bonus',
    'Masters_Degree',
    'Bachelors_Degree',
    'Doctorate_Degree',
    'Highschool',
    'Some_College',
    'Race_Asian',
    'Race_White',
    'Race_Two_Or_More',
    'Race_Black',
    'Race_Hispanic',
    'in_top_15',
]

# Every numeric column is squared element-wise before reaching the model.
# NOTE(review): no scaling is applied, so large-valued columns will dominate
# the KNN distance -- confirm this is deliberate.
numeric_transformer = FunctionTransformer(func=np.square)

In [205]:
# String-valued columns that are one-hot encoded for the model.
categorical_features = [
    "title",
    "gender",
    "state",
]

# handle_unknown="ignore": a category not seen during fit is encoded as all
# zeros at predict time instead of raising an error.
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
)

In [206]:
# Apply the numeric transform and the one-hot encoder to their respective
# column lists. Columns named in neither list are dropped, which is
# ColumnTransformer's default `remainder` behaviour.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [207]:
# End-to-end model: preprocessing followed by a k-nearest-neighbours regressor
# with default hyperparameters. The final step keeps the name "classifier"
# even though it holds a regressor.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsRegressor()),
])

In [208]:
# Target is total yearly compensation; every other remaining column is a
# candidate predictor. Hold out 20% of the rows with a fixed seed so the
# split is reproducible.
y = df["totalyearlycompensation"]
X = df.drop(columns=["totalyearlycompensation"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [209]:
# Fit the preprocessing + KNN pipeline on the training split.
clf.fit(X_train, y_train)

In [210]:
# Evaluate on the held-out split; for a regressor pipeline, score() reports R^2.
clf.score(X_test, y_test)

0.9050458531996377

In [216]:
# Grab a single row to use as a sample input for clf.predict().
# Note: this row comes from the training split, so predicting on it is a
# smoke test of the pipeline, not an out-of-sample check.
test = X_train.head(1)
test

Unnamed: 0,title,yearsofexperience,yearsatcompany,basesalary,stockgrantvalue,bonus,gender,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,state,in_top_15
8981,Solution Architect,14.0,8.0,155000.0,0.0,0.0,Female,0,1,0,0,0,0,1,0,0,0,WA,True


In [217]:
# Predict total yearly compensation for the one-row sample frame.
clf.predict(test)

array([155200.])

#### Working Function

In [223]:
# Reload the full cleaned dataset for the lookup table below. This time the
# 'company', 'education' and 'race' columns are kept; only the CSV's leftover
# index column is discarded. This rebinds `df` used by the earlier cells.
df = pd.read_csv('cleaned_data.csv').drop(columns=['Unnamed: 0'])

def convert(years):
    """Bucket a number of years of experience into a coarse label.

    The buckets are contiguous half-open ranges so that fractional values
    land in the same bucket as their truncated integer, which is how
    ``impute`` prepares its input (``int(yearsofexperience)``):

        years < 1        -> 'No Experience'
        1 <= years < 4   -> 'Little Experience'
        4 <= years < 7   -> 'Medium Experienced'
        years >= 7       -> 'Very Experienced'

    For whole-number inputs >= 0 the result is the same as with the earlier
    closed integer ranges (0, 1-3, 4-6, 7+). Previously, values such as 0.5,
    3.5 or 6.5 fell through every branch and were labelled 'Very Experienced'.

    Parameters
    ----------
    years : int or float
        Years of experience.

    Returns
    -------
    str
        One of the four bucket labels above. NaN compares false against
        every bound and therefore still yields 'Very Experienced'.
    """
    if years < 1:
        return 'No Experience'
    if years < 4:
        return 'Little Experience'
    if years < 7:
        return 'Medium Experienced'
    return 'Very Experienced'

# Label every row with its experience bucket so it can be used as a group key.
df['experience'] = df['yearsofexperience'].apply(convert)

# Attributes a user can supply directly; everything else the model needs is
# filled in from the average of matching rows.
user_input = ['state', 'title', 'race', 'education', 'in_top_15', 'gender', 'experience']

# One row per combination of user-supplied attributes, holding the mean of
# every numeric column for that group. numeric_only=True is required on
# pandas >= 2.0, where averaging a leftover non-numeric column (e.g. 'company')
# raises TypeError instead of silently dropping it.
fill = df.groupby(by=user_input).mean(numeric_only=True).reset_index()

# The 15 most frequent company names; used to derive the 'in_top_15' flag
# from a user-supplied company.
top_15 = list(df['company'].value_counts().head(15).index)

def predict(data):
    """Run the fitted pipeline ``clf`` on ``data`` and return the first prediction.

    ``data`` is passed straight to ``clf.predict``; only element 0 of the
    returned predictions is handed back to the caller.
    """
    predictions = clf.predict(data)
    return predictions[0]

def impute(state, title, race, education, company, gender, yearsofexperience):
    """Predict compensation from user-level attributes via the ``fill`` lookup.

    The company is reduced to a top-15 membership flag and the years of
    experience (truncated with ``int``) to a bucket label via ``convert``.
    The first row of ``fill`` matching all seven attributes is passed to
    ``predict``.

    Returns
    -------
    The value returned by ``predict`` for the matching row, or an apology
    string when no row of ``fill`` matches the requested combination.
    """
    # Values each key column of `fill` must equal for a row to qualify.
    criteria = {
        'state': state,
        'title': title,
        'race': race,
        'gender': gender,
        'in_top_15': company in top_15,
        'education': education,
        'experience': convert(int(yearsofexperience)),
    }

    # Narrow the lookup table one attribute at a time.
    matches = fill
    for column, wanted in criteria.items():
        matches = matches[matches[column] == wanted]

    if len(matches) == 0:
        return "Sorry, no matching data found. We are not able to make an accurate prediction."

    return predict(matches.head(1))

In [232]:
# Example lookup: 'Other' is passed as the company name and 0 as the years of
# experience. The result is either a predicted value or the "no matching data"
# message string returned by impute().
row = impute('CA', 'Software Engineer', 'Asian', 'Masters', 'Other', 'Male', 0)
row

165000.0